arpit-gour02 committed on
Commit
fe7005f
·
1 Parent(s): c7411c1

update the dataloading.ipynb file

Browse files
Files changed (1) hide show
  1. dataloading.ipynb +130 -926
dataloading.ipynb CHANGED
@@ -2,123 +2,37 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 3,
6
  "id": "ae9bc87a",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
10
  "from datasets import load_dataset\n",
11
- "import datasets"
 
12
  ]
13
  },
14
  {
15
  "cell_type": "code",
16
- "execution_count": 4,
17
- "id": "b2ffd47f",
18
  "metadata": {},
19
- "outputs": [
20
- {
21
- "data": {
22
- "application/vnd.jupyter.widget-view+json": {
23
- "model_id": "0ccb4dc0c6bf4c8f89a0be03b742598f",
24
- "version_major": 2,
25
- "version_minor": 0
26
- },
27
- "text/plain": [
28
- "Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
29
- ]
30
- },
31
- "metadata": {},
32
- "output_type": "display_data"
33
- },
34
- {
35
- "data": {
36
- "application/vnd.jupyter.widget-view+json": {
37
- "model_id": "27ddf8817cfb45d1b4da85d0e14e6ad4",
38
- "version_major": 2,
39
- "version_minor": 0
40
- },
41
- "text/plain": [
42
- "Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
43
- ]
44
- },
45
- "metadata": {},
46
- "output_type": "display_data"
47
- }
48
- ],
49
  "source": [
50
  "ds = load_dataset(\"chainyo/rvl-cdip\")"
51
  ]
52
  },
53
  {
54
- "cell_type": "code",
55
- "execution_count": 6,
56
- "id": "f5aa7605",
57
  "metadata": {},
58
- "outputs": [
59
- {
60
- "data": {
61
- "text/plain": [
62
- "DatasetDict({\n",
63
- " train: Dataset({\n",
64
- " features: ['image', 'label'],\n",
65
- " num_rows: 319999\n",
66
- " })\n",
67
- " test: Dataset({\n",
68
- " features: ['image', 'label'],\n",
69
- " num_rows: 40000\n",
70
- " })\n",
71
- " val: Dataset({\n",
72
- " features: ['image', 'label'],\n",
73
- " num_rows: 40000\n",
74
- " })\n",
75
- "})"
76
- ]
77
- },
78
- "execution_count": 6,
79
- "metadata": {},
80
- "output_type": "execute_result"
81
- }
82
- ],
83
  "source": [
84
- "ds"
85
  ]
86
  },
87
  {
88
  "cell_type": "code",
89
- "execution_count": 24,
90
- "id": "9b19f0c5",
91
- "metadata": {},
92
- "outputs": [
93
- {
94
- "name": "stdout",
95
- "output_type": "stream",
96
- "text": [
97
- "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
98
- "16\n",
99
- "ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n",
100
- "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
101
- "16\n",
102
- "ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n",
103
- "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
104
- "16\n",
105
- "ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n"
106
- ]
107
- }
108
- ],
109
- "source": [
110
- "for split in ds.keys():\n",
111
- " unique_labels = ds[split].unique('label')\n",
112
- " num_labels = len(unique_labels)\n",
113
- " class_names = ds[split].features['label']\n",
114
- " print(unique_labels)\n",
115
- " print(num_labels)\n",
116
- " print(class_names)"
117
- ]
118
- },
119
- {
120
- "cell_type": "code",
121
- "execution_count": 48,
122
  "id": "936deafa",
123
  "metadata": {},
124
  "outputs": [
@@ -126,43 +40,35 @@
126
  "name": "stdout",
127
  "output_type": "stream",
128
  "text": [
129
- " Using 12 workers to save RAW images.\n",
130
- "Creating directory structure...\n",
131
- "\n",
132
- "🚀 Processing TRAIN split...\n"
133
  ]
134
  },
135
  {
136
  "data": {
137
  "application/vnd.jupyter.widget-view+json": {
138
- "model_id": "0f8448ab77c64028b36f0aefa759d429",
139
  "version_major": 2,
140
  "version_minor": 0
141
  },
142
  "text/plain": [
143
- "Saving train (num_proc=12): 0%| | 0/319999 [00:00<?, ? examples/s]"
144
  ]
145
  },
146
  "metadata": {},
147
  "output_type": "display_data"
148
  },
149
- {
150
- "name": "stdout",
151
- "output_type": "stream",
152
- "text": [
153
- "\n",
154
- "🚀 Processing VAL split...\n"
155
- ]
156
- },
157
  {
158
  "data": {
159
  "application/vnd.jupyter.widget-view+json": {
160
- "model_id": "09142d58850347789abf3314df6370d9",
161
  "version_major": 2,
162
  "version_minor": 0
163
  },
164
  "text/plain": [
165
- "Saving val (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
166
  ]
167
  },
168
  "metadata": {},
@@ -172,367 +78,177 @@
172
  "name": "stdout",
173
  "output_type": "stream",
174
  "text": [
 
 
175
  "\n",
176
- "🚀 Processing TEST split...\n"
177
  ]
178
  },
179
  {
180
  "data": {
181
  "application/vnd.jupyter.widget-view+json": {
182
- "model_id": "95cc463a42704de19c1282f43b5b5252",
183
  "version_major": 2,
184
  "version_minor": 0
185
  },
186
  "text/plain": [
187
- "Saving test (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
188
  ]
189
  },
190
  "metadata": {},
191
  "output_type": "display_data"
192
  },
193
- {
194
- "name": "stderr",
195
- "output_type": "stream",
196
- "text": [
197
- "/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/TiffImagePlugin.py:950: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
198
- " warnings.warn(str(msg))\n"
199
- ]
200
- },
201
- {
202
- "ename": "UnidentifiedImageError",
203
- "evalue": "cannot identify image file <_io.BytesIO object at 0x3250f9d50>",
204
- "output_type": "error",
205
- "traceback": [
206
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
207
- "\u001b[31mRemoteTraceback\u001b[39m Traceback (most recent call last)",
208
- "\u001b[31mRemoteTraceback\u001b[39m: \n\"\"\"\nTraceback (most recent call last):\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n for i, result in enumerate(func(**kwargs)):\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n for i, batch in iter_outputs(shard_iterable):\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n yield i, apply_function(example, i, offset=offset)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_65139/2515864513.py\", line 27, in save_batch_raw\n images = batch['image']\n ~~~~~^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 285, in __getitem__\n value = self.format(key)\n ^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 385, in format\n return self.formatter.format_column(self.pa_table.select([key]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 465, in format_column\n 
column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 228, in decode_column\n self.features.decode_column(column, column_name, token_per_repo_id=self.token_per_repo_id)\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py\", line 2130, in decode_column\n decode_nested_example(self[column_name], value, token_per_repo_id=token_per_repo_id)\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py\", line 1414, in decode_nested_example\n return schema.decode_example(obj, token_per_repo_id=token_per_repo_id) if obj is not None else None\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/image.py\", line 192, in decode_example\n image = PIL.Image.open(BytesIO(bytes_))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 3580, in open\n raise UnidentifiedImageError(msg)\nPIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x3250f9d50>\n\"\"\"",
209
- "\nThe above exception was the direct cause of the following exception:\n",
210
- "\u001b[31mUnidentifiedImageError\u001b[39m Traceback (most recent call last)",
211
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[48]\u001b[39m\u001b[32m, line 51\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m splits:\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m🚀 Processing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit.upper()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m split...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[43mds\u001b[49m\u001b[43m[\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[43m \u001b[49m\u001b[43msave_batch_raw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 53\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatched\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 54\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Process 100 images per chunk\u001b[39;49;00m\n\u001b[32m 55\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_indices\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Needed for unique filenames\u001b[39;49;00m\n\u001b[32m 56\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[43m=\u001b[49m\u001b[43mNUM_PROC\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# <--- THIS IS THE SPEED BOOST\u001b[39;49;00m\n\u001b[32m 57\u001b[39m \u001b[43m \u001b[49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msplit_name\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 58\u001b[39m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mSaving \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 59\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 61\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m✅ DONE! Raw data saved to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mos.path.abspath(OUTPUT_DIR)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
212
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:562\u001b[39m, in \u001b[36mtransmit_format.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 555\u001b[39m self_format = {\n\u001b[32m 556\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtype\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_type,\n\u001b[32m 557\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mformat_kwargs\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_kwargs,\n\u001b[32m 558\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcolumns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_columns,\n\u001b[32m 559\u001b[39m \u001b[33m\"\u001b[39m\u001b[33moutput_all_columns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._output_all_columns,\n\u001b[32m 560\u001b[39m }\n\u001b[32m 561\u001b[39m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m562\u001b[39m out: Union[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mDatasetDict\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 563\u001b[39m datasets: \u001b[38;5;28mlist\u001b[39m[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mlist\u001b[39m(out.values()) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# re-apply format to the output\u001b[39;00m\n",
213
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:3332\u001b[39m, in \u001b[36mDataset.map\u001b[39m\u001b[34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc, try_original_type)\u001b[39m\n\u001b[32m 3329\u001b[39m os.environ = prev_env\n\u001b[32m 3330\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSpawning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_proc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m processes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m3332\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miflatmap_unordered\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3333\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mDataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_single\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs_iterable\u001b[49m\u001b[43m=\u001b[49m\u001b[43munprocessed_kwargs_per_job\u001b[49m\n\u001b[32m 3334\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3335\u001b[39m \u001b[43m \u001b[49m\u001b[43mcheck_if_shard_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3337\u001b[39m pool.close()\n",
214
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:626\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 624\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m pool_changed:\n\u001b[32m 625\u001b[39m \u001b[38;5;66;03m# we get the result in case there's an error to raise\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m626\u001b[39m [\u001b[43masync_result\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m async_result \u001b[38;5;129;01min\u001b[39;00m async_results]\n",
215
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py:774\u001b[39m, in \u001b[36mApplyResult.get\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 772\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n\u001b[32m 773\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m774\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n",
216
- "\u001b[31mUnidentifiedImageError\u001b[39m: cannot identify image file <_io.BytesIO object at 0x3250f9d50>"
217
- ]
218
- }
219
- ],
220
- "source": [
221
- "import os\n",
222
- "import multiprocessing\n",
223
- "\n",
224
- "# 1. Configuration\n",
225
- "OUTPUT_DIR = \"rvl_cdip\"\n",
226
- "NUM_PROC = os.cpu_count() # Automatically use all CPU cores\n",
227
- "\n",
228
- "# 2. Pre-Calculate Class Names\n",
229
- "# We do this once so workers don't have to look it up repeatedly\n",
230
- "labels_feature = ds['train'].features['label']\n",
231
- "idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
232
- "print(f\"✅ Using {NUM_PROC} workers to save RAW images.\")\n",
233
- "\n",
234
- "# 3. Pre-Create Directories\n",
235
- "# Create all folders upfront to prevent collision errors\n",
236
- "print(\"Creating directory structure...\")\n",
237
- "splits = ['train', 'val', 'test']\n",
238
- "for split in splits:\n",
239
- " for class_name in idx_to_class.values():\n",
240
- " os.makedirs(os.path.join(OUTPUT_DIR, split, class_name), exist_ok=True)\n",
241
- "\n",
242
- "# 4. The Worker Function (Raw Save)\n",
243
- "def save_batch_raw(batch, indices, split_name):\n",
244
- " \"\"\"\n",
245
- " Saves a batch of images in their original, raw format.\n",
246
- " \"\"\"\n",
247
- " images = batch['image']\n",
248
- " labels = batch['label']\n",
249
- " \n",
250
- " for img, label_idx, original_idx in zip(images, labels, indices):\n",
251
- " class_name = idx_to_class[label_idx]\n",
252
- " \n",
253
- " # Define Path\n",
254
- " filename = f\"{original_idx}.png\"\n",
255
- " file_path = os.path.join(OUTPUT_DIR, split_name, class_name, filename)\n",
256
- " \n",
257
- " # Save RAW (No Resize)\n",
258
- " # We only convert to RGB if absolutely necessary (e.g. CMYK/Transparency issues)\n",
259
- " # otherwise we save as is.\n",
260
- " if img.mode not in ['RGB', 'L']: # 'L' is standard grayscale\n",
261
- " img = img.convert('RGB')\n",
262
- " \n",
263
- " img.save(file_path)\n",
264
- " \n",
265
- " return batch\n",
266
- "\n",
267
- "# 5. Execute Parallel Processing\n",
268
- "for split in splits:\n",
269
- " print(f\"\\n🚀 Processing {split.upper()} split...\")\n",
270
- " \n",
271
- " ds[split].map(\n",
272
- " save_batch_raw,\n",
273
- " batched=True,\n",
274
- " batch_size=100, # Process 100 images per chunk\n",
275
- " with_indices=True, # Needed for unique filenames\n",
276
- " num_proc=NUM_PROC, # <--- THIS IS THE SPEED BOOST\n",
277
- " fn_kwargs={'split_name': split},\n",
278
- " desc=f\"Saving {split}\"\n",
279
- " )\n",
280
- "\n",
281
- "print(f\"\\n✅ DONE! Raw data saved to {os.path.abspath(OUTPUT_DIR)}\")"
282
- ]
283
- },
284
- {
285
- "cell_type": "code",
286
- "execution_count": 51,
287
- "id": "5645bccb",
288
- "metadata": {},
289
- "outputs": [
290
  {
291
  "name": "stdout",
292
  "output_type": "stream",
293
  "text": [
294
- "🛠️ Repairing TEST split (with integrity check)...\n"
 
295
  ]
296
  },
297
  {
298
  "data": {
299
  "application/vnd.jupyter.widget-view+json": {
300
- "model_id": "22efa388600c486783774cefabee4455",
301
  "version_major": 2,
302
  "version_minor": 0
303
  },
304
  "text/plain": [
305
- "Checking test: 0%| | 0/40000 [00:00<?, ?it/s]"
306
  ]
307
  },
308
  "metadata": {},
309
  "output_type": "display_data"
310
  },
311
- {
312
- "name": "stderr",
313
- "output_type": "stream",
314
- "text": [
315
- "/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/TiffImagePlugin.py:950: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
316
- " warnings.warn(str(msg))\n"
317
- ]
318
- },
319
- {
320
- "ename": "UnidentifiedImageError",
321
- "evalue": "cannot identify image file <_io.BytesIO object at 0x10abe6f20>",
322
- "output_type": "error",
323
- "traceback": [
324
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
325
- "\u001b[31mUnidentifiedImageError\u001b[39m Traceback (most recent call last)",
326
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[51]\u001b[39m\u001b[32m, line 18\u001b[39m\n\u001b[32m 15\u001b[39m current_ds = ds[split]\n\u001b[32m 16\u001b[39m skipped_count = \u001b[32m0\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m18\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_ds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mChecking \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mtry\u001b[39;49;00m\u001b[43m:\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[43mlabel_idx\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mlabel\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n",
327
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/tqdm/notebook.py:250\u001b[39m, in \u001b[36mtqdm_notebook.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 248\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 249\u001b[39m it = \u001b[38;5;28msuper\u001b[39m().\u001b[34m__iter__\u001b[39m()\n\u001b[32m--> \u001b[39m\u001b[32m250\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mit\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 251\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# return super(tqdm...) will not catch exception\u001b[39;49;00m\n\u001b[32m 252\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 253\u001b[39m \u001b[38;5;66;03m# NB: except ... [ as ...] breaks IPython async KeyboardInterrupt\u001b[39;00m\n",
328
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/tqdm/std.py:1181\u001b[39m, in \u001b[36mtqdm.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1178\u001b[39m time = \u001b[38;5;28mself\u001b[39m._time\n\u001b[32m 1180\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1181\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miterable\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1182\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 1183\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Update and possibly print the progressbar.\u001b[39;49;00m\n\u001b[32m 1184\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;49;00m\n",
329
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:2483\u001b[39m, in \u001b[36mDataset.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 2481\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(pa_subtable.num_rows):\n\u001b[32m 2482\u001b[39m pa_subtable_ex = pa_subtable.slice(i, \u001b[32m1\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2483\u001b[39m formatted_output = \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2484\u001b[39m \u001b[43m \u001b[49m\u001b[43mpa_subtable_ex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2485\u001b[39m \u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 2486\u001b[39m \u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2487\u001b[39m \u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_format_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2488\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_output_all_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2489\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2490\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m formatted_output\n\u001b[32m 2491\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
330
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:658\u001b[39m, in \u001b[36mformat_table\u001b[39m\u001b[34m(table, key, formatter, format_columns, output_all_columns)\u001b[39m\n\u001b[32m 656\u001b[39m python_formatter = PythonFormatter(features=formatter.features)\n\u001b[32m 657\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m658\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 659\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 660\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
331
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:411\u001b[39m, in \u001b[36mFormatter.__call__\u001b[39m\u001b[34m(self, pa_table, query_type)\u001b[39m\n\u001b[32m 409\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa.Table, query_type: \u001b[38;5;28mstr\u001b[39m) -> Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[32m 410\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mrow\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m411\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 412\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 413\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.format_column(pa_table)\n",
332
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:460\u001b[39m, in \u001b[36mPythonFormatter.format_row\u001b[39m\u001b[34m(self, pa_table)\u001b[39m\n\u001b[32m 458\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m LazyRow(pa_table, \u001b[38;5;28mself\u001b[39m)\n\u001b[32m 459\u001b[39m row = \u001b[38;5;28mself\u001b[39m.python_arrow_extractor().extract_row(pa_table)\n\u001b[32m--> \u001b[39m\u001b[32m460\u001b[39m row = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpython_features_decoder\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 461\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
333
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:224\u001b[39m, in \u001b[36mPythonFeaturesDecoder.decode_row\u001b[39m\u001b[34m(self, row)\u001b[39m\n\u001b[32m 223\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) -> \u001b[38;5;28mdict\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m224\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.features \u001b[38;5;28;01melse\u001b[39;00m row\n",
334
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py:2106\u001b[39m, in \u001b[36mFeatures.decode_example\u001b[39m\u001b[34m(self, example, token_per_repo_id)\u001b[39m\n\u001b[32m 2091\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m, token_per_repo_id: Optional[\u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Union[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m]]] = \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m 2092\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[32m 2093\u001b[39m \n\u001b[32m 2094\u001b[39m \u001b[33;03m Args:\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 2102\u001b[39m \u001b[33;03m `dict[str, Any]`\u001b[39;00m\n\u001b[32m 2103\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m 2105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m-> \u001b[39m\u001b[32m2106\u001b[39m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2107\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._column_requires_decoding[column_name]\n\u001b[32m 2108\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[32m 2109\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m zip_dict(\n\u001b[32m 2110\u001b[39m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.items() 
\u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[32m 2111\u001b[39m )\n\u001b[32m 2112\u001b[39m }\n",
335
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py:1414\u001b[39m, in \u001b[36mdecode_nested_example\u001b[39m\u001b[34m(schema, obj, token_per_repo_id)\u001b[39m\n\u001b[32m 1411\u001b[39m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[32m 1412\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(schema, \u001b[33m\"\u001b[39m\u001b[33mdecode_example\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(schema, \u001b[33m\"\u001b[39m\u001b[33mdecode\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[32m 1413\u001b[39m \u001b[38;5;66;03m# we pass the token to read and decode files from private repositories in streaming mode\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1414\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1415\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
336
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/image.py:192\u001b[39m, in \u001b[36mImage.decode_example\u001b[39m\u001b[34m(self, value, token_per_repo_id)\u001b[39m\n\u001b[32m 190\u001b[39m image = PIL.Image.open(bytes_)\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m192\u001b[39m image = \u001b[43mPIL\u001b[49m\u001b[43m.\u001b[49m\u001b[43mImage\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBytesIO\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbytes_\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 193\u001b[39m image.load() \u001b[38;5;66;03m# to avoid \"Too many open files\" errors\u001b[39;00m\n\u001b[32m 194\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m image.getexif().get(PIL.Image.ExifTags.Base.Orientation) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
337
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py:3580\u001b[39m, in \u001b[36mopen\u001b[39m\u001b[34m(fp, mode, formats)\u001b[39m\n\u001b[32m 3578\u001b[39m warnings.warn(message)\n\u001b[32m 3579\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mcannot identify image file \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m % (filename \u001b[38;5;28;01mif\u001b[39;00m filename \u001b[38;5;28;01melse\u001b[39;00m fp)\n\u001b[32m-> \u001b[39m\u001b[32m3580\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnidentifiedImageError(msg)\n",
338
- "\u001b[31mUnidentifiedImageError\u001b[39m: cannot identify image file <_io.BytesIO object at 0x10abe6f20>"
339
- ]
340
- }
341
- ],
342
- "source": [
343
- "import os\n",
344
- "from tqdm.auto import tqdm\n",
345
- "from PIL import UnidentifiedImageError\n",
346
- "\n",
347
- "# Configuration\n",
348
- "OUTPUT_DIR = \"rvl_cdip\"\n",
349
- "split = \"test\" \n",
350
- "\n",
351
- "# Get Class Mapping\n",
352
- "labels_feature = ds['train'].features['label']\n",
353
- "idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
354
- "\n",
355
- "print(f\"🛠️ Repairing {split.upper()} split (with integrity check)...\")\n",
356
- "\n",
357
- "current_ds = ds[split]\n",
358
- "skipped_count = 0\n",
359
- "\n",
360
- "for i, example in enumerate(tqdm(current_ds, desc=f\"Checking {split}\")):\n",
361
- " try:\n",
362
- " label_idx = example['label']\n",
363
- " class_name = idx_to_class[label_idx]\n",
364
- " \n",
365
- " target_folder = os.path.join(OUTPUT_DIR, split, class_name)\n",
366
- " filename = f\"{i}.png\"\n",
367
- " file_path = os.path.join(target_folder, filename)\n",
368
- " \n",
369
- " # --- IMPROVED CHECK ---\n",
370
- " # Only skip if file exists AND is not empty (larger than 0 bytes)\n",
371
- " # This fixes the edge case where the crash left a 0-byte file\n",
372
- " if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
373
- " continue\n",
374
- " \n",
375
- " # If we reach here, the file is missing OR corrupt (empty). So we save it.\n",
376
- " if not os.path.exists(target_folder):\n",
377
- " os.makedirs(target_folder, exist_ok=True)\n",
378
- "\n",
379
- " image = example['image'] \n",
380
- " if image.mode not in ['RGB', 'L']:\n",
381
- " image = image.convert('RGB')\n",
382
- " \n",
383
- " image.save(file_path)\n",
384
- "\n",
385
- " except (UnidentifiedImageError, OSError) as e:\n",
386
- " print(f\"\\n❌ SKIPPING CORRUPT IMAGE: Index {i}\")\n",
387
- " skipped_count += 1\n",
388
- "\n",
389
- "print(f\"\\n✅ Repair Complete.\")\n",
390
- "print(f\"Total corrupt/unreadable images skipped: {skipped_count}\")"
391
- ]
392
- },
393
- {
394
- "cell_type": "code",
395
- "execution_count": 57,
396
- "id": "41f94f27",
397
- "metadata": {},
398
- "outputs": [
399
  {
400
  "name": "stdout",
401
  "output_type": "stream",
402
  "text": [
403
- "🛠️ Repairing TEST split (Safe Mode)...\n"
 
404
  ]
405
  },
406
  {
407
  "data": {
408
  "application/vnd.jupyter.widget-view+json": {
409
- "model_id": "6b4725f0cad048119e282d008f56fe5f",
410
  "version_major": 2,
411
  "version_minor": 0
412
  },
413
  "text/plain": [
414
- "Checking test: 0%| | 0/40000 [00:00<?, ?it/s]"
415
  ]
416
  },
417
  "metadata": {},
418
  "output_type": "display_data"
419
  },
 
 
 
 
 
 
 
 
420
  {
421
  "name": "stdout",
422
  "output_type": "stream",
423
  "text": [
 
424
  "\n",
425
- " SKIPPING CORRUPT IMAGE: Index 34965\n",
426
- "\n",
427
- "✅ Repair Complete.\n",
428
- "Skipped 1 corrupt files.\n"
429
  ]
430
  }
431
  ],
432
  "source": [
433
  "import os\n",
434
  "import io\n",
435
- "from tqdm.auto import tqdm\n",
436
  "from PIL import Image, UnidentifiedImageError\n",
437
  "\n",
438
- "# 1. Configuration\n",
439
- "OUTPUT_DIR = \"rvl_cdip\"\n",
440
- "split = \"test\" \n",
441
  "\n",
442
- "# 2. Get Class Mapping\n",
443
- "labels_feature = ds['train'].features['label']\n",
444
- "idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  "\n",
446
- "# 3. Enable \"Raw\" Mode (The Critical Fix)\n",
447
- "# This prevents Hugging Face from crashing when iterating over bad images\n",
448
- "ds[split] = ds[split].with_format(\"python\") \n",
 
449
  "\n",
450
- "print(f\"🛠️ Repairing {split.upper()} split (Safe Mode)...\")\n",
451
- "skipped_count = 0\n",
452
- "current_ds = ds[split]\n",
 
 
 
 
 
453
  "\n",
454
- "for i in tqdm(range(len(current_ds)), desc=f\"Checking {split}\"):\n",
455
- " try:\n",
456
- " # A. Get the raw item safely\n",
457
- " # We access by index 'i' directly to avoid iterator crashes\n",
458
- " example = current_ds[i] \n",
459
- " \n",
460
- " label_idx = example['label']\n",
461
- " class_name = idx_to_class[label_idx]\n",
462
- " \n",
463
- " # B. Define Paths\n",
464
- " target_folder = os.path.join(OUTPUT_DIR, split, class_name)\n",
465
- " filename = f\"{i}.png\"\n",
466
- " file_path = os.path.join(target_folder, filename)\n",
467
  "\n",
468
- " # C. Integrity Check (Resume Logic)\n",
469
- " if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
470
- " continue\n",
471
- " \n",
472
- " # D. Create Folder\n",
473
- " if not os.path.exists(target_folder):\n",
474
- " os.makedirs(target_folder, exist_ok=True)\n",
475
  "\n",
476
- " # E. Manual Decoding (The Safe Way)\n",
477
- " # Hugging Face gives us a dict with 'bytes' when decoding is off\n",
478
- " image_data = example['image'] \n",
479
  " \n",
480
- " # Check if it's already a PIL object (some versions vary)\n",
481
- " if isinstance(image_data, dict) and 'bytes' in image_data:\n",
482
- " image_bytes = image_data['bytes']\n",
483
- " image = Image.open(io.BytesIO(image_bytes))\n",
484
- " else:\n",
485
- " image = image_data # It might already be loaded\n",
486
- " \n",
487
- " # F. Save\n",
488
- " if image.mode not in ['RGB', 'L']:\n",
489
- " image = image.convert('RGB')\n",
490
- " \n",
491
- " image.save(file_path)\n",
 
 
 
 
492
  "\n",
493
- " except (UnidentifiedImageError, OSError, ValueError) as e:\n",
494
- " print(f\"\\n❌ SKIPPING CORRUPT IMAGE: Index {i}\")\n",
495
- " # Create a placeholder or just log it\n",
496
- " skipped_count += 1\n",
497
  "\n",
498
- "print(f\"\\n✅ Repair Complete.\")\n",
499
- "print(f\"Skipped {skipped_count} corrupt files.\")"
500
- ]
501
- },
502
- {
503
- "cell_type": "code",
504
- "execution_count": 29,
505
- "id": "b829c704",
506
- "metadata": {},
507
- "outputs": [
508
- {
509
- "data": {
510
- "text/plain": [
511
- "Counter({13: 2572,\n",
512
- " 12: 2537,\n",
513
- " 5: 2532,\n",
514
- " 3: 2527,\n",
515
- " 2: 2516,\n",
516
- " 0: 2515,\n",
517
- " 4: 2506,\n",
518
- " 1: 2505,\n",
519
- " 14: 2498,\n",
520
- " 8: 2492,\n",
521
- " 10: 2489,\n",
522
- " 6: 2477,\n",
523
- " 15: 2472,\n",
524
- " 7: 2464,\n",
525
- " 9: 2463,\n",
526
- " 11: 2435})"
527
- ]
528
- },
529
- "execution_count": 29,
530
- "metadata": {},
531
- "output_type": "execute_result"
532
- }
533
- ],
534
- "source": [
535
- "Counter(ds[split]['label'])"
536
  ]
537
  },
538
  {
@@ -540,12 +256,12 @@
540
  "id": "c8530c8e",
541
  "metadata": {},
542
  "source": [
543
- "## Checking the balance in ds"
544
  ]
545
  },
546
  {
547
  "cell_type": "code",
548
- "execution_count": 20,
549
  "id": "2785360c",
550
  "metadata": {},
551
  "outputs": [
@@ -613,29 +329,29 @@
613
  "from collections import Counter\n",
614
  "import pandas as pd\n",
615
  "\n",
616
- "# 1. Setup\n",
617
  "splits = ['train', 'val', 'test']\n",
618
  "label_feature = ds['train'].features['label']\n",
619
- "int2str = label_feature.int2str # Helper to convert ID (0) -> Name (\"letter\")\n",
620
  "\n",
621
  "print(f\"{'SPLIT':<10} {'CLASS NAME':<25} {'COUNT':<10} {'STATUS'}\")\n",
622
  "print(\"-\" * 60)\n",
623
  "\n",
624
  "for split in splits:\n",
625
- " # 2. Get all labels (Load only the label column into memory)\n",
626
  " # This is instant compared to loading images\n",
627
  " labels = ds[split]['label']\n",
628
  " \n",
629
- " # 3. Count frequencies\n",
630
  " counts = Counter(labels)\n",
631
  " \n",
632
- " # 4. Analyze each class\n",
633
  " # We sort by class ID to keep it organized\n",
634
  " for label_id in sorted(counts.keys()):\n",
635
  " count = counts[label_id]\n",
636
  " class_name = int2str(label_id)\n",
637
  " \n",
638
- " # 5. Define Expected Counts based on the Paper\n",
639
  " # Train: 320k / 16 = 20,000\n",
640
  " # Test/Val: 40k / 16 = 2,500\n",
641
  " if split == 'train':\n",
@@ -655,12 +371,12 @@
655
  "id": "5f7b75a2",
656
  "metadata": {},
657
  "source": [
658
- "## checking the balance in dir"
659
  ]
660
  },
661
  {
662
  "cell_type": "code",
663
- "execution_count": 30,
664
  "id": "059bfaa5",
665
  "metadata": {},
666
  "outputs": [
@@ -668,7 +384,7 @@
668
  "name": "stdout",
669
  "output_type": "stream",
670
  "text": [
671
- "📂 Scanning directory: /Users/arpit-zstch1557/Projects/DL/Course 4/document-classification/rvl_cdip\n",
672
  "SPLIT CLASS NAME FILES STATUS\n",
673
  "-----------------------------------------------------------------\n",
674
  "TRAIN advertisement 19963 ❌ MISMATCH (Exp: 20000)\n",
@@ -732,7 +448,7 @@
732
  "import pandas as pd\n",
733
  "\n",
734
  "# Configuration\n",
735
- "DATA_DIR = \"rvl_cdip\" # Your directory name\n",
736
  "splits = ['train', 'val', 'test']\n",
737
  "\n",
738
  "print(f\"📂 Scanning directory: {os.path.abspath(DATA_DIR)}\")\n",
@@ -769,12 +485,10 @@
769
  " # Determine Expected Count based on the paper\n",
770
  " if split == 'train':\n",
771
  " expected = 20000 \n",
772
- " # Note: We know 'train' has 1 missing file in total from the source (319,999)\n",
773
  " else:\n",
774
  " expected = 2500\n",
775
  "\n",
776
  " # Status Check\n",
777
- " # We allow a small tolerance because we know source data has noise\n",
778
  " if file_count == expected:\n",
779
  " status = \"✅ OK\"\n",
780
  " elif abs(file_count - expected) < 5: \n",
@@ -790,520 +504,10 @@
790
  "print(\"\\nAnalysis Complete.\")"
791
  ]
792
  },
793
- {
794
- "cell_type": "code",
795
- "execution_count": 10,
796
- "id": "99ca1af8",
797
- "metadata": {},
798
- "outputs": [
799
- {
800
- "name": "stdout",
801
- "output_type": "stream",
802
- "text": [
803
- "🚀 Starting RVL-CDIP Downloader\n",
804
- " Target Folder: /Users/arpit-zstch1557/Projects/DL/Course 4/document-classification/rvl_cdip_data\n",
805
- " Workers: 12\n",
806
- " Loading dataset structure from Hugging Face...\n"
807
- ]
808
- },
809
- {
810
- "data": {
811
- "application/vnd.jupyter.widget-view+json": {
812
- "model_id": "cf6fb62548ea45ebad0d16066ad8a895",
813
- "version_major": 2,
814
- "version_minor": 0
815
- },
816
- "text/plain": [
817
- "Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
818
- ]
819
- },
820
- "metadata": {},
821
- "output_type": "display_data"
822
- },
823
- {
824
- "data": {
825
- "application/vnd.jupyter.widget-view+json": {
826
- "model_id": "90ff7339d3014385987f7a133cea179a",
827
- "version_major": 2,
828
- "version_minor": 0
829
- },
830
- "text/plain": [
831
- "Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
832
- ]
833
- },
834
- "metadata": {},
835
- "output_type": "display_data"
836
- },
837
- {
838
- "name": "stdout",
839
- "output_type": "stream",
840
- "text": [
841
- " Found 16 categories.\n",
842
- " Configuring dataset for safe raw access...\n",
843
- "\n",
844
- "📦 Processing SPLIT: TRAIN\n"
845
- ]
846
- },
847
- {
848
- "data": {
849
- "application/vnd.jupyter.widget-view+json": {
850
- "model_id": "e83199d60f794dedb211c17775c27808",
851
- "version_major": 2,
852
- "version_minor": 0
853
- },
854
- "text/plain": [
855
- "Saving train (num_proc=12): 0%| | 0/319999 [00:00<?, ? examples/s]"
856
- ]
857
- },
858
- "metadata": {},
859
- "output_type": "display_data"
860
- },
861
- {
862
- "name": "stderr",
863
- "output_type": "stream",
864
- "text": [
865
- "Process ForkPoolWorker-17:\n",
866
- "Process ForkPoolWorker-16:\n",
867
- "Process ForkPoolWorker-18:\n",
868
- "Process ForkPoolWorker-22:\n",
869
- "Process ForkPoolWorker-23:\n",
870
- "Process ForkPoolWorker-27:\n",
871
- "Traceback (most recent call last):\n",
872
- "Traceback (most recent call last):\n",
873
- "Traceback (most recent call last):\n",
874
- "Traceback (most recent call last):\n",
875
- "Traceback (most recent call last):\n",
876
- "Traceback (most recent call last):\n",
877
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
878
- " fh = fp.fileno()\n",
879
- " ^^^^^^^^^\n",
880
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
881
- " fh = fp.fileno()\n",
882
- " ^^^^^^^^^\n",
883
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
884
- " fh = fp.fileno()\n",
885
- " ^^^^^^^^^\n",
886
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
887
- " fh = fp.fileno()\n",
888
- " ^^^^^^^^^\n",
889
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
890
- " fh = fp.fileno()\n",
891
- " ^^^^^^^^^\n",
892
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
893
- " fh = fp.fileno()\n",
894
- " ^^^^^^^^^\n",
895
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
896
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
897
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
898
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
899
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
900
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
901
- "\n",
902
- "During handling of the above exception, another exception occurred:\n",
903
- "\n",
904
- "\n",
905
- "During handling of the above exception, another exception occurred:\n",
906
- "\n",
907
- "\n",
908
- "During handling of the above exception, another exception occurred:\n",
909
- "\n",
910
- "\n",
911
- "During handling of the above exception, another exception occurred:\n",
912
- "\n",
913
- "\n",
914
- "During handling of the above exception, another exception occurred:\n",
915
- "\n",
916
- "\n",
917
- "During handling of the above exception, another exception occurred:\n",
918
- "\n",
919
- "Traceback (most recent call last):\n",
920
- "Traceback (most recent call last):\n",
921
- "Traceback (most recent call last):\n",
922
- "Traceback (most recent call last):\n",
923
- "Traceback (most recent call last):\n",
924
- "Traceback (most recent call last):\n",
925
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
926
- " self.run()\n",
927
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
928
- " self.run()\n",
929
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
930
- " self.run()\n",
931
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
932
- " self.run()\n",
933
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
934
- " self.run()\n",
935
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
936
- " self._target(*self._args, **self._kwargs)\n",
937
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
938
- " self.run()\n",
939
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
940
- " self._target(*self._args, **self._kwargs)\n",
941
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
942
- " self._target(*self._args, **self._kwargs)\n",
943
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
944
- " self._target(*self._args, **self._kwargs)\n",
945
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
946
- " self._target(*self._args, **self._kwargs)\n",
947
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
948
- " result = (True, func(*args, **kwds))\n",
949
- " ^^^^^^^^^^^^^^^^^^^\n",
950
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
951
- " self._target(*self._args, **self._kwargs)\n",
952
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
953
- " result = (True, func(*args, **kwds))\n",
954
- " ^^^^^^^^^^^^^^^^^^^\n",
955
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
956
- " result = (True, func(*args, **kwds))\n",
957
- " ^^^^^^^^^^^^^^^^^^^\n",
958
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
959
- " result = (True, func(*args, **kwds))\n",
960
- " ^^^^^^^^^^^^^^^^^^^\n",
961
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
962
- " result = (True, func(*args, **kwds))\n",
963
- " ^^^^^^^^^^^^^^^^^^^\n",
964
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
965
- " for i, result in enumerate(func(**kwargs)):\n",
966
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
967
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
968
- " for i, result in enumerate(func(**kwargs)):\n",
969
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
970
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
971
- " result = (True, func(*args, **kwds))\n",
972
- " ^^^^^^^^^^^^^^^^^^^\n",
973
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
974
- " for i, result in enumerate(func(**kwargs)):\n",
975
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
976
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
977
- " for i, result in enumerate(func(**kwargs)):\n",
978
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
979
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
980
- " for i, result in enumerate(func(**kwargs)):\n",
981
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
982
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
983
- " for i, batch in iter_outputs(shard_iterable):\n",
984
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
985
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
986
- " for i, batch in iter_outputs(shard_iterable):\n",
987
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
988
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
989
- " for i, batch in iter_outputs(shard_iterable):\n",
990
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
991
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
992
- " for i, result in enumerate(func(**kwargs)):\n",
993
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
994
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
995
- " for i, batch in iter_outputs(shard_iterable):\n",
996
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
997
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
998
- " yield i, apply_function(example, i, offset=offset)\n",
999
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1000
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
1001
- " for i, batch in iter_outputs(shard_iterable):\n",
1002
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1003
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
1004
- " yield i, apply_function(example, i, offset=offset)\n",
1005
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1006
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
1007
- " for i, batch in iter_outputs(shard_iterable):\n",
1008
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1009
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
1010
- " yield i, apply_function(example, i, offset=offset)\n",
1011
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1012
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1013
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1014
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1015
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
1016
- " yield i, apply_function(example, i, offset=offset)\n",
1017
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1018
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
1019
- " yield i, apply_function(example, i, offset=offset)\n",
1020
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1021
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1022
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1023
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1024
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
1025
- " yield i, apply_function(example, i, offset=offset)\n",
1026
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1027
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1028
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1029
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1030
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1031
- " img.save(file_path)\n",
1032
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1033
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1034
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1035
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1036
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1037
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1038
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1039
- " img.save(file_path)\n",
1040
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
1041
- " processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
1042
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1043
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1044
- " save_handler(self, fp, filename)\n",
1045
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1046
- " img.save(file_path)\n",
1047
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1048
- " img.save(file_path)\n",
1049
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1050
- " img.save(file_path)\n",
1051
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1052
- " save_handler(self, fp, filename)\n",
1053
- " File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
1054
- " img.save(file_path)\n",
1055
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1056
- " ImageFile._save(\n",
1057
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1058
- " save_handler(self, fp, filename)\n",
1059
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1060
- " save_handler(self, fp, filename)\n",
1061
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1062
- " save_handler(self, fp, filename)\n",
1063
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1064
- " ImageFile._save(\n",
1065
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
1066
- " save_handler(self, fp, filename)\n",
1067
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1068
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1069
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1070
- " ImageFile._save(\n",
1071
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1072
- " ImageFile._save(\n",
1073
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1074
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1075
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1076
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1077
- " ImageFile._save(\n",
1078
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
1079
- " ImageFile._save(\n",
1080
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1081
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1082
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1083
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1084
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1085
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1086
- "KeyboardInterrupt\n",
1087
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1088
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1089
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1090
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1091
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1092
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
1093
- " _encode_tile(im, fp, tile, bufsize, None, exc)\n",
1094
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1095
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1096
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1097
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1098
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1099
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1100
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1101
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1102
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1103
- "KeyboardInterrupt\n",
1104
- "KeyboardInterrupt\n",
1105
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
1106
- " errcode, data = encoder.encode(bufsize)[1:]\n",
1107
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
1108
- "KeyboardInterrupt\n",
1109
- "KeyboardInterrupt\n",
1110
- "KeyboardInterrupt\n",
1111
- "Process ForkPoolWorker-26:\n",
1112
- "Process ForkPoolWorker-19:\n",
1113
- "Process ForkPoolWorker-20:\n",
1114
- "Process ForkPoolWorker-25:\n",
1115
- "Traceback (most recent call last):\n",
1116
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
1117
- " fh = fp.fileno()\n",
1118
- " ^^^^^^^^^\n",
1119
- "Traceback (most recent call last):\n",
1120
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
1121
- "Traceback (most recent call last):\n",
1122
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
1123
- " fh = fp.fileno()\n",
1124
- " ^^^^^^^^^\n",
1125
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
1126
- " fh = fp.fileno()\n",
1127
- " ^^^^^^^^^\n",
1128
- "\n",
1129
- "During handling of the above exception, another exception occurred:\n",
1130
- "\n",
1131
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
1132
- "Traceback (most recent call last):\n",
1133
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
1134
- "Traceback (most recent call last):\n",
1135
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
1136
- " fh = fp.fileno()\n",
1137
- " ^^^^^^^^^\n",
1138
- "\n",
1139
- "During handling of the above exception, another exception occurred:\n",
1140
- "\n",
1141
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
1142
- " self.run()\n",
1143
- "\n",
1144
- "During handling of the above exception, another exception occurred:\n",
1145
- "\n",
1146
- "AttributeError: '_idat' object has no attribute 'fileno'\n",
1147
- "Traceback (most recent call last):\n",
1148
- "Traceback (most recent call last):\n",
1149
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
1150
- " self._target(*self._args, **self._kwargs)\n",
1151
- "\n",
1152
- "During handling of the above exception, another exception occurred:\n",
1153
- "\n",
1154
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
1155
- " self.run()\n",
1156
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
1157
- " result = (True, func(*args, **kwds))\n",
1158
- " ^^^^^^^^^^^^^^^^^^^\n",
1159
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
1160
- " self.run()\n",
1161
- "Traceback (most recent call last):\n",
1162
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
1163
- " self._target(*self._args, **self._kwargs)\n",
1164
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
1165
- " for i, result in enumerate(func(**kwargs)):\n",
1166
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1167
- " File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
1168
- " self._target(*self._args, **self._kwargs)\n"
1169
- ]
1170
- },
1171
- {
1172
- "ename": "TimeoutError",
1173
- "evalue": "",
1174
- "output_type": "error",
1175
- "traceback": [
1176
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
1177
- "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
1178
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:612\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 611\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m612\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mqueue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 613\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m Empty:\n",
1179
- "\u001b[36mFile \u001b[39m\u001b[32m<string>:2\u001b[39m, in \u001b[36mget\u001b[39m\u001b[34m(self, *args, **kwds)\u001b[39m\n",
1180
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/managers.py:828\u001b[39m, in \u001b[36mBaseProxy._callmethod\u001b[39m\u001b[34m(self, methodname, args, kwds)\u001b[39m\n\u001b[32m 827\u001b[39m conn.send((\u001b[38;5;28mself\u001b[39m._id, methodname, args, kwds))\n\u001b[32m--> \u001b[39m\u001b[32m828\u001b[39m kind, result = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 830\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m kind == \u001b[33m'\u001b[39m\u001b[33m#RETURN\u001b[39m\u001b[33m'\u001b[39m:\n",
1181
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:253\u001b[39m, in \u001b[36m_ConnectionBase.recv\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 252\u001b[39m \u001b[38;5;28mself\u001b[39m._check_readable()\n\u001b[32m--> \u001b[39m\u001b[32m253\u001b[39m buf = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_recv_bytes\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 254\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _ForkingPickler.loads(buf.getbuffer())\n",
1182
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:433\u001b[39m, in \u001b[36mConnection._recv_bytes\u001b[39m\u001b[34m(self, maxsize)\u001b[39m\n\u001b[32m 432\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_recv_bytes\u001b[39m(\u001b[38;5;28mself\u001b[39m, maxsize=\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m433\u001b[39m buf = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_recv\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 434\u001b[39m size, = struct.unpack(\u001b[33m\"\u001b[39m\u001b[33m!i\u001b[39m\u001b[33m\"\u001b[39m, buf.getvalue())\n",
1183
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:398\u001b[39m, in \u001b[36mConnection._recv\u001b[39m\u001b[34m(self, size, read)\u001b[39m\n\u001b[32m 397\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m remaining > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m398\u001b[39m chunk = \u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mremaining\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 399\u001b[39m n = \u001b[38;5;28mlen\u001b[39m(chunk)\n",
1184
- "\u001b[31mKeyboardInterrupt\u001b[39m: ",
1185
- "\nDuring handling of the above exception, another exception occurred:\n",
1186
- "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
1187
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 103\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m datasets.ImageFolder(root=\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mOUTPUT_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/train\u001b[39m\u001b[33m'\u001b[39m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[34m__name__\u001b[39m == \u001b[33m\"\u001b[39m\u001b[33m__main__\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
1188
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 84\u001b[39m, in \u001b[36mmain\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 81\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m SPLITS:\n\u001b[32m 82\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m📦 Processing SPLIT: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit.upper()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 85\u001b[39m \u001b[43m \u001b[49m\u001b[43msave_image_worker\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 86\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatched\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 87\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Process 100 images per task\u001b[39;49;00m\n\u001b[32m 88\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_indices\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# We need the index for the filename\u001b[39;49;00m\n\u001b[32m 89\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[43m=\u001b[49m\u001b[43mNUM_PROC\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Parallel speed!\u001b[39;49;00m\n\u001b[32m 90\u001b[39m \u001b[43m \u001b[49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 91\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msplit_name\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 92\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43moutput_root\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mOUTPUT_DIR\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 93\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43midx_to_class\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43midx_to_class\u001b[49m\n\u001b[32m 94\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mSaving \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 96\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 98\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m✅ Download and Extraction Complete!\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 99\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m You can now load this in PyTorch using:\u001b[39m\u001b[33m\"\u001b[39m)\n",
1189
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:562\u001b[39m, in \u001b[36mtransmit_format.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 555\u001b[39m self_format = {\n\u001b[32m 556\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtype\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_type,\n\u001b[32m 557\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mformat_kwargs\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_kwargs,\n\u001b[32m 558\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcolumns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_columns,\n\u001b[32m 559\u001b[39m \u001b[33m\"\u001b[39m\u001b[33moutput_all_columns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._output_all_columns,\n\u001b[32m 560\u001b[39m }\n\u001b[32m 561\u001b[39m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m562\u001b[39m out: Union[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mDatasetDict\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 563\u001b[39m datasets: \u001b[38;5;28mlist\u001b[39m[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mlist\u001b[39m(out.values()) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# re-apply format to the output\u001b[39;00m\n",
1190
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:3332\u001b[39m, in \u001b[36mDataset.map\u001b[39m\u001b[34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc, try_original_type)\u001b[39m\n\u001b[32m 3329\u001b[39m os.environ = prev_env\n\u001b[32m 3330\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSpawning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_proc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m processes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m3332\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miflatmap_unordered\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3333\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mDataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_single\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs_iterable\u001b[49m\u001b[43m=\u001b[49m\u001b[43munprocessed_kwargs_per_job\u001b[49m\n\u001b[32m 3334\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3335\u001b[39m \u001b[43m \u001b[49m\u001b[43mcheck_if_shard_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3337\u001b[39m pool.close()\n",
1191
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:626\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 624\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m pool_changed:\n\u001b[32m 625\u001b[39m \u001b[38;5;66;03m# we get the result in case there's an error to raise\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m626\u001b[39m [\u001b[43masync_result\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m async_result \u001b[38;5;129;01min\u001b[39;00m async_results]\n",
1192
- "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py:770\u001b[39m, in \u001b[36mApplyResult.get\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 768\u001b[39m \u001b[38;5;28mself\u001b[39m.wait(timeout)\n\u001b[32m 769\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.ready():\n\u001b[32m--> \u001b[39m\u001b[32m770\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m\n\u001b[32m 771\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._success:\n\u001b[32m 772\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n",
1193
- "\u001b[31mTimeoutError\u001b[39m: "
1194
- ]
1195
- }
1196
- ],
1197
- "source": [
1198
- "import os\n",
1199
- "import io\n",
1200
- "import multiprocessing\n",
1201
- "from datasets import load_dataset, Image as HFImage\n",
1202
- "from PIL import Image, UnidentifiedImageError\n",
1203
- "\n",
1204
- "# ================= CONFIGURATION =================\n",
1205
- "OUTPUT_DIR = \"rvl_cdip_data\" # Where data will be saved\n",
1206
- "NUM_PROC = os.cpu_count() # Use all available CPU cores\n",
1207
- "SPLITS = ['train', 'val', 'test'] # Splits to process\n",
1208
- "# =================================================\n",
1209
- "\n",
1210
- "def save_image_worker(batch, indices, split_name, output_root, idx_to_class):\n",
1211
- " \"\"\"\n",
1212
- " Worker function that runs on multiple CPU cores.\n",
1213
- " Receives raw image bytes, decodes them safely, and saves to disk.\n",
1214
- " \"\"\"\n",
1215
- " # 1. Unpack batch\n",
1216
- " # Since we used decode=False, 'image' contains a dict with 'bytes'\n",
1217
- " images_data = batch['image'] \n",
1218
- " labels = batch['label']\n",
1219
- " \n",
1220
- " for i, (img_data, label_idx, original_idx) in enumerate(zip(images_data, labels, indices)):\n",
1221
- " try:\n",
1222
- " # 2. Determine Paths\n",
1223
- " class_name = idx_to_class[label_idx]\n",
1224
- " target_folder = os.path.join(output_root, split_name, class_name)\n",
1225
- " filename = f\"{original_idx}.png\"\n",
1226
- " file_path = os.path.join(target_folder, filename)\n",
1227
- " \n",
1228
- " # 3. RESUME LOGIC (The \"Skip\" Check)\n",
1229
- " # If file exists and is not empty, skip it.\n",
1230
- " if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
1231
- " continue\n",
1232
- " \n",
1233
- " # 4. Create Directory (Lazy Creation)\n",
1234
- " # We do this here to ensure it exists before writing\n",
1235
- " os.makedirs(target_folder, exist_ok=True)\n",
1236
- " \n",
1237
- " # 5. Decode Image Safely\n",
1238
- " # We manually open the bytes. If this fails, we catch the error below.\n",
1239
- " image_bytes = img_data['bytes']\n",
1240
- " with Image.open(io.BytesIO(image_bytes)) as img:\n",
1241
- " # Convert to RGB (standard for PyTorch ResNet)\n",
1242
- " if img.mode != 'RGB':\n",
1243
- " img = img.convert('RGB')\n",
1244
- " \n",
1245
- " # Save to disk\n",
1246
- " img.save(file_path)\n",
1247
- "\n",
1248
- " except (UnidentifiedImageError, OSError, ValueError) as e:\n",
1249
- " # 6. Error Handling\n",
1250
- " # Instead of crashing the whole script, we just log this one failure.\n",
1251
- " print(f\"[Worker] Skipping corrupt image ID {original_idx} in {split_name}: {e}\")\n",
1252
- " \n",
1253
- " return batch\n",
1254
- "\n",
1255
- "def main():\n",
1256
- " print(f\"🚀 Starting RVL-CDIP Downloader\")\n",
1257
- " print(f\" Target Folder: {os.path.abspath(OUTPUT_DIR)}\")\n",
1258
- " print(f\" Workers: {NUM_PROC}\")\n",
1259
- " \n",
1260
- " # 1. Load Dataset\n",
1261
- " # Assuming you are logged into Hugging Face or have access\n",
1262
- " print(\" Loading dataset structure from Hugging Face...\")\n",
1263
- " dataset = load_dataset(\"chainyo/rvl-cdip\") \n",
1264
- "\n",
1265
- " # 2. Setup Class Mapping\n",
1266
- " labels_feature = dataset['train'].features['label']\n",
1267
- " idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
1268
- " print(f\" Found {len(idx_to_class)} categories.\")\n",
1269
- "\n",
1270
- " # 3. CRITICAL: Disable Auto-Decoding\n",
1271
- " # This prevents the Iterator from crashing when it hits a corrupt file.\n",
1272
- " # We will handle decoding manually in the worker function.\n",
1273
- " print(\" Configuring dataset for safe raw access...\")\n",
1274
- " for split in SPLITS:\n",
1275
- " dataset[split] = dataset[split].cast_column(\"image\", HFImage(decode=False))\n",
1276
- "\n",
1277
- " # 4. Execute Parallel Processing\n",
1278
- " for split in SPLITS:\n",
1279
- " print(f\"\\n📦 Processing SPLIT: {split.upper()}\")\n",
1280
- " \n",
1281
- " dataset[split].map(\n",
1282
- " save_image_worker,\n",
1283
- " batched=True,\n",
1284
- " batch_size=100, # Process 100 images per task\n",
1285
- " with_indices=True, # We need the index for the filename\n",
1286
- " num_proc=NUM_PROC, # Parallel speed!\n",
1287
- " fn_kwargs={\n",
1288
- " 'split_name': split,\n",
1289
- " 'output_root': OUTPUT_DIR,\n",
1290
- " 'idx_to_class': idx_to_class\n",
1291
- " },\n",
1292
- " desc=f\"Saving {split}\"\n",
1293
- " )\n",
1294
- "\n",
1295
- " print(f\"\\n✅ Download and Extraction Complete!\")\n",
1296
- " print(f\" You can now load this in PyTorch using:\")\n",
1297
- " print(f\" datasets.ImageFolder(root='{OUTPUT_DIR}/train')\")\n",
1298
- "\n",
1299
- "if __name__ == \"__main__\":\n",
1300
- " main()"
1301
- ]
1302
- },
1303
  {
1304
  "cell_type": "code",
1305
  "execution_count": null,
1306
- "id": "f440bb56",
1307
  "metadata": {},
1308
  "outputs": [],
1309
  "source": []
@@ -1325,7 +529,7 @@
1325
  "name": "python",
1326
  "nbconvert_exporter": "python",
1327
  "pygments_lexer": "ipython3",
1328
- "version": "3.12.12"
1329
  }
1330
  },
1331
  "nbformat": 4,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 7,
6
  "id": "ae9bc87a",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
10
  "from datasets import load_dataset\n",
11
+ "import datasets\n",
12
+ "from tqdm.notebook import tqdm"
13
  ]
14
  },
15
  {
16
  "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "d5bc67fe",
19
  "metadata": {},
20
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "source": [
22
  "ds = load_dataset(\"chainyo/rvl-cdip\")"
23
  ]
24
  },
25
  {
26
+ "cell_type": "markdown",
27
+ "id": "85f49eeb",
 
28
  "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  "source": [
30
+ "## Creates the \"rvl_cdip_data\" dir"
31
  ]
32
  },
33
  {
34
  "cell_type": "code",
35
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "id": "936deafa",
37
  "metadata": {},
38
  "outputs": [
 
40
  "name": "stdout",
41
  "output_type": "stream",
42
  "text": [
43
+ "🚀 Starting RVL-CDIP Downloader (Disk Optimized)\n",
44
+ " Target Folder: /Users/arpit-zstch1557/Projects/document-classification/rvl_cdip_data\n",
45
+ " Workers: 12\n",
46
+ " Loading dataset structure from Hugging Face...\n"
47
  ]
48
  },
49
  {
50
  "data": {
51
  "application/vnd.jupyter.widget-view+json": {
52
+ "model_id": "0a79c4079dd44915af9193231077adc9",
53
  "version_major": 2,
54
  "version_minor": 0
55
  },
56
  "text/plain": [
57
+ "Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
58
  ]
59
  },
60
  "metadata": {},
61
  "output_type": "display_data"
62
  },
 
 
 
 
 
 
 
 
63
  {
64
  "data": {
65
  "application/vnd.jupyter.widget-view+json": {
66
+ "model_id": "105602455af94c04a85e8dd5eed8e1bb",
67
  "version_major": 2,
68
  "version_minor": 0
69
  },
70
  "text/plain": [
71
+ "Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
72
  ]
73
  },
74
  "metadata": {},
 
78
  "name": "stdout",
79
  "output_type": "stream",
80
  "text": [
81
+ " Found 16 categories.\n",
82
+ " Configuring dataset for safe raw access...\n",
83
  "\n",
84
+ "📦 Processing SPLIT: TRAIN\n"
85
  ]
86
  },
87
  {
88
  "data": {
89
  "application/vnd.jupyter.widget-view+json": {
90
+ "model_id": "59511ab60fd047758ad0d5671f5f6789",
91
  "version_major": 2,
92
  "version_minor": 0
93
  },
94
  "text/plain": [
95
+ "Saving train (num_proc=12): 0%| | 0/319999 [00:00<?, ? examples/s]"
96
  ]
97
  },
98
  "metadata": {},
99
  "output_type": "display_data"
100
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  {
102
  "name": "stdout",
103
  "output_type": "stream",
104
  "text": [
105
+ "\n",
106
+ "📦 Processing SPLIT: VAL\n"
107
  ]
108
  },
109
  {
110
  "data": {
111
  "application/vnd.jupyter.widget-view+json": {
112
+ "model_id": "19ac18978db046e0aea0cbf7da2748ba",
113
  "version_major": 2,
114
  "version_minor": 0
115
  },
116
  "text/plain": [
117
+ "Saving val (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
118
  ]
119
  },
120
  "metadata": {},
121
  "output_type": "display_data"
122
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  {
124
  "name": "stdout",
125
  "output_type": "stream",
126
  "text": [
127
+ "\n",
128
+ "📦 Processing SPLIT: TEST\n"
129
  ]
130
  },
131
  {
132
  "data": {
133
  "application/vnd.jupyter.widget-view+json": {
134
+ "model_id": "890739f28bd2495bacc55ea33099c2f2",
135
  "version_major": 2,
136
  "version_minor": 0
137
  },
138
  "text/plain": [
139
+ "Saving test (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
140
  ]
141
  },
142
  "metadata": {},
143
  "output_type": "display_data"
144
  },
145
+ {
146
+ "name": "stderr",
147
+ "output_type": "stream",
148
+ "text": [
149
+ "/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.13/site-packages/PIL/TiffImagePlugin.py:949: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
150
+ " warnings.warn(str(msg))\n"
151
+ ]
152
+ },
153
  {
154
  "name": "stdout",
155
  "output_type": "stream",
156
  "text": [
157
+ "[Worker] Skipping corrupt image ID 34965 in test: cannot identify image file <_io.BytesIO object at 0x371a331a0>\n",
158
  "\n",
159
+ " Download and Extraction Complete!\n",
160
+ " You can now load this in PyTorch using:\n",
161
+ " datasets.ImageFolder(root='rvl_cdip_data/train')\n"
 
162
  ]
163
  }
164
  ],
165
  "source": [
166
  "import os\n",
167
  "import io\n",
168
+ "from datasets import load_dataset, Image as HFImage\n",
169
  "from PIL import Image, UnidentifiedImageError\n",
170
  "\n",
171
+ "OUTPUT_DIR = \"rvl_cdip_data\" # Where data will be saved\n",
172
+ "NUM_PROC = os.cpu_count() # Use all available CPU cores\n",
173
+ "SPLITS = ['train', 'val', 'test'] # Splits to process\n",
174
  "\n",
175
+ "def save_image_worker(batch, indices, split_name, output_root, idx_to_class):\n",
176
+ " # Unpack batch\n",
177
+ " images_data = batch['image'] \n",
178
+ " labels = batch['label']\n",
179
+ " \n",
180
+ " for i, (img_data, label_idx, original_idx) in enumerate(zip(images_data, labels, indices)):\n",
181
+ " try:\n",
182
+ " # Determine Paths\n",
183
+ " class_name = idx_to_class[label_idx]\n",
184
+ " target_folder = os.path.join(output_root, split_name, class_name)\n",
185
+ " filename = f\"{original_idx}.png\"\n",
186
+ " file_path = os.path.join(target_folder, filename)\n",
187
+ " \n",
188
+ " if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
189
+ " continue\n",
190
+ " \n",
191
+ " # Create Directory\n",
192
+ " os.makedirs(target_folder, exist_ok=True)\n",
193
+ " \n",
194
+ " # 5. Decode Image Safely\n",
195
+ " image_bytes = img_data['bytes']\n",
196
+ " with Image.open(io.BytesIO(image_bytes)) as img:\n",
197
+ " if img.mode != 'RGB':\n",
198
+ " img = img.convert('RGB')\n",
199
+ " img.save(file_path)\n",
200
  "\n",
201
+ " except (UnidentifiedImageError, OSError, ValueError) as e:\n",
202
+ " print(f\"[Worker] Skipping corrupt image ID {original_idx} in {split_name}: {e}\")\n",
203
+ " \n",
204
+ " return {}\n",
205
  "\n",
206
+ "def main():\n",
207
+ " print(f\"🚀 Starting RVL-CDIP Downloader (Disk Optimized)\")\n",
208
+ " print(f\" Target Folder: {os.path.abspath(OUTPUT_DIR)}\")\n",
209
+ " print(f\" Workers: {NUM_PROC}\")\n",
210
+ " \n",
211
+ " # Load Dataset\n",
212
+ " print(\" Loading dataset structure from Hugging Face...\")\n",
213
+ " dataset = load_dataset(\"chainyo/rvl-cdip\") \n",
214
  "\n",
215
+ " # Setup Class Mapping\n",
216
+ " labels_feature = dataset['train'].features['label']\n",
217
+ " idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
218
+ " print(f\" Found {len(idx_to_class)} categories.\")\n",
 
 
 
 
 
 
 
 
 
219
  "\n",
220
+ " # Disable Auto-Decoding (Prevents crashes on corrupt files)\n",
221
+ " print(\" Configuring dataset for safe raw access...\")\n",
222
+ " for split in SPLITS:\n",
223
+ " dataset[split] = dataset[split].cast_column(\"image\", HFImage(decode=False))\n",
 
 
 
224
  "\n",
225
+ " # Execute Parallel Processing\n",
226
+ " for split in SPLITS:\n",
227
+ " print(f\"\\n📦 Processing SPLIT: {split.upper()}\")\n",
228
  " \n",
229
+ " # We use remove_columns to ensure the output dataset is empty\n",
230
+ " # This prevents the 50GB duplicate cache file.\n",
231
+ " dataset[split].map(\n",
232
+ " save_image_worker,\n",
233
+ " batched=True,\n",
234
+ " batch_size=100,\n",
235
+ " with_indices=True,\n",
236
+ " num_proc=NUM_PROC,\n",
237
+ " remove_columns=dataset[split].column_names, \n",
238
+ " fn_kwargs={\n",
239
+ " 'split_name': split,\n",
240
+ " 'output_root': OUTPUT_DIR,\n",
241
+ " 'idx_to_class': idx_to_class\n",
242
+ " },\n",
243
+ " desc=f\"Saving {split}\"\n",
244
+ " )\n",
245
  "\n",
246
+ " print(f\"\\n✅ Download and Extraction Complete!\")\n",
247
+ " print(f\" You can now load this in PyTorch using:\")\n",
248
+ " print(f\" datasets.ImageFolder(root='{OUTPUT_DIR}/train')\")\n",
 
249
  "\n",
250
+ "if __name__ == \"__main__\":\n",
251
+ " main()"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  ]
253
  },
254
  {
 
256
  "id": "c8530c8e",
257
  "metadata": {},
258
  "source": [
259
+ "## Checking the Data Imbalance in ds (from HF)"
260
  ]
261
  },
262
  {
263
  "cell_type": "code",
264
+ "execution_count": 8,
265
  "id": "2785360c",
266
  "metadata": {},
267
  "outputs": [
 
329
  "from collections import Counter\n",
330
  "import pandas as pd\n",
331
  "\n",
332
+ "# Setup\n",
333
  "splits = ['train', 'val', 'test']\n",
334
  "label_feature = ds['train'].features['label']\n",
335
+ "int2str = label_feature.int2str \n",
336
  "\n",
337
  "print(f\"{'SPLIT':<10} {'CLASS NAME':<25} {'COUNT':<10} {'STATUS'}\")\n",
338
  "print(\"-\" * 60)\n",
339
  "\n",
340
  "for split in splits:\n",
341
+ " # Get all labels (Load only the label column into memory)\n",
342
  " # This is instant compared to loading images\n",
343
  " labels = ds[split]['label']\n",
344
  " \n",
345
+ " # Count frequencies\n",
346
  " counts = Counter(labels)\n",
347
  " \n",
348
+ " # Analyze each class\n",
349
  " # We sort by class ID to keep it organized\n",
350
  " for label_id in sorted(counts.keys()):\n",
351
  " count = counts[label_id]\n",
352
  " class_name = int2str(label_id)\n",
353
  " \n",
354
+ " # Define Expected Counts based on the Paper\n",
355
  " # Train: 320k / 16 = 20,000\n",
356
  " # Test/Val: 40k / 16 = 2,500\n",
357
  " if split == 'train':\n",
 
371
  "id": "5f7b75a2",
372
  "metadata": {},
373
  "source": [
374
+ "## Checking the data imbalance in \"rvl_cdip_data\" dir"
375
  ]
376
  },
377
  {
378
  "cell_type": "code",
379
+ "execution_count": 9,
380
  "id": "059bfaa5",
381
  "metadata": {},
382
  "outputs": [
 
384
  "name": "stdout",
385
  "output_type": "stream",
386
  "text": [
387
+ "📂 Scanning directory: /Users/arpit-zstch1557/Projects/document-classification/rvl_cdip_data\n",
388
  "SPLIT CLASS NAME FILES STATUS\n",
389
  "-----------------------------------------------------------------\n",
390
  "TRAIN advertisement 19963 ❌ MISMATCH (Exp: 20000)\n",
 
448
  "import pandas as pd\n",
449
  "\n",
450
  "# Configuration\n",
451
+ "DATA_DIR = \"rvl_cdip_data\" # Your directory name\n",
452
  "splits = ['train', 'val', 'test']\n",
453
  "\n",
454
  "print(f\"📂 Scanning directory: {os.path.abspath(DATA_DIR)}\")\n",
 
485
  " # Determine Expected Count based on the paper\n",
486
  " if split == 'train':\n",
487
  " expected = 20000 \n",
 
488
  " else:\n",
489
  " expected = 2500\n",
490
  "\n",
491
  " # Status Check\n",
 
492
  " if file_count == expected:\n",
493
  " status = \"✅ OK\"\n",
494
  " elif abs(file_count - expected) < 5: \n",
 
504
  "print(\"\\nAnalysis Complete.\")"
505
  ]
506
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  {
508
  "cell_type": "code",
509
  "execution_count": null,
510
+ "id": "4ef697a2",
511
  "metadata": {},
512
  "outputs": [],
513
  "source": []
 
529
  "name": "python",
530
  "nbconvert_exporter": "python",
531
  "pygments_lexer": "ipython3",
532
+ "version": "3.13.11"
533
  }
534
  },
535
  "nbformat": 4,