DaniKaEp committed on
Commit
f7e5558
·
verified ·
1 Parent(s): 58811f3

Upload era_data.py

Browse files
Files changed (1) hide show
  1. era_data.py +311 -0
era_data.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from glob import glob
3
+ from torch.utils.data import Dataset
4
+ import os
5
+ from PIL import Image
6
+ import numpy as np
7
+ import cv2
8
+
9
+
10
def get_IDS(IMG_DIR='output/images_preprocessed', era=False, CATALOGUE_FN='output/cdli_catalogue_data.csv'):
    """Return the IDs of the PNG images found in ``IMG_DIR``.

    An ID is the image file name without its extension.

    Args:
        IMG_DIR: Directory searched (non-recursively) for ``*.png`` files.
        era: When true, keep only IDs that also occur in the ``id_text``
            column of the catalogue on a row whose ``era`` is not missing.
        CATALOGUE_FN: Path of the CSV catalogue; read only when ``era`` is
            true.

    Returns:
        A list of ID strings. The order follows ``glob`` when ``era`` is
        false and is unspecified (set order) when ``era`` is true.
    """
    # splitext removes exactly one extension. str.rstrip('.png') strips a
    # *set of characters* and would also eat trailing 'p', 'n', 'g' or '.'
    # characters that belong to the ID itself.
    IDS = [
        os.path.splitext(os.path.basename(fn))[0]
        for fn in glob(os.path.join(IMG_DIR, '*.png'))
    ]
    if not era:
        return IDS

    catalogue = pd.read_csv(
        CATALOGUE_FN, usecols=['id_text', 'era'], dtype={'id_text': object}
    ).dropna(subset=['era'])
    return list(set(IDS) & set(catalogue['id_text']))
18
+
19
def pad_zeros(x):
    """Left-pad ``str(x)`` with ``'0'`` characters to a width of six.

    Values whose string form already has six or more characters are
    returned (as strings) unchanged.
    """
    return str(x).rjust(6, '0')
22
+
23
class TabletEraDataset(Dataset):
    """Dataset yielding ``(image, era_index)`` pairs for tablet images.

    Images are the ``*.png`` files in ``IMG_DIR``; the ID of an image is its
    file name without the extension. Era labels are looked up by ID in the
    ``id_text``/``era`` columns of the catalogue CSV.
    """

    # Era name (as it appears in the catalogue) -> class index.
    ERA_INDICES = {
        'early_bronze': 0,
        'mid_late_bronze': 1,
        'iron': 2
    }

    def __init__(self, CATALOGUE_FN='output/cdli_catalogue_data.csv', IMG_DIR='output/images_preprocessed', IDS=None):
        """Index the images and load the ID -> era mapping.

        Args:
            CATALOGUE_FN: CSV with at least ``id_text`` and ``era`` columns.
                Rows with a missing ``era`` are ignored.
            IMG_DIR: Directory searched (non-recursively) for ``*.png`` files.
            IDS: Optional iterable of IDs; when given, only images whose ID
                is in it are kept.
        """
        catalogue = pd.read_csv(
            CATALOGUE_FN, usecols=['id_text', 'era'], dtype={'id_text': object}
        ).dropna(subset=['era'])
        self.id2era = catalogue.set_index('id_text')['era'].to_dict()

        self.img_fns = glob(os.path.join(IMG_DIR, '*.png'))
        # splitext drops exactly the extension; str.rstrip('.png') would also
        # strip trailing 'p'/'n'/'g'/'.' characters that are part of the ID.
        self.IDS = [
            os.path.splitext(os.path.basename(fn))[0] for fn in self.img_fns
        ]

        if IDS is not None:
            print(f'Filtering {len(self.IDS)} IDS down to provided {len(IDS)}...')
            IDS_set = set(IDS)
            kept = [
                (fn, ID) for fn, ID in zip(self.img_fns, self.IDS)
                if ID in IDS_set
            ]
            self.img_fns = [fn for fn, _ in kept]
            self.IDS = [ID for _, ID in kept]

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.IDS)

    def __getitem__(self, idx):
        """Return ``(image, era_index)`` for the image at position ``idx``.

        The image is a float32 array holding the pixel values divided by 255.

        Raises:
            KeyError: If the image ID has no era in the catalogue, or the era
                name is not a key of ``ERA_INDICES``.
        """
        fn = self.img_fns[idx]
        era = self.id2era[self.IDS[idx]]
        # Close the file handle deterministically instead of leaving it to
        # garbage collection.
        with Image.open(fn) as im:
            img = np.asarray(im)
        return img.astype(np.float32) / 255, self.ERA_INDICES[era]
55
+
56
+
57
class TabletPeriodDataset(Dataset):
    """Dataset yielding tablet images with period, genre and provenience labels.

    Images are the ``*.png`` files in ``IMG_DIR``; the ID of an image is its
    file name without the extension. Labels are looked up by ID in the
    catalogue CSV, whose ``id_text`` values are left-padded with zeros to six
    characters before the lookup. IDs or label values that are not known fall
    back to a default class (period 0 'other', genre 8 'uncertain',
    provenience 3 'unknown').
    """

    # based on (normed) periods with at least 100 photos:
    PERIOD_INDICES = {

        'other': 0,
        'Ur III': 1,
        'Neo-Assyrian': 2,
        'Old Babylonian': 3,
        'Middle Babylonian': 4,
        'Neo-Babylonian': 5,
        'Old Akkadian': 6,
        'Achaemenid': 7,
        'Early Old Babylonian': 8,
        'ED IIIb': 9,
        'Middle Assyrian': 10,
        'Old Assyrian': 11,
        'Uruk III': 12,
        'Proto-Elamite': 13,
        'Lagash II': 14,
        'Ebla': 15,
        'ED IIIa': 16,
        'Hellenistic': 17,
        'ED I-II': 18,
        'Middle Elamite': 19,
        'Hittite': 20,
        'Uruk IV': 21
    }

    # Provenience name -> class index. Index 3 ('unknown') doubles as the
    # fallback for proveniences that are missing or not listed here.
    PROVENIENCE_INDICES = {
        'Nineveh': 1,
        'Nippur': 2,
        'unknown': 3,
        'Umma': 4,
        'Puzris-Dagan': 5,
        'Girsu': 6,
        'Ur': 7,
        'Uruk': 8,
        'Kanesh': 9,
        'Assur': 10,
        'Adab': 11,
        'Garsana': 12,
        'Gasur/Nuzi': 13,
        'Susa': 14,
        'Sippar-Yahrurum': 15,
        'Larsa': 16,
        'Nerebtum': 17,
        'mod. Babylonia': 18,
        'Parsa': 19,
        'Kish': 20,
        'Kalhu': 21,
        'Tuttul': 22,
        'Suruppak': 23,
        'Babili': 24,
        'Ebla': 25,
        'mod. Beydar': 26,
        'Akhetaten': 27,
        'Esnunna': 28,
        'Borsippa': 29,
        'Kar-Tukulti-Ninurta': 30,
        'mod. Jemdet Nasr': 31,
        'mod. northern Babylonia': 32,
        'Alalakh': 33,
        'Hattusa': 34,
        'Isin': 35,
        'Elbonia': 36,
        'Sibaniba': 37,
        'Tutub': 38,
        'Pi-Kasi': 39,
        'Irisagrig': 40,
        'Ansan': 41,
        'Dilbat': 42,
        'Zabalam': 43,
        'mod. Mugdan/ Umm al-Jir': 44,
        'Marad': 45,
        'Eridu': 46,
        'Seleucia': 47,
        'mod. Abu Halawa': 48,
        'Dur-Untas': 49,
        'Nagar': 50,
        'Lagaba': 51,
        'Asnakkum': 52,
        'Dur-Kurigalzu': 53,
        'mod. Tell Sabaa': 54,
        'mod. Abu Jawan': 55,
        'mod. Tell Fakhariyah': 56,
        'Dur-Abi-esuh': 57,
        'Ugarit': 58,
        'mod. Diqdiqqah': 59,
        'Tarbisu': 60,
        'Lagash': 61,
        'Kisurra': 62,
        'Elammu': 63,
        'Du-Enlila': 64,
        'Kutha': 65,
        'mod. Umm el-Hafriyat': 66,
        'Dur-Sarrukin': 67,
        'Bad-Tibira': 68,
        'Bit-zerija': 69,
        'Kilizu': 70,
        'mod. Pasargadae': 71,
        'Abdju': 72,
        'Surmes': 73,
        'mod. Qatibat': 74,
        'Tigunanum': 75,
        'mod. Tell al-Lahm': 76,
        'mod. Mesopotamia': 77,
        'Subat-Enlil': 78,
        'mod. Konar Sandal': 79,
        'Gissi': 80,
        'Agamatanu': 81,
        'Aqa': 82,
        'Kapri-sa-naqidati': 83,
        'Esura': 84,
        'Nahalla': 85,
        'Bit-Sahtu': 86,
        'mod. Sepphoris': 87,
        'Dusabar': 88,
        'mod. Tell Sifr': 89,
        'Nasir': 90,
        'Kumu': 91,
        'Kazallu': 92,
        'Kapru': 93,
        'Hurruba': 94,
        'mod. Deh-e-no, Iran': 95,
        "mod. Za'aleh": 96,
        'mod. Tepe Farukhabad': 97,
        'Hursagkalama': 98,
        'Carchemish': 99,
        'mod. Ben Shemen, Israel': 100,
        'Kutalla': 101,
        'Der': 102,
        'Imgur-Enlil': 103,
        'mod. Hillah': 104,
        'mod. Uhudu': 105,
        'mod. Mahmudiyah': 106,
        'Terqa': 107,
        'Arrapha': 108,
        'mod. Tell en-Nasbeh': 109,
        'mod. Kalah Shergat': 110,
        'Kar-Nabu': 111,
        'Harran': 112,
        'mod. Til-Buri': 113,
        'Shuruppak': 114,
        'mod. Abu Salabikh': 115,
        "Ma'allanate": 116,
        'Kar-Mullissu': 117,
        'mod. Naqs-i-Rustam': 118
    }

    # Genre string -> class index. Several catalogue spellings (trailing
    # ' ?', ';'-joined combinations) share one index. Index 8 ('uncertain')
    # doubles as the fallback for missing or unlisted genres.
    GENRE_INDICES = {

        'Administrative': 1,
        'Letter': 2,
        'Legal': 3,
        'Royal/Monumental': 4,
        'Literary': 5,
        'Lexical': 6,
        'Omen': 7,
        'uncertain': 8,
        'Administrative ?': 1,
        'School': 9,
        'Mathematical': 10,
        'Prayer/Incantation': 11,
        'Lexical ?': 6,
        'Scientific': 12,
        'Ritual': 13,
        'Letter ?': 2,
        'Literary ?': 5,
        'fake (modern)': 14,
        'Lexical; Literary': 6,
        'Legal ?': 3,
        'Literary; Mathematical': 5,
        'Astronomical': 15,
        'Lexical; Mathematical': 6,
        'School ?': 9,
        'Mathematical ?': 10,
        'Royal/Monumental ?': 4,
        'Private/Votive': 16,
        'fake (modern) ?': 14,
        'Other (see subgenre)': 8,
        # NOTE(review): 'Historical' shares index 2 with 'Letter' -- confirm
        # this is intended before relying on the genre labels.
        'Historical': 2,
        'Literary; Lexical': 5,
        'Lexical; Literary; Mathematical': 6,
        'Literary; Administrative': 5,
        'Literary; Letter': 5,
        'Scientific ?': 12,
        'Royal/Monumental; Literary': 4,
        'Private/Votive ?': 16,
        'School; Literary': 9,
        'Prayer/Incantation ?': 11,
        'Ritual ?': 13,
        'Lexical; School': 6
    }

    def __init__(self, CATALOGUE_FN='output/cdli_catalogue_data.csv', IMG_DIR='output/images', IDS=None, mask=False):
        """Index the images and load the ID -> label mappings.

        Args:
            CATALOGUE_FN: CSV with ``id_text``, ``era``, ``period_normed``,
                ``provenience_normed`` and ``genre`` columns. Rows with a
                missing ``era`` or a ``period_normed`` that is not a key of
                ``PERIOD_INDICES`` are ignored.
            IMG_DIR: Directory searched (non-recursively) for ``*.png`` files.
            IDS: Optional iterable of IDs; when given, only images whose ID
                is in it are kept.
            mask: When true, ``__getitem__`` returns a thresholded 0/1 image
                instead of the blurred grey levels.
        """
        df = pd.read_csv(
            CATALOGUE_FN, usecols=['id_text', 'era', 'period_normed', 'provenience_normed', 'genre'], dtype={'id_text': object}
        ).dropna(subset=['era'])

        # Catalogue IDs are zero-padded so they can be matched against the
        # image file names (presumably six-character names -- TODO confirm).
        df["id_text"] = df.id_text.apply(pad_zeros)
        df = df[df['period_normed'].isin(TabletPeriodDataset.PERIOD_INDICES.keys())]

        # Index once and pull each label column from the same frame.
        by_id = df.set_index('id_text')
        self.id2period = by_id['period_normed'].to_dict()
        self.id2provenience = by_id['provenience_normed'].to_dict()
        self.id2genre = by_id['genre'].to_dict()
        # Kept as a separate dict for callers that still read ``.genre``.
        self.genre = dict(self.id2genre)

        self.img_fns = glob(os.path.join(IMG_DIR, '*.png'))
        self.IDS = [self._image_id(fn) for fn in self.img_fns]

        if IDS is not None:
            print(f'Filtering {len(self.IDS)} IDS down to provided {len(IDS)}...')
            IDS_set = set(IDS)
            kept = [
                (fn, ID) for fn, ID in zip(self.img_fns, self.IDS)
                if ID in IDS_set
            ]
            self.img_fns = [fn for fn, _ in kept]
            self.IDS = [ID for _, ID in kept]

        self.mask = mask

    @staticmethod
    def _image_id(fn):
        """Return the file name of ``fn`` without directory or extension."""
        # splitext drops exactly the extension; str.rstrip('.png') would also
        # strip trailing 'p'/'n'/'g'/'.' characters that are part of the ID.
        return os.path.splitext(os.path.basename(fn))[0]

    def _label_indices(self, ID):
        """Return ``(period, genre, provenience)`` class indices for ``ID``.

        An ID missing from a mapping yields ``None``, which (like any value
        not listed in the index table, e.g. NaN) resolves to the default
        class: 0 for period, 8 for genre, 3 for provenience.
        """
        return (
            self.PERIOD_INDICES.get(self.id2period.get(ID), 0),
            self.GENRE_INDICES.get(self.id2genre.get(ID), 8),
            self.PROVENIENCE_INDICES.get(self.id2provenience.get(ID), 3),
        )

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.IDS)

    def __getitem__(self, idx):
        """Return ``(ID, image, period_idx, genre_idx, provenience_idx)``.

        The image is loaded, converted to float32, divided by 255 and
        smoothed with an 11x11 Gaussian blur. When ``self.mask`` is true it
        is then thresholded at 0.125 into a float32 array of 0s and 1s.
        """
        fn = self.img_fns[idx]
        ID = self.IDS[idx]
        period_idx, genre_idx, provenience_idx = self._label_indices(ID)

        # Close the file handle deterministically instead of leaving it to
        # garbage collection.
        with Image.open(fn) as im:
            img = np.asarray(im)
        # NOTE: a contrast-scaled copy (cv2.convertScaleAbs, alpha=3) used to
        # be computed here but was never used; it was dropped as dead work.
        img = img.astype(np.float32) / 255
        img = cv2.GaussianBlur(img, (11, 11), 0)
        if self.mask:
            # 0.25 was great for most besides the really dark ones
            img = (img > 0.125).astype(np.float32)

        return ID, img, period_idx, genre_idx, provenience_idx