Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from glob import glob | |
| from torch.utils.data import Dataset | |
| import os | |
| from PIL import Image | |
| import numpy as np | |
| import cv2 | |
def get_IDS(IMG_DIR='output/images_preprocessed', era=False, CATALOGUE_FN='output/cdli_catalogue_data.csv'):
    """Return the IDs of all PNG images found directly inside ``IMG_DIR``.

    An ID is the image file name without its ``.png`` extension.

    Args:
        IMG_DIR: Directory that is searched (non-recursively) for ``*.png``.
        era: When true, keep only IDs whose catalogue row has a non-null
            ``era`` value.
        CATALOGUE_FN: CSV file with at least the columns ``id_text`` and
            ``era``. Only read when ``era`` is true.

    Returns:
        A list of ID strings, in the order ``glob`` returned the files.
    """
    img_fns = glob(os.path.join(IMG_DIR, '*.png'))
    # splitext removes exactly one extension. str.rstrip('.png') would strip
    # any trailing run of the characters '.', 'p', 'n', 'g' and thereby
    # corrupt IDs that happen to end in one of those characters.
    IDS = [os.path.splitext(os.path.basename(fn))[0] for fn in img_fns]
    if era:
        catalogue = pd.read_csv(
            CATALOGUE_FN, usecols=['id_text', 'era'], dtype={'id_text': object}
        )
        ids_with_era = set(catalogue.dropna(subset=['era'])['id_text'])
        # Filter in place of a set intersection so the result keeps a
        # stable order instead of arbitrary set ordering.
        IDS = [ID for ID in IDS if ID in ids_with_era]
    return IDS
def pad_zeros(x):
    """Left-pad ``str(x)`` with ``'0'`` characters up to a width of six.

    A value whose string form already has six or more characters is
    returned (as a string) without modification.
    """
    return str(x).rjust(6, '0')
class TabletEraDataset(Dataset):
    """Dataset of tablet images labelled with an era class index.

    Each item is a tuple ``(image, era_index)`` where ``image`` is the PNG
    content as a ``float32`` array divided by 255 and ``era_index`` is the
    ``ERA_INDICES`` value for the tablet's catalogue ``era``.
    """

    ERA_INDICES = {
        'early_bronze': 0,
        'mid_late_bronze': 1,
        'iron': 2
    }

    def __init__(self, CATALOGUE_FN='output/cdli_catalogue_data.csv', IMG_DIR='output/images_preprocessed', IDS=None):
        """Index the PNG files in ``IMG_DIR`` and load the ID -> era mapping.

        Args:
            CATALOGUE_FN: CSV file with the columns ``id_text`` and ``era``;
                rows without an ``era`` value are ignored.
            IMG_DIR: Directory searched (non-recursively) for ``*.png``.
            IDS: Optional iterable of IDs. When given, only images whose ID
                is contained in it are kept.
        """
        self.id2era = pd.read_csv(
            CATALOGUE_FN, usecols=['id_text', 'era'], dtype={'id_text': object}
        ).dropna(subset=['era']).set_index('id_text').to_dict()['era']

        self.img_fns = glob(os.path.join(IMG_DIR, '*.png'))
        self.IDS = [self._id_from_path(fn) for fn in self.img_fns]

        if IDS is not None:
            print(f'Filtering {len(self.IDS)} IDS down to provided {len(IDS)}...')
            wanted = set(IDS)
            # Filter file names and IDs together so they stay aligned.
            kept = [(fn, ID) for fn, ID in zip(self.img_fns, self.IDS) if ID in wanted]
            self.img_fns = [fn for fn, _ in kept]
            self.IDS = [ID for _, ID in kept]

    @staticmethod
    def _id_from_path(fn):
        """Return the file name of ``fn`` without directory and extension."""
        # splitext removes exactly one extension; str.rstrip('.png') would
        # strip any trailing '.', 'p', 'n' or 'g' characters of the ID too.
        return os.path.splitext(os.path.basename(fn))[0]

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.IDS)

    def __getitem__(self, idx):
        """Return ``(image, era_index)`` for the image at position ``idx``.

        Raises:
            KeyError: If the image's ID has no era in the catalogue, or the
                era label is not a key of ``ERA_INDICES``.
        """
        fn = self.img_fns[idx]
        era = self.id2era[self.IDS[idx]]
        # Close the file deterministically instead of waiting for garbage
        # collection; the pixel data is copied into the array inside the block.
        with Image.open(fn) as im:
            img = np.asarray(im)
        return img.astype(np.float32) / 255, self.ERA_INDICES[era]
class TabletPeriodDataset(Dataset):
    """Dataset of tablet images labelled with period, genre and provenience.

    Each item is a tuple ``(ID, image, period_index, genre_index,
    provenience_index)``. The image is the PNG content converted to
    ``float32``, divided by 255 and Gaussian-blurred; with ``mask=True`` it
    is additionally thresholded into a 0/1 array. Labels that cannot be
    resolved fall back to the indices of ``'other'`` (period),
    ``'uncertain'`` (genre) and ``'unknown'`` (provenience).
    """

    # based on (normed) periods with at least 100 photos:
    PERIOD_INDICES = {
        'other': 0,
        'Ur III': 1,
        'Neo-Assyrian': 2,
        'Old Babylonian': 3,
        'Middle Babylonian': 4,
        'Neo-Babylonian': 5,
        'Old Akkadian': 6,
        'Achaemenid': 7,
        'Early Old Babylonian': 8,
        'ED IIIb': 9,
        'Middle Assyrian': 10,
        'Old Assyrian': 11,
        'Uruk III': 12,
        'Proto-Elamite': 13,
        'Lagash II': 14,
        'Ebla': 15,
        'ED IIIa': 16,
        'Hellenistic': 17,
        'ED I-II': 18,
        'Middle Elamite': 19,
        'Hittite': 20,
        'Uruk IV': 21
    }

    PROVENIENCE_INDICES = {
        'Nineveh': 1,
        'Nippur': 2,
        'unknown': 3,
        'Umma': 4,
        'Puzris-Dagan': 5,
        'Girsu': 6,
        'Ur': 7,
        'Uruk': 8,
        'Kanesh': 9,
        'Assur': 10,
        'Adab': 11,
        'Garsana': 12,
        'Gasur/Nuzi': 13,
        'Susa': 14,
        'Sippar-Yahrurum': 15,
        'Larsa': 16,
        'Nerebtum': 17,
        'mod. Babylonia': 18,
        'Parsa': 19,
        'Kish': 20,
        'Kalhu': 21,
        'Tuttul': 22,
        'Suruppak': 23,
        'Babili': 24,
        'Ebla': 25,
        'mod. Beydar': 26,
        'Akhetaten': 27,
        'Esnunna': 28,
        'Borsippa': 29,
        'Kar-Tukulti-Ninurta': 30,
        'mod. Jemdet Nasr': 31,
        'mod. northern Babylonia': 32,
        'Alalakh': 33,
        'Hattusa': 34,
        'Isin': 35,
        'Elbonia': 36,
        'Sibaniba': 37,
        'Tutub': 38,
        'Pi-Kasi': 39,
        'Irisagrig': 40,
        'Ansan': 41,
        'Dilbat': 42,
        'Zabalam': 43,
        'mod. Mugdan/ Umm al-Jir': 44,
        'Marad': 45,
        'Eridu': 46,
        'Seleucia': 47,
        'mod. Abu Halawa': 48,
        'Dur-Untas': 49,
        'Nagar': 50,
        'Lagaba': 51,
        'Asnakkum': 52,
        'Dur-Kurigalzu': 53,
        'mod. Tell Sabaa': 54,
        'mod. Abu Jawan': 55,
        'mod. Tell Fakhariyah': 56,
        'Dur-Abi-esuh': 57,
        'Ugarit': 58,
        'mod. Diqdiqqah': 59,
        'Tarbisu': 60,
        'Lagash': 61,
        'Kisurra': 62,
        'Elammu': 63,
        'Du-Enlila': 64,
        'Kutha': 65,
        'mod. Umm el-Hafriyat': 66,
        'Dur-Sarrukin': 67,
        'Bad-Tibira': 68,
        'Bit-zerija': 69,
        'Kilizu': 70,
        'mod. Pasargadae': 71,
        'Abdju': 72,
        'Surmes': 73,
        'mod. Qatibat': 74,
        'Tigunanum': 75,
        'mod. Tell al-Lahm': 76,
        'mod. Mesopotamia': 77,
        'Subat-Enlil': 78,
        'mod. Konar Sandal': 79,
        'Gissi': 80,
        'Agamatanu': 81,
        'Aqa': 82,
        'Kapri-sa-naqidati': 83,
        'Esura': 84,
        'Nahalla': 85,
        'Bit-Sahtu': 86,
        'mod. Sepphoris': 87,
        'Dusabar': 88,
        'mod. Tell Sifr': 89,
        'Nasir': 90,
        'Kumu': 91,
        'Kazallu': 92,
        'Kapru': 93,
        'Hurruba': 94,
        'mod. Deh-e-no, Iran': 95,
        "mod. Za'aleh": 96,
        'mod. Tepe Farukhabad': 97,
        'Hursagkalama': 98,
        'Carchemish': 99,
        'mod. Ben Shemen, Israel': 100,
        'Kutalla': 101,
        'Der': 102,
        'Imgur-Enlil': 103,
        'mod. Hillah': 104,
        'mod. Uhudu': 105,
        'mod. Mahmudiyah': 106,
        'Terqa': 107,
        'Arrapha': 108,
        'mod. Tell en-Nasbeh': 109,
        'mod. Kalah Shergat': 110,
        'Kar-Nabu': 111,
        'Harran': 112,
        'mod. Til-Buri': 113,
        'Shuruppak': 114,
        'mod. Abu Salabikh': 115,
        "Ma'allanate": 116,
        'Kar-Mullissu': 117,
        'mod. Naqs-i-Rustam': 118
    }

    # Several raw genre labels (e.g. the "?" variants and combined labels)
    # deliberately share the index of another label.
    GENRE_INDICES = {
        'Administrative': 1,
        'Letter': 2,
        'Legal': 3,
        'Royal/Monumental': 4,
        'Literary': 5,
        'Lexical': 6,
        'Omen': 7,
        'uncertain': 8,
        'Administrative ?': 1,
        'School': 9,
        'Mathematical': 10,
        'Prayer/Incantation': 11,
        'Lexical ?': 6,
        'Scientific': 12,
        'Ritual': 13,
        'Letter ?': 2,
        'Literary ?': 5,
        'fake (modern)': 14,
        'Lexical; Literary': 6,
        'Legal ?': 3,
        'Literary; Mathematical': 5,
        'Astronomical': 15,
        'Lexical; Mathematical': 6,
        'School ?': 9,
        'Mathematical ?': 10,
        'Royal/Monumental ?': 4,
        'Private/Votive': 16,
        'fake (modern) ?': 14,
        'Other (see subgenre)': 8,
        'Historical': 2,
        'Literary; Lexical': 5,
        'Lexical; Literary; Mathematical': 6,
        'Literary; Administrative': 5,
        'Literary; Letter': 5,
        'Scientific ?': 12,
        'Royal/Monumental; Literary': 4,
        'Private/Votive ?': 16,
        'School; Literary': 9,
        'Prayer/Incantation ?': 11,
        'Ritual ?': 13,
        'Lexical; School': 6
    }

    def __init__(self, CATALOGUE_FN='output/cdli_catalogue_data.csv', IMG_DIR='output/images', IDS=None, mask=False):
        """Index the PNG files in ``IMG_DIR`` and load the label mappings.

        Args:
            CATALOGUE_FN: CSV file with the columns ``id_text``, ``era``,
                ``period_normed``, ``provenience_normed`` and ``genre``.
            IMG_DIR: Directory searched (non-recursively) for ``*.png``.
            IDS: Optional iterable of IDs. When given, only images whose ID
                is contained in it are kept.
            mask: When true, ``__getitem__`` returns a thresholded 0/1 image
                instead of the blurred grey-scale image.
        """
        df = pd.read_csv(
            CATALOGUE_FN, usecols=['id_text', 'era', 'period_normed', 'provenience_normed', 'genre'], dtype={'id_text': object}
        ).dropna(subset=['era'])

        # Catalogue IDs are zero-padded to six characters before being used
        # as lookup keys.
        df['id_text'] = df['id_text'].apply(pad_zeros)

        # NOTE(review): rows whose period is not a PERIOD_INDICES key are
        # dropped entirely, so their genre and provenience are lost as well
        # and fall back to the defaults - confirm this is intended.
        df = df[df['period_normed'].isin(TabletPeriodDataset.PERIOD_INDICES.keys())]

        # Build the column -> {id: value} mapping once and pick columns from it.
        by_column = df.set_index('id_text').to_dict()
        self.id2period = by_column['period_normed']
        self.id2provenience = by_column['provenience_normed']
        self.id2genre = by_column['genre']
        # Kept as an independent copy for callers that read ``.genre``.
        self.genre = dict(self.id2genre)

        self.img_fns = glob(os.path.join(IMG_DIR, '*.png'))
        self.IDS = [self._id_from_path(fn) for fn in self.img_fns]

        if IDS is not None:
            print(f'Filtering {len(self.IDS)} IDS down to provided {len(IDS)}...')
            wanted = set(IDS)
            # Filter file names and IDs together so they stay aligned.
            kept = [(fn, ID) for fn, ID in zip(self.img_fns, self.IDS) if ID in wanted]
            self.img_fns = [fn for fn, _ in kept]
            self.IDS = [ID for _, ID in kept]

        self.mask = mask

    @staticmethod
    def _id_from_path(fn):
        """Return the file name of ``fn`` without directory and extension."""
        # splitext removes exactly one extension; str.rstrip('.png') would
        # strip any trailing '.', 'p', 'n' or 'g' characters of the ID too.
        return os.path.splitext(os.path.basename(fn))[0]

    def _labels_for(self, ID):
        """Return ``(period_index, genre_index, provenience_index)`` for ``ID``.

        An ID missing from a mapping, or a label that is not a key of the
        corresponding index table (including NaN), yields the index of
        ``'other'``, ``'uncertain'`` or ``'unknown'`` respectively.
        """
        period_idx = self.PERIOD_INDICES.get(
            self.id2period.get(ID), self.PERIOD_INDICES['other'])
        genre_idx = self.GENRE_INDICES.get(
            self.id2genre.get(ID), self.GENRE_INDICES['uncertain'])
        provenience_idx = self.PROVENIENCE_INDICES.get(
            self.id2provenience.get(ID), self.PROVENIENCE_INDICES['unknown'])
        return period_idx, genre_idx, provenience_idx

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.IDS)

    def __getitem__(self, idx):
        """Return ``(ID, image, period_idx, genre_idx, provenience_idx)``."""
        fn = self.img_fns[idx]
        ID = self.IDS[idx]
        period_idx, genre_idx, provenience_idx = self._labels_for(ID)

        # Close the file deterministically instead of waiting for garbage
        # collection; the pixel data is copied into the array inside the block.
        with Image.open(fn) as im:
            img = np.asarray(im)

        img = img.astype(np.float32) / 255
        img = cv2.GaussianBlur(img, (11, 11), 0)
        if self.mask:
            img = (img > 0.125).astype(np.float32)  ### 0.25 was great for most besides the really dark ones

        return ID, img, period_idx, genre_idx, provenience_idx