from ..imports import *


class ImageTextLoader:
    """Base loader that pairs image paths with token-id sequences read from JSON.

    The JSON file named by ``getDirectory()`` is loaded into ``self.data``.
    Each entry is unpacked as ``(directory, vectors)``: ``directory`` is passed
    to ``cv2.imread`` and ``vectors`` is a list of token-id lists.

    Subclasses must override ``getDirectory``. Note that this class does not
    derive from ``abc.ABC``, so ``@abstractmethod`` is not enforced at
    instantiation time.
    """

    def __init__(self):
        with open(self.getDirectory()) as f:
            self.data = json.load(f)
        # Positional index lists, one per full batch; a trailing partial
        # batch is dropped by the floor division.
        self.batches = [
            list(range(i * BATCH_SIZE, (i + 1) * BATCH_SIZE))
            for i in range(len(self.data) // BATCH_SIZE)
        ]

    @abstractmethod
    def getDirectory(self) -> str:
        """Return the path of the JSON file to load."""
        pass

    def __len__(self):
        """Return the number of entries in the loaded JSON data."""
        return len(self.data)

    def __getitem__(self, item):
        """Return ``((image, input_tokens), target_tokens)`` for entry ``item``."""
        img, inp, label = self.getData(item)
        return (img, inp), label

    def __iter__(self):
        """Yield every entry in index order, in the ``__getitem__`` format."""
        for index in range(len(self)):
            yield self[index]

    @staticmethod
    def _pad(tokens):
        """Return a new list: ``tokens`` right-padded with zeros to MAXIMUM_LENGTH.

        Sequences already at or beyond MAXIMUM_LENGTH are copied unchanged.
        """
        return tokens + [0] * (MAXIMUM_LENGTH - len(tokens))

    def getData(self, item):
        """Build the image tensor and the input/target token tensors for one entry.

        Args:
            item: Index into ``self.data``.

        Returns:
            A tuple ``(img, textTensor, shiftedTensor)`` where ``img`` is the
            resized image as a float32 tensor, ``textTensor`` holds each
            sequence without its last token (zero-padded, int32), and
            ``shiftedTensor`` holds each zero-padded sequence shifted left by
            one position with a trailing zero appended (int32).
        """
        directory, vectors = self.data[item]

        # Pad copies, never the lists stored in self.data. Padding in place
        # made a second call for the same item strip a pad zero instead of
        # the real last token, so the input changed between epochs.
        input_vectors = [self._pad(vector[:-1]) for vector in vectors]
        shifted_vectors = [self._pad(vector)[1:] + [0] for vector in vectors]

        # NOTE(review): cv2.resize takes dsize as (width, height); if
        # IDEAL_SHAPE is (height, width, channels) and not square this swaps
        # the two dimensions - confirm.
        resized = cv2.resize(cv2.imread(directory), IDEAL_SHAPE[:-1])
        img = tf.constant(resized, dtype=tf.float32)
        textTensor = tf.constant(input_vectors, dtype=tf.int32)
        shiftedTensor = tf.constant(shifted_vectors, dtype=tf.int32)
        return img, textTensor, shiftedTensor

    def getDirWithCorpus(self, item: str | int):
        """Return an image path and the vectors of every entry that shares it.

        Args:
            item: An index into ``self.data`` (the path is looked up from that
                entry) or the image path itself.

        Returns:
            ``(directory, all_vector)`` where ``all_vector`` collects the
            second field of every entry whose first field equals ``directory``.
        """
        directory = self.data[item][0] if isinstance(item, int) else item
        all_vector = [value[1] for value in self.data if value[0] == directory]
        return directory, all_vector


class TrainDataset(ImageTextLoader):
    """Loader for ``train.json``; entries are shuffled once at construction."""

    def __init__(self):
        super().__init__()
        random.shuffle(self.data)

    def getDirectory(self) -> str:
        """Return the training JSON path."""
        return 'train.json'


class TestDataset(ImageTextLoader):
    """Loader for ``test.json``; entries are shuffled once at construction."""

    def __init__(self):
        super().__init__()
        random.shuffle(self.data)

    def getDirectory(self) -> str:
        """Return the test JSON path."""
        return 'test.json'


def get_dataset(train=True):
    """Wrap a loader in a batched, prefetching ``tf.data.Dataset``.

    Args:
        train: Use ``TrainDataset`` when true, ``TestDataset`` otherwise.

    Returns:
        A dataset of ``((image, input_tokens), target_tokens)`` elements,
        batched by BATCH_SIZE and prefetched with ``tf.data.AUTOTUNE``.
    """
    # The loader class itself is the generator callable: from_generator calls
    # it to build a fresh instance and then iterates it via __iter__.
    loader_cls = TrainDataset if train else TestDataset
    # NOTE(review): TensorSpec defaults to float32, while getData builds int32
    # token tensors - confirm the float32 signature is intended.
    ds = tf.data.Dataset.from_generator(
        loader_cls,
        output_signature=(
            (
                tf.TensorSpec((None, None, 3)),
                tf.TensorSpec((None, MAXIMUM_LENGTH)),
            ),
            tf.TensorSpec((None, MAXIMUM_LENGTH)),
        ),
    )
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds