| | import os
|
| | import json
|
| | import numpy as np
|
| | from PIL import Image
|
| | from pathlib import Path
|
| | import shutil
|
| | from PIL import Image, ImageFont, ImageDraw
|
| | import cv2 as cv
|
| | import os
|
| | import shutil
|
| | from pathlib import Path
|
| | import os
|
| | import json
|
| |
|
def get_data_and_annots():
    """
    Load image file paths and their corresponding annotations from the PubLayNet dataset.

    Only images that are listed in the annotation file AND present as files
    directly inside the training image directory are returned.

    Returns:
        images (dict): Maps each image ID to a dictionary with two keys:
            'file_name' (relative path of the image on disk) and
            'annotations' (list of the annotation dicts whose 'image_id'
            matches that image).
        data (dict): The entire JSON data loaded from the annotations file

    """
    label_file = 'data/raw/label/publaynet/train.json'
    image_dir = 'data/raw/train/publaynet/train'

    with open(label_file, encoding='utf-8') as handle:
        data = json.load(handle)

    # Take only the files directly inside image_dir: the stored path below is
    # always image_dir + '/' + name, so names found in sub-directories would
    # point at files that do not exist. A set keeps the membership test O(1);
    # a missing directory simply yields no images.
    available = set(next(os.walk(image_dir), (None, None, []))[2])

    images = {}
    for image in data['images']:
        if image['file_name'] in available:
            images[image['id']] = {
                'file_name': image_dir + '/' + image['file_name'],
                'annotations': [],
            }

    # Attach every annotation to its image; annotations of images that are
    # not on disk are ignored.
    for ann in data['annotations']:
        entry = images.get(ann['image_id'])
        if entry is not None:
            entry['annotations'].append(ann)

    return images, data
|
| |
|
| |
|
| |
|
def write_file(image_id, inside, filename, content, check_set):
    """
    Write one piece of content to a file, either starting it or extending it.

    Inputs:
        image_id (str): Identifier recorded in check_set when the file is started.
        inside (bool): True appends a newline followed by the content to the
            existing file; False records image_id in check_set and
            (over)writes the file with the content.
        filename (str): The path to the file.
        content (str): The content to write to the file.
        check_set (set): Set of image IDs whose file has already been started;
            mutated in place when inside is False.

    Returns:
        None

    """
    if not inside:
        # First write for this image: remember it, then start the file fresh.
        check_set.add(image_id)
        with open(filename, "w") as out:
            out.write(content)
        return

    # Follow-up write: new content goes on its own line.
    with open(filename, "a") as out:
        out.writelines(("\n", content))
|
| |
|
def get_bb_shape(bboxe, img):
    """
    Measure the image region covered by a bounding box.

    The box is cut out of the image with an inclusive far edge (one extra
    row and column past x + width / y + height), so the result is limited
    by the image borders.

    Inputs:
        bboxe (list): Bounding box coordinates [x, y, width, height].
        img (numpy.ndarray): The image array.

    Returns:
        tuple: The shape (height, width) of the cropped region

    """
    x, y, width, height = bboxe[0], bboxe[1], bboxe[2], bboxe[3]

    # The two extremes along each axis; min/max keeps the crop well ordered
    # even when width or height is negative.
    xs = (x, x + width)
    ys = (y, y + height)

    row_start, row_stop = int(min(ys)), int(max(ys)) + 1
    col_start, col_stop = int(min(xs)), int(max(xs)) + 1

    region = img[row_start:row_stop, col_start:col_stop]
    return region.shape[:2]
|
| |
|
def coco_to_yolo(x1, y1, w, h, image_w, image_h):
    """
    Convert a COCO bounding box (top-left corner + size, in pixels) into the
    YOLO representation (box centre + size, relative to the image size).

    Inputs:
        x1 (float): Top-left x coordinate.
        y1 (float): Top-left y coordinate.
        w (float): Width of the bounding box.
        h (float): Height of the bounding box.
        image_w (int): Width of the image.
        image_h (int): Height of the image.

    Returns:
        list: YOLO format bounding box [x_center, y_center, width, height]

    """
    # Centre = corner + half the size, divided by the image size.
    x_center = (2 * x1 + w) / (2 * image_w)
    y_center = (2 * y1 + h) / (2 * image_h)
    rel_width = w / image_w
    rel_height = h / image_h
    return [x_center, y_center, rel_width, rel_height]
|
| |
|
def create_directory(path):
    """
    Creates an empty directory, deleting it first if it already exists.

    NOTE: a second function with the same name is defined later in this
    module; at import time the later definition replaces this one.

    Inputs:
        path (str): The path to the directory

    Returns:
        None

    """
    target = Path(path)
    if target.is_dir():
        # Start from a clean slate: drop the old directory with its contents.
        shutil.rmtree(target)
    # parents=True so a missing intermediate directory (e.g. a fresh
    # checkout without data/processed) no longer raises FileNotFoundError.
    target.mkdir(parents=True)
|
| |
|
def generate_yolo_labels(images):
    """
    Generates YOLO format labels from the given images and annotations.

    The output directory <cwd>/data/processed/yolo is recreated, then one
    '<image stem>.txt' file is written per image that has at least one
    annotation. Each line has the form
    '<category_id - 1> <x_center> <y_center> <width> <height>'.

    Inputs:
        images (dict): Dictionary containing image data and annotations, as
            returned by get_data_and_annots (each value has 'file_name' and
            'annotations' keys).

    Returns:
        None

    """
    output_dir = os.getcwd() + '/data/processed/yolo'
    create_directory(output_dir)

    for key in images:
        record = images[key]
        annotations = record['annotations']
        if not annotations:
            # No annotations -> no label file.
            continue

        image_path = record['file_name']
        # Decode the image once per image instead of three times per box.
        img = cv.imread(image_path)
        if img is None:
            # cv.imread signals an unreadable/missing file by returning None.
            print(f'Skipping unreadable image: {image_path}')
            continue
        image_h, image_w = img.shape[:2]

        lines = []
        for ann in annotations:
            bbox = ann['bbox']
            # NOTE(review): width/height come from the cropped region (clipped
            # to the image, inclusive far edge) rather than from bbox[2] and
            # bbox[3] directly - kept as-is, confirm this is intended.
            box_h, box_w = get_bb_shape(bbox, img)
            yolo_bbox = coco_to_yolo(bbox[0], bbox[1], box_w, box_h, image_w, image_h)
            # Read the class id from the annotation itself so ids with more
            # than one digit are handled correctly.
            class_id = ann['category_id'] - 1
            lines.append(' '.join(map(str, [class_id, *yolo_bbox])))

        stem = image_path.split('/')[-1].split(".")[0]
        with open(output_dir + '/' + stem + '.txt', 'w') as label_file:
            label_file.write('\n'.join(lines))
|
| |
|
def delete_additional_images(old_train_path, temp_images_path, yolo_path):
    """
    Separate the images that have YOLO labels from those that do not.

    The temporary directory is recreated empty, then every image in the old
    training path that has a matching '<stem>.txt' file in the YOLO label
    directory is moved into it. Images without a label file are left where
    they are; nothing is deleted from the old training path.

    Inputs:
        old_train_path (str): Path to the original training images.
        temp_images_path (str): Path to store the temporary images.
        yolo_path (str): Path to the YOLO label files.

    Returns:
        None

    """
    train_files = next(os.walk(old_train_path), (None, None, []))[2]
    # Set lookup: one hash probe per image instead of scanning the whole
    # label list for every image.
    label_names = set(next(os.walk(yolo_path), (None, None, []))[2])

    staging = Path(temp_images_path)
    if staging.is_dir():
        shutil.rmtree(staging)
    # parents=True so a missing intermediate directory does not abort the run.
    staging.mkdir(parents=True)

    for img in train_files:
        # The stem is everything before the FIRST dot, matching how the label
        # file names are derived elsewhere in this module.
        stem = img.split(".")[0]
        if f"{stem}.txt" in label_names:
            shutil.move(f"{old_train_path}/{img}", f"{temp_images_path}/{img}")
|
| |
|
def split_data(temp_images_path):
    """
    Split the dataset into training and validation sets.

    The image names are sorted, the slice from 10% to 90% of the list
    (roughly 80% of the images) becomes the training set and the remaining
    names (the first and last ~10%) become the validation set. The
    training/validation directories under <cwd>/data/processed are recreated
    and the images are moved into their 'images' sub-directories.

    Inputs:
        temp_images_path (str): Path to the temporary images.

    Returns:
        list: List of validation image names without file extensions

    """
    # Sorted so the split is reproducible; directory listing order is not
    # guaranteed by the operating system.
    image_names = sorted(next(os.walk(temp_images_path), (None, None, []))[2])
    total = len(image_names)

    train = image_names[int(total * .1):int(total * .90)]
    train_lookup = set(train)
    # Everything that is not a training image, in a stable order.
    validation = [name for name in image_names if name not in train_lookup]

    processed = os.getcwd() + '/data/processed'
    create_directory(processed + '/training')
    create_directory(processed + '/validation')
    create_directory(processed + '/training/images/')
    create_directory(processed + '/validation/images/')

    for train_img in train:
        shutil.move(f'{temp_images_path}/{train_img}', processed + '/training/images/')

    for valid_img in validation:
        shutil.move(f'{temp_images_path}/{valid_img}', processed + '/validation/images/')

    # Names are cut at the first dot, matching the label file naming.
    return [name.split('.')[0] for name in validation]
|
| |
|
def create_directory(path):
    """
    Create a new, empty directory, removing it first if it already exists.

    NOTE: this redefines a function of the same name declared earlier in
    this module; being the later definition, this is the one callers get.

    Inputs:
        path (str): Path to the directory to be created.

    Returns:
        None

    """
    target = Path(path)
    if target.is_dir():
        # Remove the previous directory together with everything inside it.
        shutil.rmtree(target)
    # parents=True so a missing intermediate directory (e.g. a fresh
    # checkout without data/processed) no longer raises FileNotFoundError.
    target.mkdir(parents=True)
|
| |
|
def get_labels(yolo_path, valid_without_extension):
    """
    Move YOLO label files to their respective training and validation directories.

    The 'labels' directories under <cwd>/data/processed/training and
    <cwd>/data/processed/validation are recreated. Each label file whose
    name (up to the first dot) is listed in valid_without_extension goes to
    the validation labels; every other label file goes to the training labels.

    Inputs:
        yolo_path (str): Path to the YOLO label files.
        valid_without_extension (list): List of validation image names without file extensions.

    Returns:
        None

    """
    processed = os.getcwd() + '/data/processed'
    create_directory(processed + '/training/labels')
    create_directory(processed + '/validation/labels')

    # Build the lookup once; testing against the list would rescan it for
    # every label file.
    validation_names = set(valid_without_extension)

    label_files = next(os.walk(yolo_path), (None, None, []))[2]
    for lab in label_files:
        subset = 'validation' if lab.split(".")[0] in validation_names else 'training'
        shutil.move(f"{yolo_path}/{lab}", processed + f'/{subset}/labels/{lab}')
|
| |
|
def final_preparation(old_train_path, temp_images_path, yolo_path):
    """
    Run the last stage of dataset preparation.

    Steps, in order: move the labelled images into the temporary directory,
    split them into training/validation image folders, remove the temporary
    directory, and finally distribute the label files to match the split.

    Inputs:
        old_train_path (str): Path to the original training images.
        temp_images_path (str): Path to store the temporary images.
        yolo_path (str): Path to the YOLO label files.

    Returns:
        Whatever get_labels returns for the final step.

    """
    delete_additional_images(old_train_path, temp_images_path, yolo_path)
    validation_stems = split_data(temp_images_path)

    # The staging directory has served its purpose once the split is done.
    staging = Path(temp_images_path)
    if staging.is_dir():
        shutil.rmtree(staging)

    return get_labels(yolo_path, validation_stems)
|
| |
|
def annotate_tables(directory):
    """
    Crop the first labelled table out of each listed image.

    The output directory <cwd>/data/processed/tables is recreated. For every
    file in `directory`, the image with the same name (up to the first dot)
    is looked up in the training images, then in the validation images. Its
    YOLO label file is parsed and the first box of class 3 is cropped from
    the image and saved as '<name>.jpg' in the tables directory. Files whose
    image is in neither split, or whose labels contain no class-3 box, are
    skipped.

    Inputs:
        directory (str): Path to the directory containing images to be processed.

    Returns:
        None

    """
    # Class id used for tables in the label files (category_id - 1).
    table_class = 3
    processed = os.getcwd() + '/data/processed'

    tables_dir = Path(processed + '/tables')
    if tables_dir.is_dir():
        shutil.rmtree(tables_dir)
    tables_dir.mkdir(parents=True)

    for filename in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, filename)):
            continue

        img_name = filename.split('.')[0]
        print(f'\f {img_name}')

        # Training takes precedence over validation, as before.
        subset = next(
            (name for name in ('training', 'validation')
             if os.path.isfile(f'{processed}/{name}/images/{img_name}.jpg')),
            None,
        )
        if subset is None:
            # Image is in neither split: nothing to crop for this file.
            continue

        with open(f'{processed}/{subset}/labels/{img_name}.txt', 'r') as f:
            label_lines = f.read().splitlines()

        with Image.open(f'{processed}/{subset}/images/{img_name}.jpg') as original_image:
            img_w, img_h = original_image.size

            for line in label_lines:
                # Label line: '<class> <x_center> <y_center> <width> <height>',
                # with the four box values relative to the image size.
                fields = line.split()
                if len(fields) != 5 or int(float(fields[0])) != table_class:
                    continue

                x_c, y_c, box_w, box_h = (float(value) for value in fields[1:])
                x1 = max(0, int((x_c - box_w / 2) * img_w))
                y1 = max(0, int((y_c - box_h / 2) * img_h))
                x2 = min(img_w, int((x_c + box_w / 2) * img_w))
                y2 = min(img_h, int((y_c + box_h / 2) * img_h))
                if x2 <= x1 or y2 <= y1:
                    # Degenerate box: nothing to crop.
                    continue

                # No interactive preview here: it would open one viewer
                # window per processed file.
                table_image = original_image.crop((x1, y1, x2, y2))
                table_image.save(f'{processed}/tables/{img_name}.jpg')
                # One output file per image, so only the first table is kept.
                break
|
| |
|
| |
|
if __name__ == '__main__':
    # Full pipeline: load annotations, write YOLO labels, build the
    # training/validation folders, then crop the tables.
    project_root = os.getcwd()

    images, _ = get_data_and_annots()
    generate_yolo_labels(images)

    # final_preparation takes exactly three paths. They are built with
    # os.path.join so they resolve on every platform, not only where '\'
    # is the path separator.
    final_preparation(
        os.path.join(project_root, 'data', 'raw', 'train', 'publaynet', 'train'),
        os.path.join(project_root, 'data', 'processed', 'images'),
        project_root + '/data/processed/yolo',
    )

    annotate_tables(project_root + '/data/processed/hand_labeled_tables/hand_labeled_tables')