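"""Prepare handwriting datasets (IAM, CVL, RIMES) for training.

Resizes word/line images, packs them into LMDB databases (createDataset),
and builds a dictionary of word images keyed by IAM writer id.
"""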
import os
import io
import glob
import html
import math
import pickle
import random
import shutil

import cv2
import lmdb
import numpy as np
import xmltodict
from PIL import Image
from sklearn.decomposition import PCA
from tqdm import tqdm

def checkImageIsValid(imageBin):
    """Return True if the binary buffer decodes to a non-empty image."""
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)  # np.fromstring is deprecated
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:  # undecodable buffer
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0


def writeCache(env, cache):
    """Write a dict of key/value pairs to an LMDB environment in one transaction."""
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            if isinstance(k, str):
                k = k.encode()
            if isinstance(v, str):
                v = v.encode()
            txn.put(k, v)
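
# Hedged sketch (not part of the original script): how a consumer might read
# one sample back from an LMDB produced by createDataset below. The
# 'image-%09d' / 'label-%09d' / 'num-samples' key scheme is the one
# createDataset actually writes.
def readSample(lmdb_path, idx):
    env = lmdb.open(lmdb_path, readonly=True, lock=False)
    with env.begin() as txn:
        nSamples = int(txn.get('num-samples'.encode()))
        assert 1 <= idx <= nSamples, 'keys are 1-indexed'
        imageBin = txn.get(('image-%09d' % idx).encode())
        labelBin = txn.get(('label-%09d' % idx).encode())  # absent if unlabeled
    env.close()
    img = Image.open(io.BytesIO(imageBin))
    return img, labelBin.decode() if labelBin is not None else None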

def find_rot_angle(idx_letters):
    """Estimate the dominant axis of a set of (x, y) ink coordinates via PCA.

    Returns the angle of the first principal component, measured from the
    vertical axis, in degrees.
    """
    idx_letters = np.array(idx_letters).transpose()
    pca = PCA(n_components=2)
    pca.fit(idx_letters)
    comp = pca.components_
    angle = math.atan2(comp[0][0], comp[0][1])  # atan2 avoids division by zero
    return math.degrees(angle)
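
# find_rot_angle is not called elsewhere in this file. A hedged usage sketch,
# assuming a PIL image `im` where ink pixels are dark:
#
#     ys, xs = np.nonzero(np.array(im.convert('L')) < 128)
#     angle = find_rot_angle([xs, ys])
#     im = im.rotate(angle, expand=True, fillcolor=(255, 255, 255))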

def read_data_from_folder(folder_path):
    """Collect image paths and labels from a folder of '<label>_*' image files."""
    image_path_list = []
    label_list = []
    pics = os.listdir(folder_path)
    pics.sort(key=lambda i: len(i))
    for pic in pics:
        image_path_list.append(os.path.join(folder_path, pic))
        label_list.append(pic.split('_')[0])
    return image_path_list, label_list

def read_data_from_file(file_path):
    """Read alternating image-path / label lines from a text file."""
    image_path_list = []
    label_list = []
    with open(file_path) as f:
        while True:
            line1 = f.readline()
            line2 = f.readline()
            if not line1 or not line2:
                break
            image_path_list.append(line1.strip('\r\n'))
            label_list.append(line2.strip('\r\n'))
    return image_path_list, label_list

def show_demo(demo_number, image_path_list, label_list):
    """Print a few path/label pairs so the user can sanity-check the data."""
    print('\nShowing a few samples to catch errors before creating the LMDB data')
    print('The first line is the path to the image and the second line is its label')
    for i in range(demo_number):
        print('image: %s\nlabel: %s\n' % (image_path_list[i], label_list[i]))

def create_img_label_list(top_dir, dataset, mode, words, author_number, remove_punc):
    """Build (image_path, label) lists for the CVL, IAM or RIMES dataset."""
    root_dir = os.path.join(top_dir, dataset)
    # Boolean multiplication appends a path component only when the flag holds.
    output_dir = root_dir + (dataset == 'IAM') * ('/words' * words + '/lines' * (not words))
    image_path_list, label_list = [], []
    author_id = 'None'
    mode = 'all'  # NOTE: deliberately overrides the mode argument
    if dataset == 'CVL':
        root_dir = os.path.join(root_dir, 'cvl-database-1-1')
        if words:
            images_name = 'words'
        else:
            images_name = 'lines'
        if mode == 'tr' or mode == 'val':
            mode_dir = ['trainset']
        elif mode == 'te':
            mode_dir = ['testset']
        elif mode == 'all':
            mode_dir = ['testset', 'trainset']
        idx = 1
        for mod in mode_dir:
            images_dir = os.path.join(root_dir, mod, images_name)
            for path, subdirs, files in os.walk(images_dir):
                for name in files:
                    if (mode == 'tr' and idx >= 10000) or (
                            mode == 'val' and idx < 10000) or mode in ('te', 'all', 'tr_3te'):
                        if os.path.splitext(name)[0].split('-')[1] == '6':
                            continue
                        label = os.path.splitext(name)[0].split('-')[-1]
                        imagePath = os.path.join(path, name)
                        label_list.append(label)
                        image_path_list.append(imagePath)
                        idx += 1

    elif dataset == 'IAM':
        labels_name = 'original'
        if mode == 'all':
            mode = ['te', 'va1', 'va2', 'tr']
        elif mode == 'valtest':
            mode = ['te', 'va1', 'va2']
        else:
            mode = [mode]
        if words:
            images_name = 'wordImages'
        else:
            images_name = 'lineImages'
        images_dir = os.path.join(root_dir, images_name)
        labels_dir = os.path.join(root_dir, labels_name)
        full_ann_files = []
        im_dirs = []
        image_path_list, label_list = [], []
        for mod in mode:
            part_file = os.path.join(root_dir, 'original_partition', mod + '.lst')
            with open(part_file) as fp:
                for line in fp:
                    name = line.split('-')
                    # Take each form once (entries whose trailing line index is 0).
                    if int(name[-1][:-1]) == 0:
                        anno_file = os.path.join(labels_dir, '-'.join(name[:2]) + '.xml')
                        full_ann_files.append(anno_file)
                        im_dir = os.path.join(images_dir, name[0], '-'.join(name[:2]))
                        im_dirs.append(im_dir)

        if author_number >= 0:
            full_ann_files = [full_ann_files[author_number]]
            im_dirs = [im_dirs[author_number]]
            author_id = im_dirs[0].split('/')[-1]

        labels_to_skip = ['.', '', ',', '"', "'", '(', ')', ':', ';', '!']
        for i, anno_file in enumerate(full_ann_files):
            with open(anno_file) as f:
                try:
                    annotation_content = xmltodict.parse(f.read())
                    lines = annotation_content['form']['handwritten-part']['line']
                    if words:
                        lines_list = []
                        for j in range(len(lines)):
                            lines_list.extend(lines[j]['word'])
                        lines = lines_list
                except Exception:
                    print('%s is not decodable' % anno_file)
                    continue
            for line in lines:
                try:
                    label = html.unescape(line['@text'])
                except (KeyError, TypeError):
                    continue
                if remove_punc and label in labels_to_skip:
                    continue
                line_id = line['@id']
                imagePath = os.path.join(im_dirs[i], line_id + '.png')
                image_path_list.append(imagePath)
                label_list.append(label)

    elif dataset == 'RIMES':
        if mode == 'tr':
            images_dir = os.path.join(root_dir, 'orig', 'training_WR')
            gt_file = os.path.join(root_dir, 'orig', 'groundtruth_training_icdar2011.txt')
        elif mode == 'te':
            images_dir = os.path.join(root_dir, 'orig', 'testdataset_ICDAR')
            gt_file = os.path.join(root_dir, 'orig', 'ground_truth_test_icdar2011.txt')
        elif mode == 'val':
            images_dir = os.path.join(root_dir, 'orig', 'valdataset_ICDAR')
            gt_file = os.path.join(root_dir, 'orig', 'ground_truth_validation_icdar2011.txt')
        with open(gt_file, 'r') as f:
            lines = f.readlines()
        image_path_list = [os.path.join(images_dir, line.split(' ')[0])
                           for line in lines if len(line.split(' ')) > 1]
        label_list = [line.split(' ')[1][:-1] for line in lines if len(line.split(' ')) > 1]

    return image_path_list, label_list, output_dir, author_id

def createDataset(IMG_DATA, image_path_list, label_list, outputPath, mode, author_id, remove_punc, resize,
                  imgH, init_gap, h_gap, charminW, charmaxW, discard_wide, discard_narr, labeled):
    """Resize each image, collect it per author in IMG_DATA and write it to an LMDB."""
    assert len(image_path_list) == len(label_list)
    nSamples = len(image_path_list)

    # The output directory name encodes the preprocessing options; each
    # (flag) * 'string' term contributes its string only when the flag is true.
    outputPath = outputPath + (resize == 'charResize') * ('/h%schar%sto%s/' % (imgH, charminW, charmaxW)) \
        + (resize == 'keepRatio') * ('/h%s/' % imgH) \
        + (resize == 'noResize') * '/noResize/' \
        + (author_id != 'None') * ('single_authors/' + author_id + '/') \
        + mode + (resize != 'noResize') * (('_initGap%s' % init_gap) * (init_gap > 0)
                                           + ('_hGap%s' % h_gap) * (h_gap > 0)
                                           + '_NoDiscard_wide' * (not discard_wide)
                                           + '_NoDiscard_narr' * (not discard_narr)) \
        + '_unlabeled' * (not labeled) \
        + (('IAM' in outputPath) and remove_punc) * '_removePunc'
    print(outputPath)
    if os.path.exists(outputPath):
        shutil.rmtree(outputPath)
    os.makedirs(outputPath)
    env = lmdb.open(outputPath, map_size=1099511627776)  # 1 TiB virtual map size
    cache = {}
    cnt = 1
    discard_wide = False  # NOTE: force-disables the discard_wide argument

    for i in tqdm(range(nSamples)):
        imagePath = image_path_list[i]
        label = label_list[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        try:
            im = Image.open(imagePath)
        except Exception:
            continue
        if resize in ['charResize', 'keepRatio']:
            width, height = im.size
            new_height = imgH - (h_gap * 2)
            len_word = len(label)
            width = int(width * imgH / height)  # scale the width to the target height
            new_width = width
            if resize == 'charResize':
                # Discard (or re-sample the width of) images whose average
                # character width falls outside [charminW, charmaxW - 1].
                if (width / len_word > (charmaxW - 1)) or (width / len_word < charminW):
                    if discard_wide and width / len_word > 3 * (charmaxW - 1):
                        print('%s has a width larger than max image width' % imagePath)
                        continue
                    if discard_narr and (width / len_word) < (charminW / 3):
                        print('%s has a width smaller than min image width' % imagePath)
                        continue
                    else:
                        new_width = len_word * random.randrange(charminW, charmaxW)

            # Paste the resized word onto a white canvas, jittering the initial gap.
            im = im.resize((new_width, new_height))
            init_w = int(random.normalvariate(init_gap, init_gap / 2))
            new_im = Image.new("RGB", (new_width + init_gap, imgH), color=(255, 255, 255))
            new_im.paste(im, (abs(init_w), h_gap))
            im = new_im

        if author_id in IMG_DATA.keys():
            IMG_DATA[author_id].append({'img': im, 'label': label})
        else:
            IMG_DATA[author_id] = [{'img': im, 'label': label}]

        imgByteArr = io.BytesIO()
        im.save(imgByteArr, format='tiff')
        wordBin = imgByteArr.getvalue()
        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt

        cache[imageKey] = wordBin
        if labeled:
            cache[labelKey] = label
        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1

    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples)
    writeCache(env, cache)
    env.close()
    print('Created dataset with %d samples' % nSamples)

    return IMG_DATA

def createDict(label_list, top_dir, dataset, mode, words, remove_punc):
    """Write stratified (unique words) and non-stratified lexicon files."""
    lex_name = dataset + '_' + mode + (dataset in ['IAM', 'RIMES']) * ('_words' * words) \
        + (dataset == 'IAM') * ('_removePunc' * remove_punc)
    all_words = '-'.join(label_list).split('-')  # note: this also splits hyphenated labels
    unique_words = []
    words = []
    for x in tqdm(all_words):
        if x != '' and x != ' ':
            words.append(x)
            if x not in unique_words:
                unique_words.append(x)
    print(len(words))
    print(len(unique_words))
    with open(os.path.join(top_dir, 'Lexicon', lex_name + '_stratified.txt'), "w") as file:
        file.write("\n".join(unique_words))
    with open(os.path.join(top_dir, 'Lexicon', lex_name + '_NOTstratified.txt'), "w") as file:
        file.write("\n".join(words))

def printAlphabet(label_list):
    """Print the alphabet: every unique character that appears in the labels."""
    all_chars = ''.join(label_list)
    unique_chars = []
    for x in all_chars:
        if x not in unique_chars and len(x) == 1:
            unique_chars.append(x)
    print(''.join(unique_chars))

if __name__ == '__main__':

    TRAIN_IDX = 'gan.iam.tr_va.gt.filter27'
    TEST_IDX = 'gan.iam.test.gt.filter27'
    IAM_WORD_DATASET_PATH = '../../data/IAM/nfs/users/ext_ankan.bhunia/data/Handwritten_data/IAM/wordImages/'
    XMLS_PATH = '../../data/IAM/nfs/users/ext_ankan.bhunia/data/Handwritten_data/IAM/xmls/'

    # Map image id -> image path, and image id -> IAM writer id (from the XMLs).
    word_paths = {i.split('/')[-1][:-4]: i for i in glob.glob(IAM_WORD_DATASET_PATH + '*/*/*.png')}
    id_to_wid = {i.split('/')[-1][:-4]: xmltodict.parse(open(i).read())['form']['@writer-id']
                 for i in glob.glob(XMLS_PATH + '/**')}
    trainslist = open(TRAIN_IDX, 'r').read().splitlines()
    testslist = open(TEST_IDX, 'r').read().splitlines()

    dict_ = {'train': {}, 'test': {}}

    # Each index line has the form '<author_id>,<file_id> <transcription>'.
    for i in trainslist:
        author_id = i.split(',')[0]
        file_id, string = i.split(',')[1].split(' ')
        file_path = word_paths[file_id]
        if author_id in dict_['train']:
            dict_['train'][author_id].append({'path': file_path, 'label': string})
        else:
            dict_['train'][author_id] = [{'path': file_path, 'label': string}]

    for i in testslist:
        author_id = i.split(',')[0]
        file_id, string = i.split(',')[1].split(' ')
        file_path = word_paths[file_id]
        if author_id in dict_['test']:
            dict_['test'][author_id].append({'path': file_path, 'label': string})
        else:
            dict_['test'][author_id] = [{'path': file_path, 'label': string}]

    create_Dict = True  # NOTE: set but never used; createDict is not called below
    dataset = 'IAM'
    mode = 'all'
    labeled = True
    top_dir = '../../data/IAM/nfs/users/ext_ankan.bhunia/data/Handwritten_data/'
    words = True         # word-level (rather than line-level) images
    author_number = -1   # -1 processes all authors; >= 0 selects a single one
    remove_punc = True   # skip punctuation-only labels

    resize = 'charResize'  # 'charResize' | 'keepRatio' | 'noResize'
    imgH = 32              # target image height in pixels
    init_gap = 0           # white gap inserted before each word
    charmaxW = 17          # maximum average character width
    charminW = 16          # minimum average character width
    h_gap = 0              # white gap above and below each word
    discard_wide = True    # discard images with overly wide characters
    discard_narr = True    # discard images with overly narrow characters

    IMG_DATA = {}

    # NOTE: the upper bound vastly exceeds the number of IAM forms;
    # create_img_label_list raises IndexError once idx_auth runs past the
    # annotation list.
    for idx_auth in range(1669999):
        print('Processing ' + str(idx_auth))
        image_path_list, label_list, outputPath, author_id = create_img_label_list(
            top_dir, dataset, mode, words, idx_auth, remove_punc)
        IMG_DATA[author_id] = []
        IMG_DATA = createDataset(IMG_DATA, image_path_list, label_list, outputPath, mode, author_id,
                                 remove_punc, resize, imgH, init_gap, h_gap, charminW, charmaxW,
                                 discard_wide, discard_narr, labeled)

    # Re-key the collected images by IAM writer id.
    dict_ = {}
    for id_ in IMG_DATA.keys():
        author_id = id_to_wid[id_]
        if author_id in dict_.keys():
            dict_[author_id].extend(IMG_DATA[id_])
        else:
            dict_[author_id] = IMG_DATA[id_]
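
    # Hedged completion: `pickle` is imported and `dict_` is built above, but
    # the script ends without saving it. The output filename is an assumption;
    # adjust it to your pipeline.
    with open('IAM_writer_dict.pickle', 'wb') as f:
        pickle.dump(dict_, f)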