# SamaritanOCR / py3 / utils / character_set.py
# johnlockejrr's picture
# Upload 80 files
# 43bca44 verified
import sys
import json
import os
from collections import defaultdict
def load_char_set(char_set_path):
    """Load a character-set JSON file and return its two lookup tables.

    The file must contain a JSON object with an ``"idx_to_char"`` entry and a
    ``"char_to_idx"`` entry.

    Args:
        char_set_path: Path of the character-set JSON file.

    Returns:
        A tuple ``(idx_to_char, char_to_idx)``. ``idx_to_char`` is rebuilt
        with ``int`` keys; ``char_to_idx`` is returned exactly as loaded.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If either expected top-level entry is missing.
        ValueError: If the file is not valid JSON or an index key is not an
            integer literal.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which breaks on files that contain raw non-ASCII characters.
    with open(char_set_path, encoding="utf-8") as f:
        char_set = json.load(f)

    # JSON object keys are always strings, so convert the indices back to int.
    idx_to_char = {int(k): v for k, v in char_set['idx_to_char'].items()}
    return idx_to_char, char_set['char_to_idx']
def _count_characters(data_files):
    """Count how often each character occurs in the ground-truth text.

    Args:
        data_files: Paths of JSON files, each holding a list of
            ``[json_path, image_path]`` pairs. Every ``json_path`` is a JSON
            file holding a list of objects whose ``'gt'`` entry is the text.

    Returns:
        A ``defaultdict(int)`` mapping each character to its occurrence count,
        in first-seen order.
    """
    char_freq = defaultdict(int)
    for data_file in data_files:
        with open(data_file, encoding="utf-8") as f:
            paths = json.load(f)

        # Only the ground-truth path of each pair is read here.
        for json_path, _image_path in paths:
            with open(json_path, encoding="utf-8") as f:
                data = json.load(f)

            for data_item in data:
                gt = data_item.get('gt')
                # A missing or null 'gt' must be skipped before iterating;
                # iterating None would raise TypeError.
                if gt is None:
                    print("There was a None GT")
                    continue
                for c in gt:
                    char_freq[c] += 1
    return char_freq


def _build_char_set(chars):
    """Build the two lookup tables for *chars*, indexed in sorted order.

    Indices start at 1, not 0 (presumably 0 is reserved, e.g. for a blank
    label -- TODO confirm against the consumers of the character set).
    """
    ordered = sorted(chars)
    return {
        "char_to_idx": {c: i for i, c in enumerate(ordered, start=1)},
        "idx_to_char": {i: c for i, c in enumerate(ordered, start=1)},
    }


def main(argv=None):
    """Build a character-set JSON file from one or more data-list files.

    Usage: ``character_set.py DATA_FILE [DATA_FILE ...] OUTPUT_CHAR_SET``

    Prints every character with its frequency (least frequent first), then the
    size of the character set, and writes the lookup tables to the output path.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 when too few arguments were given.
    """
    argv = sys.argv if argv is None else argv
    # With fewer than two arguments, argv[-1] would be the script itself or
    # the only data file, and it would be overwritten with an empty set.
    if len(argv) < 3:
        prog = argv[0] if argv else "character_set.py"
        print("usage: {} DATA_FILE [DATA_FILE ...] OUTPUT_CHAR_SET".format(prog),
              file=sys.stderr)
        return 2

    character_set_path = argv[-1]
    char_freq = _count_characters(argv[1:-1])
    output_data = _build_char_set(char_freq)

    for k, v in sorted(char_freq.items(), key=lambda x: x[1]):
        print(k, v)
    print("Size:", len(output_data['char_to_idx']))

    # json.dump turns the int keys of idx_to_char into strings;
    # load_char_set converts them back.
    with open(character_set_path, 'w', encoding="utf-8") as outfile:
        json.dump(output_data, outfile)
    return 0


if __name__ == "__main__":
    sys.exit(main())