import os
# Machine-specific default location of the IAM dataset directory.
_DEFAULT_IAM_PATH = r"C:\Users\bramv\Documents\Werk\Research\Unimore\datasets\IAM"


def _load_form_authors(forms_path):
    """Parse ``forms.txt`` into a ``{form_name: author_id}`` mapping.

    Lines starting with ``#`` and blank lines are ignored. On every other
    line the first whitespace-separated field is taken as the form name and
    the second as the integer author id.

    Raises:
        AssertionError: if a form name appears twice with different author ids.
        ValueError: if a line has fewer than two fields or a non-integer id.
    """
    form_to_author = {}
    with open(forms_path, 'r') as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            form, author_field, *_ = line.split()
            author_id = int(author_field)
            # Compare as integers: the mapping stores ints, so comparing the
            # stored value against the raw string field could never succeed.
            assert form_to_author.get(form, author_id) == author_id, (
                f"form {form!r} is listed with conflicting author ids"
            )
            form_to_author[form] = author_id
    return form_to_author


def _read_split_authors(split_path, form_to_author):
    """Return the set of author ids of all forms referenced by a split file.

    Every non-blank line is reduced to its form name by dropping the last
    ``-``-separated component (e.g. ``a01-000u-00`` -> ``a01-000u``).

    Raises:
        KeyError: if a referenced form is missing from ``form_to_author``.
    """
    authors = set()
    with open(split_path, 'r') as f:
        for line in f:
            entry = line.rstrip()
            if not entry:
                continue
            form = entry.rpartition("-")[0]
            authors.add(form_to_author[form])
    return authors


def _read_htg_authors(htg_path):
    """Return the set of integer author ids found in an HTG ground-truth file.

    The author id is read from the first comma-separated field of each
    non-blank line.
    """
    with open(htg_path, 'r') as f:
        return {int(line.split(",")[0]) for line in f if line.strip()}


def test_split(iam_path=_DEFAULT_IAM_PATH):
    """Print how the HTG author splits overlap with the original IAM splits.

    For each original IAM split file the number of distinct authors is
    printed. For each HTG split file the number of distinct authors is
    printed, followed by the size of its author intersection with every
    original split.

    Args:
        iam_path: Directory containing ``forms.txt``, the four original split
            files and the two HTG split files. Defaults to the original
            hard-coded location, so calling with no arguments still works.

    Raises:
        FileNotFoundError: if any of the expected files is missing.
    """
    original_set_names = ["trainset.txt", "validationset1.txt", "validationset2.txt", "testset.txt"]
    htg_set_names = ["gan.iam.tr_va.gt.filter27", "gan.iam.test.gt.filter27"]

    # forms.txt does not depend on the split being processed, so parse it once
    # instead of once per split file.
    form_to_author = _load_form_authors(os.path.join(iam_path, "forms.txt"))

    print("ORIGINAL IAM")
    print("---------------------")

    original_set_ids = []
    for set_name in original_set_names:
        set_authors = _read_split_authors(os.path.join(iam_path, set_name), form_to_author)
        original_set_ids.append(set_authors)
        print(f"{set_name} count: {len(set_authors)}")

    print("\n\nHTG IAM")
    print("---------------------")

    for set_name in htg_set_names:
        set_authors = _read_htg_authors(os.path.join(iam_path, set_name))
        print(f"{set_name} count: {len(set_authors)}")
        for name, original_set in zip(original_set_names, original_set_ids):
            intr = set_authors.intersection(original_set)
            print(f"\t intersection with {name}: {len(intr)}")
# Run the split comparison when this file is executed directly as a script.
if __name__ == "__main__":
    test_split()