Spaces:
Running
Running
| import json | |
| import shutil | |
| import tempfile | |
| from io import BytesIO | |
| import cv2 | |
| import numpy as np | |
| import pytest | |
| import requests | |
| import scipy.io as sio | |
| from PIL import Image | |
| from doctr.datasets.generator.base import synthesize_text_img | |
| from doctr.io import reader | |
| from doctr.utils import geometry | |
def mock_vocab():
    """Return the character vocabulary string shared by the recognition tests."""
    part_a = "3K}7eé;5àÎYho]QwV6qU~W\"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-kçHëÀÂ2É/ûIJ'j"
    part_b = "(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l"
    return part_a + part_b
def mock_pdf(tmpdir_factory):
    """Build a two-page PDF, each page holding one synthetic text line, and return its path.

    Args:
        tmpdir_factory: pytest factory used to create the output folder

    Returns:
        str: path to the generated PDF file
    """
    pages = []
    for text, offset in (("I am a jedi!", (50, 100)), ("No, I am your father.", (40, 300))):
        crop = synthesize_text_img(text, background_color=(255, 255, 255), text_color=(0, 0, 0))
        canvas = Image.new(crop.mode, (1240, 1754), (255, 255, 255))
        canvas.paste(crop, offset)
        pages.append(canvas)
    # Save page 1 as the document, appending page 2
    fn = tmpdir_factory.mktemp("data").join("mock_pdf_file.pdf")
    pages[0].save(str(fn), "PDF", save_all=True, append_images=pages[1:])
    return str(fn)
def mock_payslip(tmpdir_factory):
    """Download a sample payslip JPEG and persist it under a temp folder.

    Returns:
        str: path to the saved image
    """
    url = "https://3.bp.blogspot.com/-Es0oHTCrVEk/UnYA-iW9rYI/AAAAAAAAAFI/hWExrXFbo9U/s1600/003.jpg"
    payload = requests.get(url).content
    dest = str(tmpdir_factory.mktemp("data").join("mock_payslip.jpeg"))
    with open(dest, "wb") as f:
        f.write(payload)
    return dest
def mock_tilted_payslip(mock_payslip, tmpdir_factory):
    """Rotate the mock payslip by 30 degrees (with canvas expansion) and save it as JPEG.

    Args:
        mock_payslip: path to the payslip image fixture
        tmpdir_factory: pytest factory used to create the output folder

    Returns:
        str: path to the rotated image
    """
    image = reader.read_img_as_numpy(mock_payslip)
    image = geometry.rotate_image(image, 30, expand=True)
    tmp_path = str(tmpdir_factory.mktemp("data").join("mock_tilted_payslip.jpg"))
    # NOTE(review): cv2.imwrite expects BGR channel order; if read_img_as_numpy
    # returns RGB the saved channels are swapped — presumably harmless for the
    # geometry-focused tests using this fixture, but worth confirming.
    cv2.imwrite(tmp_path, image)
    return tmp_path
def mock_text_box_stream():
    """Fetch a word-crop PNG from the doctr static bucket and return its raw bytes."""
    response = requests.get("https://doctr-static.mindee.com/models?id=v0.5.1/word-crop.png&src=0")
    return response.content
def mock_text_box(mock_text_box_stream, tmpdir_factory):
    """Write the word-crop bytes to disk and return the resulting file path."""
    out_path = tmpdir_factory.mktemp("data").join("mock_text_box_file.png")
    with open(out_path, "wb") as f:
        f.write(mock_text_box_stream)
    return str(out_path)
def mock_image_stream():
    """Download a sample JPEG and return its raw byte content."""
    response = requests.get("https://miro.medium.com/max/3349/1*mk1-6aYaf_Bes1E3Imhc0A.jpeg")
    return response.content
def mock_artefact_image_stream():
    """Download the dummy artefact JPEG and return its raw byte content."""
    response = requests.get("https://github.com/mindee/doctr/releases/download/v0.8.1/artefact_dummy.jpg")
    return response.content
def mock_image_path(mock_image_stream, tmpdir_factory):
    """Persist the mock image bytes to a temp folder and return the file path."""
    target = tmpdir_factory.mktemp("images").join("mock_image_file.jpeg")
    with open(target, "wb") as f:
        f.write(mock_image_stream)
    return str(target)
def mock_image_folder(mock_image_stream, tmpdir_factory):
    """Create a folder holding five identical JPEG copies of the mock image.

    Returns:
        str: path to the image folder
    """
    folder = tmpdir_factory.mktemp("images")
    for idx in range(5):
        with open(folder.join(f"mock_image_file_{idx}.jpeg"), "wb") as f:
            f.write(mock_image_stream)
    return str(folder)
def mock_detection_label(tmpdir_factory):
    """Dump a detection ground-truth JSON covering five mock images and return its path.

    Every image entry gets the same dimensions, hash and four quadrilateral polygons.
    """
    polygons = [
        [[1, 2], [1, 3], [2, 1], [2, 3]],
        [[10, 20], [10, 30], [20, 10], [20, 30]],
        [[3, 2], [3, 3], [4, 1], [4, 3]],
        [[30, 20], [30, 30], [40, 10], [40, 30]],
    ]
    labels = {
        f"mock_image_file_{idx}.jpeg": {
            "img_dimensions": (800, 600),
            "img_hash": "dummy_hash",
            "polygons": polygons,
        }
        for idx in range(5)
    }
    labels_path = tmpdir_factory.mktemp("labels").join("labels.json")
    with open(labels_path, "w") as f:
        json.dump(labels, f)
    return str(labels_path)
def mock_recognition_label(tmpdir_factory):
    """Write a word-level recognition label map to JSON and return its path."""
    words = ["I", "am", "a", "jedi", "!"]
    mapping = {f"mock_image_file_{idx}.jpeg": word for idx, word in enumerate(words)}
    label_file = tmpdir_factory.mktemp("labels").join("labels.json")
    with open(label_file, "w") as f:
        json.dump(mapping, f)
    return str(label_file)
def mock_ocrdataset(tmpdir_factory, mock_image_stream):
    """Create a mock OCR dataset: a labels.json of word-level annotations plus 3 images.

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        tuple: (image folder path, label file path) as strings
    """
    root = tmpdir_factory.mktemp("dataset")
    label_file = root.join("labels.json")
    # geometry tuples look like (x, y, w, h, angle) in relative coordinates
    # — presumably; confirm against the OCRDataset loader.
    label = {
        "mock_image_file_0.jpg": {
            "typed_words": [
                {"value": "I", "geometry": (0.2, 0.2, 0.1, 0.1, 0)},
                {"value": "am", "geometry": (0.5, 0.5, 0.1, 0.1, 0)},
            ]
        },
        "mock_image_file_1.jpg": {
            "typed_words": [
                {"value": "a", "geometry": (0.2, 0.2, 0.1, 0.1, 0)},
                {"value": "jedi", "geometry": (0.5, 0.5, 0.1, 0.1, 0)},
            ]
        },
        "mock_image_file_2.jpg": {
            "typed_words": [
                {"value": "!", "geometry": (0.2, 0.2, 0.1, 0.1, 0)},
            ]
        },
    }
    with open(label_file, "w") as f:
        json.dump(label, f)
    # Write one identical JPEG per annotated file name
    file = BytesIO(mock_image_stream)
    image_folder = tmpdir_factory.mktemp("images")
    for i in range(3):
        fn = image_folder.join(f"mock_image_file_{i}.jpg")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    return str(image_folder), str(label_file)
def mock_ic13(tmpdir_factory, mock_image_stream):
    """Create an IC13-style image/label folder pair (5 samples) and return both paths.

    Each label file `gt_mock_image_file_{i}.txt` contains the same five
    comma-separated box + word lines.
    """
    image_folder = tmpdir_factory.mktemp("images")
    label_folder = tmpdir_factory.mktemp("labels")
    gt_lines = [
        "100, 100, 200, 200, 'I'\n",
        "250, 300, 455, 678, 'am'\n",
        "321, 485, 529, 607, 'a'\n",
        "235, 121, 325, 621, 'jedi'\n",
        "468, 589, 1120, 2520, '!'",
    ]
    for idx in range(5):
        with open(label_folder.join(f"gt_mock_image_file_{idx}.txt"), "w") as f:
            f.writelines(gt_lines)
        with open(image_folder.join(f"mock_image_file_{idx}.jpg"), "wb") as f:
            f.write(mock_image_stream)
    return str(image_folder), str(label_folder)
def mock_imgur5k(tmpdir_factory, mock_image_stream):
    """Create a mock IMGUR5K layout: an annotation JSON plus 3 identical images.

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        tuple: (image folder path, annotation file path) as strings
    """
    file = BytesIO(mock_image_stream)
    image_folder = tmpdir_factory.mktemp("images")
    label_folder = tmpdir_factory.mktemp("dataset_info")
    # Mirrors the real IMGUR5K annotation schema: image index, index->annotation
    # mapping, and per-annotation word + bounding box (stringified [x, y, w, h, angle]
    # — presumably; confirm against the dataset loader).
    labels = {
        "index_id": {
            "YsaVkzl": {
                "image_url": "https://i.imgur.com/YsaVkzl.jpg",
                "image_path": "/path/to/IMGUR5K-Handwriting-Dataset/images/YsaVkzl.jpg",
                "image_hash": "993a7cbb04a7c854d1d841b065948369",
            },
            "wz3wHhN": {
                "image_url": "https://i.imgur.com/wz3wHhN.jpg",
                "image_path": "/path/to/IMGUR5K-Handwriting-Dataset/images/wz3wHhN.jpg",
                "image_hash": "9157426a98ee52f3e1e8d41fa3a99175",
            },
            "BRHSP23": {
                "image_url": "https://i.imgur.com/BRHSP23.jpg",
                "image_path": "/path/to/IMGUR5K-Handwriting-Dataset/images/BRHSP23.jpg",
                "image_hash": "aab01f7ac82ae53845b01674e9e34167",
            },
        },
        "index_to_ann_map": {
            "YsaVkzl": ["YsaVkzl_0", "YsaVkzl_1"],
            "wz3wHhN": ["wz3wHhN_0", "wz3wHhN_1"],
            "BRHSP23": ["BRHSP23_0", "BRHSP23_1"],
        },
        "ann_id": {
            "YsaVkzl_0": {"word": "I", "bounding_box": "[305.33, 850.67, 432.33, 115.33, 5.0]"},
            "YsaVkzl_1": {"word": "am", "bounding_box": "[546.67, 455.67, 345.0, 212.33, 18.67]"},
            "wz3wHhN_0": {"word": "a", "bounding_box": "[544.67, 345.67, 76.0, 222.33, 34.67]"},
            "wz3wHhN_1": {"word": "jedi", "bounding_box": "[545.0, 437.0, 76.67, 201.0, 23.33]"},
            "BRHSP23_0": {"word": "!", "bounding_box": "[555.67, 432.67, 220.0, 120.33, 7.67]"},
            "BRHSP23_1": {"word": "!", "bounding_box": "[566.0, 437.0, 76.67, 201.0, 25.33]"},
        },
    }
    label_file = label_folder.join("imgur5k_annotations.json")
    with open(label_file, "w") as f:
        json.dump(labels, f)
    # One identical JPEG per index id
    for index_id in ["YsaVkzl", "wz3wHhN", "BRHSP23"]:
        fn_i = image_folder.join(f"{index_id}.jpg")
        with open(fn_i, "wb") as f:
            f.write(file.getbuffer())
    return str(image_folder), str(label_file)
def mock_svhn_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock SVHN train archive (digitStruct.mat + 3 PNGs) and return its path.

    The .mat annotation file is downloaded pre-generated (see inline note) rather
    than written locally, then everything is packed into a .tar like the real dataset.

    Returns:
        str: path to the generated svhn_train.tar archive
    """
    root = tmpdir_factory.mktemp("datasets")
    svhn_root = root.mkdir("svhn")
    train_root = svhn_root.mkdir("train")
    file = BytesIO(mock_image_stream)
    # NOTE: hdf5storage seems not to be maintained anymore, ref.: https://github.com/frejanordsiek/hdf5storage/pull/134
    # Instead we download the mocked data which was generated using the following code:
    # ascii image names
    # first = np.array([[49], [46], [112], [110], [103]], dtype=np.int16)  # 1.png
    # second = np.array([[50], [46], [112], [110], [103]], dtype=np.int16)  # 2.png
    # third = np.array([[51], [46], [112], [110], [103]], dtype=np.int16)  # 3.png
    # labels: label is also ascii
    # label = {
    #     "height": [35, 35, 35, 35],
    #     "label": [1, 1, 3, 7],
    #     "left": [116, 128, 137, 151],
    #     "top": [27, 29, 29, 26],
    #     "width": [15, 10, 17, 17],
    # }
    # matcontent = {"digitStruct": {"name": [first, second, third], "bbox": [label, label, label]}}
    # Mock train data
    # hdf5storage.write(matcontent, filename=train_root.join("digitStruct.mat"))
    # Downloading the mocked data
    url = "https://github.com/mindee/doctr/releases/download/v0.9.0/digitStruct.mat"
    response = requests.get(url)
    with open(train_root.join("digitStruct.mat"), "wb") as f:
        f.write(response.content)
    # Images are named 1.png, 2.png, 3.png to match the .mat annotations
    for i in range(3):
        fn = train_root.join(f"{i + 1}.png")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("svhn_train.tar")
    shutil.make_archive(root.join("svhn_train"), "tar", str(svhn_root))
    return str(archive_path)
def mock_sroie_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock SROIE 2019 task-1 train archive and return its path.

    Three identical JPEGs plus matching annotation files are written, then
    the whole tree is zipped like the real distribution.
    """
    root = tmpdir_factory.mktemp("datasets")
    sroie_root = root.mkdir("sroie2019_train_task1")
    annotations_folder = sroie_root.mkdir("annotations")
    image_folder = sroie_root.mkdir("images")
    gt_lines = [
        "72, 25, 326, 25, 326, 64, 72, 64, 'I'\n",
        "50, 82, 440, 82, 440, 121, 50, 121, 'am'\n",
        "205, 121, 285, 121, 285, 139, 205, 139, 'a'\n",
        "18, 250, 440, 320, 250, 64, 85, 121, 'jedi'\n",
        "400, 112, 252, 84, 112, 84, 75, 88, '!'",
    ]
    for idx in range(3):
        with open(image_folder.join(f"{idx}.jpg"), "wb") as f:
            f.write(mock_image_stream)
        with open(annotations_folder.join(f"{idx}.txt"), "w") as f:
            f.writelines(gt_lines)
    # Pack into an archive to simulate the real data set and bypass archive extraction
    shutil.make_archive(root.join("sroie2019_train_task1"), "zip", str(sroie_root))
    return str(root.join("sroie2019_train_task1.zip"))
def mock_funsd_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock FUNSD zip archive (training images + JSON annotations).

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        str: path to the generated funsd.zip archive
    """
    root = tmpdir_factory.mktemp("datasets")
    funsd_root = root.mkdir("funsd")
    sub_dataset_root = funsd_root.mkdir("dataset")
    train_root = sub_dataset_root.mkdir("training_data")
    image_folder = train_root.mkdir("images")
    annotations_folder = train_root.mkdir("annotations")
    # Minimal FUNSD form annotation: boxes are [x_min, y_min, x_max, y_max]
    # — presumably; confirm against the FUNSD loader.
    labels = {
        "form": [
            {
                "box": [84, 109, 136, 119],
                "text": "I",
                "label": "question",
                "words": [{"box": [84, 109, 136, 119], "text": "I"}],
                "linking": [[0, 37]],
                "id": 0,
            },
            {
                "box": [85, 110, 145, 120],
                "text": "am",
                "label": "answer",
                "words": [{"box": [85, 110, 145, 120], "text": "am"}],
                "linking": [[1, 38]],
                "id": 1,
            },
            {
                "box": [86, 115, 150, 125],
                "text": "Luke",
                "label": "answer",
                "words": [{"box": [86, 115, 150, 125], "text": "Luke"}],
                "linking": [[2, 44]],
                "id": 2,
            },
        ]
    }
    file = BytesIO(mock_image_stream)
    # Same image bytes and same annotation for all three samples
    for i in range(3):
        fn_i = image_folder.join(f"{i}.png")
        with open(fn_i, "wb") as f:
            f.write(file.getbuffer())
        fn_l = annotations_folder.join(f"{i}.json")
        with open(fn_l, "w") as f:
            json.dump(labels, f)
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("funsd.zip")
    shutil.make_archive(root.join("funsd"), "zip", str(funsd_root))
    return str(archive_path)
def mock_cord_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock CORD train zip archive (receipt images + JSON annotations).

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        str: path to the generated cord_train.zip archive
    """
    root = tmpdir_factory.mktemp("datasets")
    cord_root = root.mkdir("cord_train")
    image_folder = cord_root.mkdir("image")
    annotations_folder = cord_root.mkdir("json")
    # CORD-style annotation: each valid line carries words with quad corner
    # coordinates (x1..x4 / y1..y4), a category and a group id
    labels = {
        "dontcare": [],
        "valid_line": [
            {
                "words": [
                    {
                        "quad": {
                            "x2": 270,
                            "y3": 390,
                            "x3": 270,
                            "y4": 390,
                            "x1": 256,
                            "y1": 374,
                            "x4": 256,
                            "y2": 374,
                        },
                        "is_key": 0,
                        "row_id": 2179893,
                        "text": "I",
                    }
                ],
                "category": "menu.cnt",
                "group_id": 3,
            },
            {
                "words": [
                    {
                        "quad": {
                            "x2": 270,
                            "y3": 418,
                            "x3": 270,
                            "y4": 418,
                            "x1": 258,
                            "y1": 402,
                            "x4": 258,
                            "y2": 402,
                        },
                        "is_key": 0,
                        "row_id": 2179894,
                        "text": "am",
                    }
                ],
                "category": "menu.cnt",
                "group_id": 4,
            },
            {
                "words": [
                    {
                        "quad": {
                            "x2": 272,
                            "y3": 444,
                            "x3": 272,
                            "y4": 444,
                            "x1": 258,
                            "y1": 428,
                            "x4": 258,
                            "y2": 428,
                        },
                        "is_key": 0,
                        "row_id": 2179895,
                        "text": "Luke",
                    }
                ],
                "category": "menu.cnt",
                "group_id": 5,
            },
        ],
    }
    file = BytesIO(mock_image_stream)
    # Same image bytes and same annotation for all three receipts
    for i in range(3):
        fn_i = image_folder.join(f"receipt_{i}.png")
        with open(fn_i, "wb") as f:
            f.write(file.getbuffer())
        fn_l = annotations_folder.join(f"receipt_{i}.json")
        with open(fn_l, "w") as f:
            json.dump(labels, f)
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("cord_train.zip")
    shutil.make_archive(root.join("cord_train"), "zip", str(cord_root))
    return str(archive_path)
def mock_synthtext_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock SynthText zip archive (gt.mat annotations + 3 JPEGs).

    Returns:
        str: path to the generated SynthText.zip archive
    """
    root = tmpdir_factory.mktemp("datasets")
    synthtext_root = root.mkdir("SynthText")
    image_folder = synthtext_root.mkdir("8")
    mat_content = {
        "imnames": [[["8/ballet_106_0.jpg"], ["8/ballet_106_1.jpg"], ["8/ballet_106_2.jpg"]]],
        "wordBB": [[np.random.randint(1000, size=(2, 4, 5)) for _ in range(3)]],
        "txt": [np.array([["I ", "am\na ", "Jedi ", "!"] for _ in range(3)])],
    }
    # scipy.io.savemat cannot target a LocalPath directly, so write to a real
    # temp file first and copy it into place
    with tempfile.NamedTemporaryFile(mode="wb", delete=True) as tmp:
        sio.savemat(tmp.name, mat_content)
        shutil.copy(tmp.name, str(synthtext_root.join("gt.mat")))
    for idx in range(3):
        with open(image_folder.join(f"ballet_106_{idx}.jpg"), "wb") as f:
            f.write(mock_image_stream)
    # Pack into an archive to simulate the real data set and bypass archive extraction
    shutil.make_archive(root.join("SynthText"), "zip", str(synthtext_root))
    return str(root.join("SynthText.zip"))
def mock_doc_artefacts(tmpdir_factory, mock_image_stream):
    """Create a mock artefact-detection zip archive (labels.json + 3 JPEGs).

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        str: path to the generated artefact_detection.zip archive
    """
    root = tmpdir_factory.mktemp("datasets")
    doc_root = root.mkdir("artefact_detection")
    # Geometries are relative [x_min, y_min, x_max, y_max] boxes — presumably;
    # confirm against the artefact detection loader.
    labels = {
        "0.jpg": [
            {"geometry": [0.94375, 0.4013671875, 0.99375, 0.4365234375], "label": "bar_code"},
            {"geometry": [0.03125, 0.6923828125, 0.07875, 0.7294921875], "label": "qr_code"},
            {"geometry": [0.1975, 0.1748046875, 0.39875, 0.2216796875], "label": "bar_code"},
        ],
        "1.jpg": [
            {"geometry": [0.94375, 0.4013671875, 0.99375, 0.4365234375], "label": "bar_code"},
            {"geometry": [0.03125, 0.6923828125, 0.07875, 0.7294921875], "label": "qr_code"},
            {"geometry": [0.1975, 0.1748046875, 0.39875, 0.2216796875], "label": "background"},
        ],
        "2.jpg": [
            {"geometry": [0.94375, 0.4013671875, 0.99375, 0.4365234375], "label": "logo"},
            {"geometry": [0.03125, 0.6923828125, 0.07875, 0.7294921875], "label": "qr_code"},
            {"geometry": [0.1975, 0.1748046875, 0.39875, 0.2216796875], "label": "photo"},
        ],
    }
    train_root = doc_root.mkdir("train")
    label_file = train_root.join("labels.json")
    with open(label_file, "w") as f:
        json.dump(labels, f)
    image_folder = train_root.mkdir("images")
    file = BytesIO(mock_image_stream)
    # One identical JPEG per annotated file name
    for i in range(3):
        fn = image_folder.join(f"{i}.jpg")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("artefact_detection.zip")
    shutil.make_archive(root.join("artefact_detection"), "zip", str(doc_root))
    return str(archive_path)
def mock_iiit5k_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock IIIT5K tar archive with a single annotated train image.

    Returns:
        str: path to the generated IIIT5K-Word-V3.tar archive
    """
    root = tmpdir_factory.mktemp("datasets")
    iiit5k_root = root.mkdir("IIIT5K")
    train_folder = iiit5k_root.mkdir("train")
    mat_content = {
        "trainCharBound": {"ImgName": ["train/0.png"], "chars": ["I"], "charBB": np.random.randint(50, size=(1, 4))},
    }
    # scipy.io.savemat cannot target a LocalPath directly, so write to a real
    # temp file first and copy it into place
    with tempfile.NamedTemporaryFile(mode="wb", delete=True) as tmp:
        sio.savemat(tmp.name, mat_content)
        shutil.copy(tmp.name, str(iiit5k_root.join("trainCharBound.mat")))
    # A single image, matching the "train/0.png" entry above
    for idx in range(1):
        with open(train_folder.join(f"{idx}.png"), "wb") as f:
            f.write(mock_image_stream)
    # Pack into an archive to simulate the real data set and bypass archive extraction
    shutil.make_archive(root.join("IIIT5K-Word-V3"), "tar", str(iiit5k_root))
    return str(root.join("IIIT5K-Word-V3.tar"))
def mock_svt_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock SVT zip archive (train.xml + 3 JPEGs) and return its path.

    The XML mirrors the real SVT annotation schema: per-image tagged rectangles
    with pixel coordinates and a word tag.
    """
    root = tmpdir_factory.mktemp("datasets")
    svt_root = root.mkdir("svt1")
    labels = """<tagset><image><imageName>img/00_00.jpg</imageName>
<address>341 Southwest 10th Avenue Portland OR</address><lex>LIVING,ROOM,THEATERS</lex>
<Resolution x="1280" y="880"/><taggedRectangles><taggedRectangle height="75" width="236" x="375" y="253">
<tag>LIVING</tag></taggedRectangle></taggedRectangles></image><image><imageName>img/00_01.jpg</imageName>
<address>1100 Southwest 6th Avenue Portland OR</address><lex>LULA</lex><Resolution x="1650" y="500"/>
<taggedRectangles><taggedRectangle height="80" width="250" x="450" y="242"><tag>HOUSE</tag></taggedRectangle>
</taggedRectangles></image><image><imageName>img/00_02.jpg</imageName>
<address>341 Southwest 10th Avenue Portland OR</address><lex>LIVING,ROOM,THEATERS</lex><Resolution x="850" y="420"/>
<taggedRectangles><taggedRectangle height="100" width="250" x="350" y="220"><tag>COST</tag></taggedRectangle>
</taggedRectangles></image></tagset>"""
    with open(svt_root.join("train.xml"), "w") as f:
        f.write(labels)
    image_folder = svt_root.mkdir("img")
    file = BytesIO(mock_image_stream)
    # Image names 00_00.jpg .. 00_02.jpg match the imageName entries above
    for i in range(3):
        fn = image_folder.join(f"00_0{i}.jpg")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("svt.zip")
    shutil.make_archive(root.join("svt"), "zip", str(svt_root))
    return str(archive_path)
def mock_ic03_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock IC03 train zip archive (words.xml + 3 JPEGs) and return its path.

    The XML mirrors the IC03 annotation schema: tagged rectangles with pixel
    coordinates, offset/rotation attributes and a word tag.
    """
    root = tmpdir_factory.mktemp("datasets")
    ic03_root = root.mkdir("SceneTrialTrain")
    labels = """<tagset><image><imageName>images/0.jpg</imageName><Resolution x="1280" y="880"/><taggedRectangles>
<taggedRectangle x="174.0" y="392.0" width="274.0" height="195.0" offset="0.0" rotation="0.0"><tag>LIVING</tag>
</taggedRectangle></taggedRectangles></image><image><imageName>images/1.jpg</imageName>
<Resolution x="1650" y="500"/>
<taggedRectangles><taggedRectangle x="244.0" y="440.0" width="300.0" height="220.0" offset="0.0" rotation="0.0">
<tag>HOUSE</tag></taggedRectangle></taggedRectangles></image><image><imageName>images/2.jpg</imageName>
<Resolution x="850" y="420"/><taggedRectangles>
<taggedRectangle x="180.0" y="400.0" width="280.0" height="250.0" offset="0.0" rotation="0.0"><tag>COST</tag>
</taggedRectangle></taggedRectangles></image></tagset>"""
    with open(ic03_root.join("words.xml"), "w") as f:
        f.write(labels)
    image_folder = ic03_root.mkdir("images")
    file = BytesIO(mock_image_stream)
    # Image names 0.jpg .. 2.jpg match the imageName entries above
    for i in range(3):
        fn = image_folder.join(f"{i}.jpg")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    # Packing data into an archive to simulate the real data set and bypass archive extraction
    archive_path = root.join("ic03_train.zip")
    shutil.make_archive(root.join("ic03_train"), "zip", str(ic03_root))
    return str(archive_path)
def mock_mjsynth_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock MJSynth layout (imlist.txt + per-word images).

    The ground-truth word is embedded in each file name (``12_<word>_34.jpg``),
    as in the real MJSynth distribution.

    Returns:
        tuple: (dataset root path, label file path) as strings
    """
    root = tmpdir_factory.mktemp("datasets")
    mjsynth_root = root.mkdir("mjsynth")
    image_folder = mjsynth_root.mkdir("images")
    label_file = mjsynth_root.join("imlist.txt")
    words = ["I", "am", "a", "Jedi", "!"]
    with open(label_file, "w") as f:
        f.writelines(f"./mjsynth/images/12_{word}_34.jpg\n" for word in words)
    for word in words:
        with open(image_folder.join(f"12_{word}_34.jpg"), "wb") as f:
            f.write(mock_image_stream)
    return str(root), str(label_file)
def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock IIIT-HWS layout (IIIT-HWS-90K.txt + normalized images).

    Each label line is "<relative path> <word> <writer id> <split flag>";
    one identical PNG is written per listed path.

    Returns:
        tuple: (dataset root path, label file path) as strings
    """
    root = tmpdir_factory.mktemp("datasets")
    iiithws_root = root.mkdir("iiit-hws")
    image_sub_folder = iiithws_root.mkdir("Images_90K_Normalized").mkdir("1")
    label_file = iiithws_root.join("IIIT-HWS-90K.txt")
    entries = [
        "./iiit-hws/Images_90K_Normalized/1/499_5_3_0_0.png I 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/117_1_3_0_0.png am 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/80_7_3_0_0.png a 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/585_3_2_0_0.png Jedi 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/222_5_3_0_0.png ! 1 0\n",
    ]
    with open(label_file, "w") as f:
        f.writelines(entries)
    for entry in entries:
        img_name = entry.split()[0].rsplit("/", 1)[-1]
        with open(image_sub_folder.join(img_name), "wb") as f:
            f.write(mock_image_stream)
    return str(root), str(label_file)
def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock WildReceipt dataset (JSON-lines annotations + 2 receipt images).

    Returns:
        tuple: (image folder path, annotation file path) as strings
    """
    root = tmpdir_factory.mktemp("datasets")
    wildreceipt_root = root.mkdir("wildreceipt")
    image_folder = wildreceipt_root.mkdir("image_files")
    samples = [
        {
            "file_name": "Image_58/20/receipt_0.jpeg",
            "height": 348,
            "width": 348,
            "annotations": [
                {"box": [263.0, 283.0, 325.0, 283.0, 325.0, 260.0, 263.0, 260.0], "text": "$55.96", "label": 17},
                {"box": [274.0, 308.0, 326.0, 308.0, 326.0, 286.0, 274.0, 286.0], "text": "$4.48", "label": 19},
            ],
        },
        {
            "file_name": "Image_58/20/receipt_1.jpeg",
            "height": 348,
            "width": 348,
            "annotations": [
                {"box": [386.0, 409.0, 599.0, 409.0, 599.0, 373.0, 386.0, 373.0], "text": "089-46169340", "label": 5}
            ],
        },
    ]
    annotation_file = wildreceipt_root.join("train.txt")
    # One JSON object per line, matching the real WildReceipt annotation format
    with open(annotation_file, "w") as f:
        for sample in samples:
            json.dump(sample, f)
            f.write("\n")
    nested_folder = image_folder.mkdir("Image_58").mkdir("20")
    for idx in range(2):
        with open(nested_folder.join(f"receipt_{idx}.jpeg"), "wb") as f:
            f.write(mock_image_stream)
    return str(image_folder), str(annotation_file)
def mock_cocotext_dataset(tmpdir_factory, mock_image_stream):
    """Create a mock COCO-Text v2 dataset (cocotext.v2.json + 3 JPEGs).

    Args:
        tmpdir_factory: pytest factory used to create the dataset folders
        mock_image_stream: raw JPEG bytes reused for every image file

    Returns:
        tuple: (image folder path, annotation file path) as strings
    """
    file = BytesIO(mock_image_stream)
    root = tmpdir_factory.mktemp("datasets")
    cocotext_root = root.mkdir("cocotext")
    annotations_folder = cocotext_root
    image_folder = cocotext_root.mkdir("train2014")
    filenames = [
        "COCO_train2014_000000353709.jpg",
        "COCO_train2014_000000077346.jpg",
        "COCO_train2014_000000437996.jpg",
    ]
    # Mirrors the COCO-Text v2 schema: "anns" keyed by annotation id,
    # "imgs" keyed by image id; bbox is [x, y, w, h] in pixels
    labels = {
        "cats": {},
        "anns": {
            "1": {
                "mask": [286.1, 215.5, 285.2, 221.5, 304.6, 222.0, 304.6, 216.9],
                "class": "machine printed",
                "bbox": [285.2, 215.5, 19.4, 6.5],
                "image_id": 367969,
                "id": 108418,
                "language": "english",
                "area": 105.6,
                "utf8_string": "GATO",
                "legibility": "legible",
            },
            "2": {
                "mask": [310.4, 304.6, 319.4, 302.1, 323.2, 318.1, 307.2, 318.1],
                "class": "machine printed",
                "bbox": [307.2, 302.1, 16.0, 16.0],
                "image_id": 77346,
                "id": 196817,
                "language": "english",
                "area": 184.75,
                "utf8_string": "6",
                "legibility": "legible",
            },
            "3": {
                "mask": [212.6, 245.8, 210.1, 248.6, 212.0, 262.8, 221.9, 260.9, 227.4, 244.6],
                "class": "machine printed",
                "bbox": [210.1, 244.6, 17.3, 18.2],
                "image_id": 437996,
                "id": 134765,
                "language": "english",
                "area": 221.31,
                "utf8_string": "17",
                "legibility": "legible",
            },
        },
        "imgs": {
            "367969": {"id": 367969, "set": "train", "width": 640, "file_name": f"{filenames[0]}", "height": 427},
            "77346": {"id": 77346, "set": "train", "width": 640, "file_name": f"{filenames[1]}", "height": 427},
            "437996": {"id": 437996, "set": "train", "width": 640, "file_name": f"{filenames[2]}", "height": 427},
        },
        "imgToAnns": {},
        "info": {},
    }
    annotation_file = annotations_folder.join("cocotext.v2.json")
    with open(annotation_file, "w") as f:
        json.dump(labels, f)
    # One identical JPEG per referenced file name
    file = BytesIO(mock_image_stream)
    for img_name in filenames:
        fn = image_folder.join(f"{img_name}")
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    return str(image_folder), str(annotation_file)