Spaces:
No application file
No application file
| ### Create a file named `dataset.py` | |
| ### Paste the following code into it | |
| # coding=utf-8 | |
| import json | |
| import os | |
| from pathlib import Path | |
| import datasets | |
| from PIL import Image | |
| import pandas as pd | |
# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)

# Citation text handed to `datasets.DatasetInfo`; currently just a placeholder.
_CITATION = "{}"

# Short description handed to `datasets.DatasetInfo`.
_DESCRIPTION = "Discharge Summary"
def load_image(image_path):
    """Open an image file and report its dimensions.

    Args:
        image_path: Path of the image, passed straight to ``Image.open``.

    Returns:
        A pair ``(image, (width, height))`` where ``image`` is the object
        returned by ``Image.open`` and the dimensions come from its ``size``
        attribute.  The image is not closed here.
    """
    img = Image.open(image_path)
    width, height = img.size
    dimensions = (width, height)
    return img, dimensions
def normalize_bbox(bbox, size):
    """Rescale a box from pixel coordinates to a 0-1000 based grid.

    Args:
        bbox: Indexable holding ``x0, y0, x1, y1`` at positions 0-3.
        size: Indexable holding the image width at position 0 and the image
            height at position 1.

    Returns:
        A list of four ints: each x coordinate multiplied by 1000 and divided
        by the width, each y coordinate multiplied by 1000 and divided by the
        height, truncated with ``int``.  Values are not clamped, so a
        coordinate beyond the image edge yields a result above 1000.
    """
    width, height = size[0], size[1]

    def scale(value, extent):
        # Same arithmetic for every coordinate; int() truncates toward zero.
        return int(1000 * value / extent)

    left = scale(bbox[0], width)
    top = scale(bbox[1], height)
    right = scale(bbox[2], width)
    bottom = scale(bbox[3], height)
    return [left, top, right, bottom]
class SroieConfig(datasets.BuilderConfig):
    """Builder configuration for this dataset.

    Adds no options of its own; everything is handled by
    ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed unchanged to the parent
                ``datasets.BuilderConfig`` initializer.
        """
        super().__init__(**kwargs)
class Sroie(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads locally stored images and CSV annotations.

    Each split lives in its own directory containing ``img_dir`` (the images)
    and ``annotation_dir`` (one ``<image base name>.csv`` per image).  Every
    image becomes one example made of its words, their bounding boxes scaled
    by ``normalize_bbox``, their labels and the image path.
    """

    BUILDER_CONFIGS = [
        SroieConfig(name="discharge", version=datasets.Version("1.0.0"), description="Discharge summary dataset"),
    ]

    def _info(self):
        """Describe the dataset: its features, description and citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    # Every field has a *_key tag (the printed field name) and
                    # a *_value tag (its content); everything else is 'others'.
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                'others',
                                'produttore_key',
                                'produttore_value',
                                'cliente_key',
                                'cliente_value',
                                'unitloc_key',
                                'unitloc_value',
                                'operatore_key',
                                'operatore_value',
                                'referente_key',
                                'referente_value',
                                'cfproduttore_key',
                                'cfproduttore_value',
                                'telefono_key',
                                'telefono_value',
                                'emailcliente_key',
                                'emailcliente_value',
                                'datarichiesta_key',
                                'datarichiesta_value',
                                'orariorichiesta_key',
                                'orariorichiesta_value',
                                'emailproduttore_key',
                                'emailproduttore_value',
                                'mattina_key',
                                'mattina_value',
                                'pomeriggio_key',
                                'pomeriggio_value',
                                'cer_key',
                                'cer_value',
                                'descrizione_key',
                                'descrizione_value',
                                'sf_key',
                                'sf_value',
                                'classpericolo_key',
                                'classpericolo_value',
                                'destino_key',
                                'destino_value',
                                'confezionamento_key',
                                'confezionamento_value',
                                'destinazione_key',
                                'destinazione_value',
                            ]
                        )
                    ),
                    "image_path": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            citation=_CITATION,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Return the train and test split generators for local files.

        Nothing is downloaded; ``dl_manager`` is unused.  The split
        directories are ``<root>/train`` and ``<root>/test``, where ``<root>``
        is the configuration's ``data_dir`` when one was supplied and the
        relative directory ``dataset`` otherwise.

        Args:
            dl_manager: Download manager supplied by ``datasets`` (unused).

        Returns:
            A list with one ``datasets.SplitGenerator`` per split.
        """
        # Honour a user-supplied data_dir, but keep the historical default so
        # callers that never passed one see no change.
        data_dir = self.config.data_dir
        root = Path(data_dir) if data_dir else Path("dataset")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": root / "train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": root / "test"}
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield one example per file found in ``<filepath>/img_dir``.

        For every image (processed in sorted file-name order) the CSV with the
        same base name is read from ``<filepath>/annotation_dir``.  The CSV
        must provide the columns ``left``, ``top``, ``width``, ``height``,
        ``text`` and ``label``.

        Args:
            filepath: Directory of one split.

        Yields:
            ``(guid, example)`` pairs, where ``guid`` is the running index of
            the image and ``example`` holds ``id``, ``words``, ``bboxes``,
            ``ner_tags`` and ``image_path``.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotation_dir")
        img_dir = os.path.join(filepath, "img_dir")
        for guid, fname in enumerate(sorted(os.listdir(img_dir))):
            name, _ = os.path.splitext(fname)
            df = pd.read_csv(os.path.join(ann_dir, name + ".csv"))

            image_path = os.path.join(img_dir, fname)
            image, size = load_image(image_path)
            # Only the dimensions are needed; release the file handle now
            # instead of keeping one open per image.
            image.close()

            # Convert (left, top, width, height) into corner coordinates and
            # scale them with normalize_bbox.
            right = df["left"] + df["width"]
            bottom = df["top"] + df["height"]
            boxes = [
                normalize_bbox(box, size)
                for box in zip(df["left"], df["top"], right, bottom)
            ]

            # A coordinate outside 0-1000 means the annotation extends beyond
            # the image; report it through the logger rather than stdout.
            if any(not 0 <= coord <= 1000 for box in boxes for coord in box):
                logger.warning(
                    "Bounding box outside the 0-1000 range in %s", image_path
                )

            yield guid, {
                "id": str(guid),
                "words": df["text"].tolist(),
                "bboxes": boxes,
                "ner_tags": df["label"].tolist(),
                "image_path": image_path,
            }