Spaces:

DataIntelligenceTeam
/

README

No application file

App Files Files Community

README / dataset.py

sxandie

Create new file

43a08bd over 3 years ago

raw

history blame contribute delete

5.9 kB

	### Create a file named dataset.py
	### Paste the following code into it
	# coding=utf-8
	import json
	import os
	from pathlib import Path
	import datasets
	from PIL import Image
	import pandas as pd

	# Module-level logger obtained through the logging helper of ``datasets``.
	logger = datasets.logging.get_logger(__name__)
	# Citation text handed to ``DatasetInfo(citation=...)``; currently just "{}".
	_CITATION = "{}"
	# Short description handed to ``DatasetInfo(description=...)``.
	_DESCRIPTION = "Discharge Summary"


	def load_image(image_path):
	    """Open the image at ``image_path`` and report its dimensions.

	    Args:
	        image_path: Location of the image file, handed straight to
	            ``Image.open``.

	    Returns:
	        A ``(image, (width, height))`` pair: the object returned by
	        ``Image.open`` together with its ``size`` as a plain tuple.
	    """
	    # NOTE(review): Pillow documents ``Image.open`` as lazy, so the underlying
	    # file presumably stays open until the caller loads or closes the image.
	    opened = Image.open(image_path)
	    width, height = opened.size
	    return opened, (width, height)

	def normalize_bbox(bbox, size):
	    """Scale a pixel-space bounding box onto a 0-1000 integer grid.

	    Horizontal coordinates are divided by the image width and vertical ones
	    by the image height, multiplied by 1000 and truncated to ``int``. The
	    result is then clamped to ``[0, 1000]`` so a box that slightly overshoots
	    the image edge (or has a negative coordinate) still produces a valid
	    value instead of an out-of-range one.

	    Args:
	        bbox: Sequence ``[xmin, ymin, xmax, ymax]`` in pixels.
	        size: Sequence ``(width, height)`` of the image in pixels.

	    Returns:
	        A list of four ints, each within ``0..1000``.

	    Raises:
	        ZeroDivisionError: If ``width`` or ``height`` is a plain ``0``.
	    """
	    width, height = size[0], size[1]
	    scaled = (
	        1000 * bbox[0] / width,
	        1000 * bbox[1] / height,
	        1000 * bbox[2] / width,
	        1000 * bbox[3] / height,
	    )
	    # NOTE(review): presumably consumed by a LayoutLM-style model, which only
	    # accepts coordinates within 0-1000 - confirm against the training code.
	    return [min(1000, max(0, int(value))) for value in scaled]


	class SroieConfig(datasets.BuilderConfig):
	    """Configuration object used by the ``Sroie`` dataset builder."""

	    def __init__(self, **kwargs):
	        """Create the config, delegating every option to the base class.

	        Args:
	            **kwargs: Keyword arguments passed unchanged to
	                ``datasets.BuilderConfig.__init__``.
	        """
	        super().__init__(**kwargs)


	class Sroie(datasets.GeneratorBasedBuilder):
	    """Dataset builder that reads word-level annotations from local CSV files.

	    Each split is read from ``dataset/<split>/``, which must hold two folders:
	    ``img_dir`` with the page images and ``annotation_dir`` with one CSV per
	    image (same base name, ``.csv`` extension). The CSV columns used are
	    ``left``, ``top``, ``width``, ``height``, ``text`` and ``label``.
	    """

	    BUILDER_CONFIGS = [
	        SroieConfig(
	            name="discharge",
	            version=datasets.Version("1.0.0"),
	            description="Discharge summary dataset",
	        ),
	    ]

	    def _info(self):
	        """Return the ``DatasetInfo`` describing the features of an example."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {
	                    "id": datasets.Value("string"),
	                    "words": datasets.Sequence(datasets.Value("string")),
	                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
	                    "ner_tags": datasets.Sequence(
	                        datasets.features.ClassLabel(
	                            names=[
	                                'others',
	                                'produttore_key',
	                                'produttore_value',
	                                'cliente_key',
	                                'cliente_value',
	                                'unitloc_key',
	                                'unitloc_value',
	                                'operatore_key',
	                                'operatore_value',
	                                'referente_key',
	                                'referente_value',
	                                'cfproduttore_key',
	                                'cfproduttore_value',
	                                'telefono_key',
	                                'telefono_value',
	                                'emailcliente_key',
	                                'emailcliente_value',
	                                'datarichiesta_key',
	                                'datarichiesta_value',
	                                'orariorichiesta_key',
	                                'orariorichiesta_value',
	                                'emailproduttore_key',
	                                'emailproduttore_value',
	                                'mattina_key',
	                                'mattina_value',
	                                'pomeriggio_key',
	                                'pomeriggio_value',
	                                'cer_key',
	                                'cer_value',
	                                'descrizione_key',
	                                'descrizione_value',
	                                'sf_key',
	                                'sf_value',
	                                'classpericolo_key',
	                                'classpericolo_value',
	                                'destino_key',
	                                'destino_value',
	                                'confezionamento_key',
	                                'confezionamento_value',
	                                'destinazione_key',
	                                'destinazione_value',
	                            ]
	                        )
	                    ),
	                    # The image itself is not stored; only its path is kept.
	                    "image_path": datasets.Value("string"),
	                }
	            ),
	            supervised_keys=None,
	            citation=_CITATION,
	            homepage="",
	        )

	    def _split_generators(self, dl_manager):
	        """Return the train and test ``SplitGenerator`` objects.

	        Nothing is downloaded: ``dl_manager`` is unused and both splits are
	        read from the relative folder ``dataset``.
	        """
	        # NOTE(review): the root is hard-coded; ``data_dir`` is not consulted
	        # even though an earlier comment here mentioned it - confirm intent.
	        dest = Path('dataset')

	        return [
	            datasets.SplitGenerator(
	                name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest / "train"}
	            ),
	            datasets.SplitGenerator(
	                name=datasets.Split.TEST, gen_kwargs={"filepath": dest / "test"}
	            ),
	        ]

	    def _generate_examples(self, filepath):
	        """Yield one ``(key, example)`` pair per image found in a split folder.

	        Args:
	            filepath: Split folder containing ``img_dir`` and
	                ``annotation_dir``.

	        Yields:
	            ``(guid, example)`` where ``guid`` is the position of the image in
	            the sorted listing of ``img_dir`` and ``example`` holds ``id``,
	            ``words``, ``bboxes`` (normalized with ``normalize_bbox``),
	            ``ner_tags`` and ``image_path``.
	        """
	        logger.info("⏳ Generating examples from = %s", filepath)
	        ann_dir = os.path.join(filepath, "annotation_dir")
	        img_dir = os.path.join(filepath, "img_dir")

	        for guid, fname in enumerate(sorted(os.listdir(img_dir))):
	            # The annotation file shares the image's base name.
	            stem = os.path.splitext(fname)[0]
	            df = pd.read_csv(os.path.join(ann_dir, stem + ".csv"))

	            image_path = os.path.join(img_dir, fname)
	            logger.debug("Processing %s", image_path)

	            image, size = load_image(image_path)
	            # Only the dimensions are needed; release the file handle right
	            # away so long runs do not accumulate open files.
	            image.close()

	            # Convert (left, top, width, height) rows into corner coordinates
	            # and rescale them relative to the image size.
	            right = df["left"] + df["width"]
	            bottom = df["top"] + df["height"]
	            boxes = [
	                normalize_bbox([xmin, ymin, xmax, ymax], size)
	                for xmin, ymin, xmax, ymax in zip(df["left"], df["top"], right, bottom)
	            ]

	            # A coordinate above 1000 means the annotated box extends past
	            # the image edge; report it instead of printing to stdout.
	            for box in boxes:
	                if any(coord > 1000 for coord in box):
	                    logger.warning(
	                        "Bounding box %s exceeds the 0-1000 range in %s", box, image_path
	                    )

	            yield guid, {
	                "id": str(guid),
	                "words": df["text"].tolist(),
	                "bboxes": boxes,
	                "ner_tags": df["label"].tolist(),
	                "image_path": image_path,
	            }