Spaces:

TwinklData
/

Community_Collections_App

Sleeping

lynn-twinkl

training scirpts used for ner-finetuning

ac33de7 about 1 year ago

1.13 kB

	import json
	import sys

	raw_data = sys.argv[1]

	def load_data(json_path):
	"""
	Load your custom JSON with 'additional_info' and 'label' fields.
	Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples.
	"""
	with open(json_path, 'r', encoding='utf-8') as f:
	data = json.load(f)

	# If your JSON is a list of records
	# If it's a single record, wrap it in [data] or handle accordingly
	if not isinstance(data, list):
	data = [data]

	training_data = []

	for record in data:
	text = record["additional_info"]
	spans = []
	for annotation in record["label"]:
	# Each annotation can have multiple "labels", but typically there's just one
	label = annotation["labels"][0]
	start = annotation["start"]
	end = annotation["end"]
	spans.append((start, end, label))
	# Append in spaCy's format
	training_data.append((text, {"entities": spans}))

	return training_data

	if __name__ == "__main__":
	# Example usage
	TRAIN_DATA = load_data(raw_data)

	print(TRAIN_DATA[:2])