lynn-twinkl
training scirpts used for ner-finetuning
ac33de7
raw
history blame
1.13 kB
import json
import sys
raw_data = sys.argv[1]
def load_data(json_path):
"""
Load your custom JSON with 'additional_info' and 'label' fields.
Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples.
"""
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
# If your JSON is a list of records
# If it's a single record, wrap it in [data] or handle accordingly
if not isinstance(data, list):
data = [data]
training_data = []
for record in data:
text = record["additional_info"]
spans = []
for annotation in record["label"]:
# Each annotation can have multiple "labels", but typically there's just one
label = annotation["labels"][0]
start = annotation["start"]
end = annotation["end"]
spans.append((start, end, label))
# Append in spaCy's format
training_data.append((text, {"entities": spans}))
return training_data
if __name__ == "__main__":
# Example usage
TRAIN_DATA = load_data(raw_data)
print(TRAIN_DATA[:2])