# stanza-digphil/stanza/utils/ner/paying_annotators.py
# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
import json
import os
def get_worker_subs(json_string):
    """
    Gets the AWS worker IDs ("subs") from the contents of one annotation file.

    The JSON document must be an object with an "answers" list; each answer must
    carry its worker id under workerMetadata -> identityData -> sub.

    :param json_string: contents of the annotation file (anything json.loads() accepts)
    :return: list of the AWS worker subs, one per answer, in the order the answers appear
    :raises json.JSONDecodeError: if json_string is not valid JSON
    :raises KeyError: if "answers" or one of the nested worker fields is missing
    """
    # json.loads() works on JSON strings, json.load() is for JSON files
    job_data = json.loads(json_string)
    return [
        answer["workerMetadata"]["identityData"]["sub"]
        for answer in job_data["answers"]
    ]
def track_tasks(input_path, worker_map=None):
    """
    Counts how many labeling tasks each worker completed.

    Takes a path to a folder containing the worker annotation metadata from an AWS Sagemaker
    labeling job and a dictionary mapping AWS worker subs to their names or identification tags,
    and returns a dictionary mapping the names/identification tags to the number of labeling
    tasks completed.

    If no worker map is provided (None or empty), this function returns a dictionary mapping
    the worker "sub" fields to the number of tasks they completed.

    Details of the traversal:
      - entries of input_path that are not directories are ignored
      - sub-directories that contain no files are ignored
      - only the first file (in sorted name order) of each sub-directory is read

    Details of the mapping:
      - a sub that is missing from worker_map is reported under the raw sub itself
      - if several subs map to the same identification, their counts are added together

    :param input_path: string of the path to the directory containing the worker annotation sub-directories
    :param worker_map: dictionary mapping AWS worker subs to the worker identifications
    :return: dictionary mapping worker identifications to the number of tasks completed
    """
    tracker = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the result reproducible
    for direc in sorted(os.listdir(input_path)):
        subdir_path = os.path.join(input_path, direc)
        if not os.path.isdir(subdir_path):
            # stray file sitting next to the annotation sub-directories
            continue
        file_names = sorted(
            name for name in os.listdir(subdir_path)
            if os.path.isfile(os.path.join(subdir_path, name))
        )
        if not file_names:
            continue
        json_file_path = os.path.join(subdir_path, file_names[0])
        # JSON is UTF-8; do not depend on the platform's default encoding
        with open(json_file_path, encoding="utf-8") as json_file:
            json_string = json_file.read()
        for sub in get_worker_subs(json_string):
            tracker[sub] = tracker.get(sub, 0) + 1

    if not worker_map:
        return tracker

    res = {}
    for sub, count in tracker.items():
        # unknown subs stay visible under their raw id instead of aborting the whole count
        worker = worker_map.get(sub, sub)
        # accumulate so that two subs mapped to one worker do not overwrite each other
        res[worker] = res.get(worker, 0) + count
    return res
def main():
    """
    Prints the task counts for a sample completed labeling job, first with a worker map
    and then without one.

    The sample directory is given relative to the current working directory.
    """
    # built with os.path.join so the relative path also resolves on non-Windows systems
    sample_dir = os.path.join("..", "tests", "ner", "aws_labeling_copy")

    # sample from completed labeling job
    print(track_tasks(sample_dir, worker_map={
        "7efc17ac-3397-4472-afe5-89184ad145d0": "Worker1",
        "afce8c28-969c-4e73-a20f-622ef122f585": "Worker2",
        "91f6236e-63c6-4a84-8fd6-1efbab6dedab": "Worker3",
        "6f202e93-e6b6-4e1d-8f07-0484b9a9093a": "Worker4",
        "2b674d33-f656-44b0-8f90-d70a1ab71ec2": "Worker5",
    }))

    # sample from completed labeling job -- no worker map provided
    print(track_tasks(sample_dir))
# Run the sample report only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()