Spaces:
Running
Running
File size: 7,470 Bytes
cfe897e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated tokens in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It then returns a list of the distilled entities with their tags and positions (start, end), such as
# [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
# [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
def distill_entities(entities):
    """Convert nested IOB-tagged words into a flat list of entity spans.

    Args:
        entities: Sequence of ``[word, tags]`` pairs. ``tags`` is a
            space-separated string holding ``O``, ``B-<type>`` or
            ``I-<type>`` labels, e.g. ``["نقابة", "I-OCC B-ORG"]``.
            The pairs are copied before being handed to ``sortTags``,
            so the caller's data is not modified by this function.

    Returns:
        A list of ``[text, type, start, end]`` lists ordered by ``start``.
        ``start`` is the 0-based index of the word that carried the ``B-``
        tag; ``end`` begins equal to ``start`` and is increased by one for
        every word appended through a matching ``I-`` tag.
    """
    distilled = []
    tagged_words = sortTags([list(entity) for entity in entities])

    # slots[k] holds the entity currently being assembled for the k-th tag
    # of a word, stored as [text, type, start, end]; an empty type marks a
    # free slot.
    slots = [["", "", 0, 0]]

    for word_position, entity in enumerate(tagged_words):
        word = str(entity[0])
        for level, tag in enumerate(str(entity[1]).split()):
            if level >= len(slots):
                slots.append(["", "", 0, 0])

            if tag == "O":
                # An outside tag closes every open entity. Nothing can be
                # open at the very first word, so it is skipped there.
                if word_position != 0:
                    for slot in slots:
                        _flush_slot(slot, distilled)
                        _clear_slot(slot, word_position)
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                # Anything that is not exactly "<prefix>-<type>" is ignored.
                continue
            prefix, label = parts

            if prefix == "B":
                # A new entity starts in this slot; emit whatever the slot
                # was holding before overwriting it.
                slot = slots[level]
                _flush_slot(slot, distilled)
                slot[0] = word + " "
                slot[1] = label
                slot[2] = word_position
                slot[3] = word_position
            elif prefix == "I" and word_position != 0:
                # Continue the first slot (from this level onwards) that
                # holds the same type and has not already consumed this
                # word; slots passed over on the way are closed.
                for slot in slots[level:]:
                    if slot[1] == label and slot[3] != word_position:
                        slot[0] += word + " "
                        slot[3] += 1
                        break
                    _flush_slot(slot, distilled)
                    _clear_slot(slot, word_position)

    # Entities still open after the last word have to be emitted as well.
    for slot in slots:
        _flush_slot(slot, distilled)
    return sorted(distilled, key=lambda item: item[2])


def _flush_slot(slot, output):
    """Append the entity held in *slot* to *output* if the slot is in use."""
    if slot[1] != "":
        output.append([slot[0].strip(), slot[1], slot[2], slot[3]])


def _clear_slot(slot, position):
    """Empty *slot* and set its start and end markers to *position*."""
    slot[0] = ""
    slot[1] = ""
    slot[2] = position
    slot[3] = position
def sortTags(entities):
    """Return a copy of *entities* with each word's tags put in a stable order.

    Each word's tags are compared with the already-processed tags of the
    word before it:

    1. For every ``I-<type>`` tag, extra ``I-<type>`` tags are appended
       while the previous word mentions ``<type>`` more often than the
       current word does.
    2. The tags are sorted in descending order, which places ``I-`` tags
       before ``B-`` tags.
    3. Unless either word carries an ``O`` tag, the ``I-`` tags are moved
       to the positions of the same-typed tags of the previous word; the
       remaining tags follow in their sorted order.

    Args:
        entities: Sequence of ``[word, tags]`` pairs where ``tags`` is a
            space-separated string of tags.

    Returns:
        A new list of ``[word, tags]`` lists. The input pairs are copied
        first, so the caller's data is left untouched.
    """
    # Copy every pair: the normalised tag string is written back into the
    # pair, and that must not leak into the caller's data.
    sorted_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(sorted_entities):
        tags = entity[1].split()
        # The previous pair has already been normalised by this loop.
        previous_tags = sorted_entities[position - 1][1].split() if position else []

        if position != 0:
            # Index-based scan on purpose: tags appended below are visited
            # too, so padding repeats until the counts are level.
            scan = 0
            while scan < len(tags):
                tag = tags[scan]
                if tag[0:2] == "I-":
                    label = tag.split("-")[1]
                    in_current = sum(1 for other in tags if label in other)
                    in_previous = sum(1 for other in previous_tags if label in other)
                    if in_previous > in_current:
                        tags.append("I-" + label)
                scan += 1

        # Descending order puts the I- tags ahead of the B- tags.
        tags.sort(reverse=True)

        if position != 0 and "O" not in tags and "O" not in previous_tags:
            aligned = []
            for slot, previous_tag in enumerate(previous_tags):
                for offset, tag in enumerate(tags):
                    if tag[0:2] == "I-" and tag[2:] == previous_tag[2:]:
                        aligned.insert(slot, tags.pop(offset))
                        break
                    if tag[0:2] == "B-":
                        # Only B- tags remain from here on (sorted order),
                        # and those are never re-positioned.
                        break
            tags = aligned + tags

        entity[1] = " ".join(tags)
    return sorted_entities