aaljabari commited on
Commit
c025d46
·
verified ·
1 Parent(s): fa45496

Create NER_Distiller.py

Browse files
Files changed (1) hide show
  1. NER_Distiller.py +138 -0
NER_Distiller.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function takes a list of annotated entities in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
4
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
5
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
6
+ # It then returns a list of the distilled entities together with their tags and positions (start, end), such as
7
+ # [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
8
+ # [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
9
def distill_entities(entities):
    """Collapse IOB-tagged words into a list of (possibly nested) entities.

    Args:
        entities: sequence of ``[word, tags]`` rows, where ``tags`` is a
            space-separated string of ``O`` / ``B-<LABEL>`` / ``I-<LABEL>``
            tags (several tags on one word describe nested entities).

    Returns:
        A list of ``[text, label, start, end]`` rows sorted by ``start``.
        ``start`` and ``end`` are 0-based, inclusive word indices into
        ``entities``. For example::

            [["x", "B-ORG"], ["y", "I-ORG B-LOC"], ["z", "I-ORG I-LOC"]]

        yields ``[["x y z", "ORG", 0, 2], ["y z", "LOC", 1, 2]]``.

    The input is not modified: ``sortTags`` rewrites tag strings, so it is
    handed a per-row copy rather than the caller's own rows.
    """
    # Defensive copy: sortTags overwrites row[1], which must not leak back
    # into the caller's data.
    normalized = sortTags([list(entity) for entity in entities])

    distilled = []
    # One slot per nesting depth: [text so far, label, start, end].
    # An empty label means "no entity currently open at this depth".
    open_slots = [["", "", 0, 0]]

    for word_position, entity in enumerate(normalized):
        word = str(entity[0])
        for depth, tag in enumerate(str(entity[1]).split()):
            # Grow the slot list the first time a deeper nesting level is seen.
            if depth >= len(open_slots):
                open_slots.append(["", "", 0, 0])

            if tag == "O":
                # An "O" closes every open entity; on the very first word
                # there is nothing to close yet.
                if word_position != 0:
                    for slot in open_slots:
                        _close_slot(slot, distilled, word_position)
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                # Not of the form "<prefix>-<label>": ignored.
                continue
            prefix, label = parts

            if prefix == "B":
                # A new entity starts at this depth; emit whatever was open.
                slot = open_slots[depth]
                if slot[1] != "":
                    distilled.append([slot[0].strip(), slot[1], slot[2], slot[3]])
                slot[0] = word + " "
                slot[1] = label
                slot[2] = word_position
                slot[3] = word_position
            elif prefix == "I" and word_position != 0:
                # Continue the first open entity (at this depth or deeper)
                # that has the same label and has not already consumed this
                # word; every non-matching slot passed on the way is closed.
                for slot in open_slots[depth:]:
                    if slot[1] == label and slot[3] != word_position:
                        slot[0] += word + " "
                        slot[3] += 1
                        break
                    _close_slot(slot, distilled, word_position)

    # Entities still open after the last word are emitted as well.
    for slot in open_slots:
        if slot[1] != "":
            distilled.append([slot[0].strip(), slot[1], slot[2], slot[3]])

    return sorted(distilled, key=lambda row: row[2])


def _close_slot(slot, distilled, word_position):
    """Emit ``slot`` into ``distilled`` if it holds an entity, then reset it."""
    if slot[1] != "":
        distilled.append([slot[0].strip(), slot[1], slot[2], slot[3]])
    slot[0] = ""
    slot[1] = ""
    slot[2] = word_position
    slot[3] = word_position
80
+
81
def sortTags(entities):
    """Return a copy of ``entities`` with each row's tag string normalised.

    Args:
        entities: sequence of rows whose item 0 is a word and item 1 is a
            space-separated string of ``O`` / ``B-<LABEL>`` / ``I-<LABEL>``
            tags.

    Returns:
        A new list of rows (each a new list). Item 1 of every row is
        rewritten as follows; all other items are carried over unchanged:

        * for every row after the first, an ``I-<LABEL>`` tag is added for
          as long as the previous row has more tags containing ``<LABEL>``
          than the current row;
        * tags are sorted in descending string order (``O``, then ``I-``,
          then ``B-``);
        * for every row after the first, when neither row contains ``O``,
          the ``I-`` tags are reordered to follow the order of the labels
          in the previous row's (already normalised) tags.

    The caller's rows are not modified.
    """
    # Copy each row: the tag string in position 1 is overwritten below and
    # that must not be visible through the caller's list.
    temp_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        # The previous row has already been normalised by the prior iteration.
        previous_tags = temp_entities[position - 1][1].split() if position else []

        if position:
            # Index-based walk on purpose: tags appended here must themselves
            # be visited, so padding continues until the counts level out.
            idx = 0
            while idx < len(tags):
                tag = tags[idx]
                idx += 1
                if tag[0:2] != "I-":
                    continue
                label = tag.split("-")[1]
                # NOTE(review): this is a substring test, so a label that is
                # contained in another label is counted for both.
                count_here = sum(1 for word in tags if label in word)
                count_before = sum(1 for word in previous_tags if label in word)
                if count_before > count_here:
                    tags.append("I-" + label)

        # Descending order puts continuation (I-) tags before opening (B-) tags.
        tags.sort(reverse=True)

        if position:
            ordered = []
            if "O" not in tags and "O" not in previous_tags:
                for slot, previous in enumerate(previous_tags):
                    j = 0
                    while j < len(tags):
                        if tags[j][0:2] == "I-" and tags[j][2:] == previous[2:]:
                            # Place the continuation at the depth its entity
                            # had on the previous word.
                            ordered.insert(slot, tags.pop(j))
                            break
                        if tags[j][0:2] == "B-":
                            # Only B- tags remain from here on; stop scanning.
                            break
                        j += 1
            # Whatever was not matched keeps its sorted order at the end.
            ordered += tags
            tags = ordered

        entity[1] = " ".join(tags).strip()

    return temp_entities