wojood-api / IBO_to_XML.py
TymaaHammouda's picture
Update entities
6125895
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"],
# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It then returns the XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
# This function assumes the input is correct: every tag is either O or starts with B- or I-, and no tag is empty.
# I- tags that do not continue a previously opened B- tag are ignored.
import numpy as np
def IBO_to_XML(temp):
    """Convert IOB-annotated words into a space-separated, XML-like string.

    Args:
        temp: list of mutable ``[word, tags]`` pairs. ``tags`` is a
            whitespace-separated string whose k-th tag describes nesting
            depth k and is ``O``, ``B-<label>`` or ``I-<label>``.
            The list is first passed through ``sortTags``, which rewrites
            the tag strings of ``temp`` in place.

    Returns:
        A string in which every entity is wrapped as ``<label> ... </label>``
        and all tokens are separated by single spaces.

    Notes:
        * Tags that are not ``O`` and do not split into exactly two parts on
          ``-`` are skipped.
        * ``I-`` tags on the very first word are ignored.
        * Tags still open after the last word are closed innermost-first so
          that the emitted markup stays properly nested.
    """
    pieces = []
    # open_tags[k] holds the label currently open at nesting depth k,
    # or "" when nothing is open at that depth.
    open_tags = [""]

    for word_position, entity in enumerate(sortTags(temp)):
        for depth, tag in enumerate(str(entity[1]).split()):
            # Grow the stack lazily so every depth seen so far has a slot.
            if depth >= len(open_tags):
                open_tags.append("")

            if tag == "O":
                # An O tag ends every open entity (innermost first); on the
                # first word nothing can be open yet, so it is skipped.
                if word_position != 0:
                    for level in range(len(open_tags) - 1, -1, -1):
                        if open_tags[level] != "":
                            pieces.append("</" + open_tags[level] + ">")
                            open_tags[level] = ""
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                # Malformed tag: neither opens nor closes anything.
                continue
            prefix, label = parts

            if prefix == "B":
                # A new entity at this depth replaces whatever was open here.
                if open_tags[depth] != "":
                    pieces.append("</" + open_tags[depth] + ">")
                open_tags[depth] = label
                pieces.append("<" + label + ">")
            elif prefix == "I" and word_position != 0:
                # Walk down from this depth until the entity being continued
                # is found, closing every other open entity on the way.
                for level in range(depth, len(open_tags)):
                    if open_tags[level] == label:
                        break
                    if open_tags[level] != "":
                        pieces.append("</" + open_tags[level] + ">")
                        open_tags[level] = ""

        pieces.append(str(entity[0]))

    # Close whatever is still open, deepest level first, so nested entities
    # end in the reverse order of their opening tags.
    for label in reversed(open_tags):
        if label != "":
            pieces.append("</" + label + ">")

    return " ".join(pieces).strip()
def sortTags(entities):
    """Reorder (and pad) the tags of each entity so they line up with the previous word.

    Args:
        entities: list of mutable ``[word, tags]`` pairs, where ``tags`` is a
            whitespace-separated string of ``O``, ``B-<label>`` or
            ``I-<label>`` tags.

    Returns:
        The same ``entities`` list object. It is modified in place: element
        ``[1]`` of every pair is replaced by the normalised tag string.

    For every entity after the first one:
        1. For each ``I-<label>`` tag, if the previous entity carries more
           tags with exactly that label than the current one, extra
           ``I-<label>`` tags are appended until the counts match.
        2. Tags are sorted in descending order, which puts ``I-`` tags
           before ``B-`` tags.
        3. Unless either word carries an ``O`` tag, ``I-`` tags are moved so
           that they follow the order of the same labels in the previous
           entity; all remaining tags keep their sorted order after them.
    The first entity only gets step 2.
    """
    # Deliberately an alias, not a copy: callers receive their own list back
    # and each step reads the already-normalised tags of the previous word.
    temp_entities = entities

    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        previous_tags = temp_entities[position - 1][1].split() if position else []

        if position != 0:
            # `tags` may grow during the scan; appended tags are visited too,
            # so padding repeats until the counts are level.
            cursor = 0
            while cursor < len(tags):
                tag = tags[cursor]
                cursor += 1
                if tag[0:2] != "I-":
                    continue
                label = tag.split("-")[1]
                # Compare whole labels: a substring test would count e.g.
                # "B-PERCENT" as an occurrence of "PER".
                count_here = sum(
                    1 for other in tags if other.partition("-")[2] == label
                )
                count_before = sum(
                    1 for other in previous_tags if other.partition("-")[2] == label
                )
                if count_before > count_here:
                    tags.append("I-" + label)

        # Descending order places the I- tags in front of the B- tags.
        tags.sort(reverse=True)

        if position != 0:
            ordered = []
            # Alignment is only attempted when neither word is outside an entity.
            if "O" not in tags and "O" not in previous_tags:
                for slot, previous_tag in enumerate(previous_tags):
                    scan = 0
                    while scan < len(tags):
                        candidate = tags[scan]
                        if candidate[0:2] == "I-" and candidate[2:] == previous_tag[2:]:
                            # Continuation found: place it at the previous tag's slot.
                            ordered.insert(slot, tags.pop(scan))
                            break
                        if candidate[0:2] == "B-":
                            # I- tags sort first, so nothing past a B- tag can match.
                            break
                        scan += 1
            # Whatever was not matched keeps its sorted order at the end.
            ordered += tags
            tags = ordered

        temp_entities[position][1] = " ".join(tags).strip()

    return temp_entities