# By Wasim Khatib
# Version 2.0
#
# IBO_to_XML takes a list of annotated tokens in this format:
#   [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
#    ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
#    ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"],
#    ["يوم","B-DATE"], ["الخميس","I-DATE"]]
# and returns the words as one space-separated string in which entity spans
# are delimited by XML-style markers, e.g. the input above ends with
# "<DATE> يوم الخميس </DATE>".
# The function assumes the input is correct: every tag is "O" or starts with
# "B-" or "I-", and no tag is empty. An I- tag never opens a span by itself.
import numpy as np


def _close_slots(open_types, pieces, slots):
    """Emit a closing marker for every open slot in *slots* and clear it.

    ``open_types[k]`` is the entity type open at slot ``k`` ("" when none).
    Slots are visited in the order given by *slots*.
    """
    for k in slots:
        if open_types[k] != "":
            pieces.append("</" + str(open_types[k]) + ">")
            open_types[k] = ""


def IBO_to_XML(temp):
    """Convert IOB-annotated tokens to a string with XML-style span markers.

    Args:
        temp: sequence of ``[word, tags]`` pairs; ``tags`` is a
            space-separated string of labels ("O", "B-TYPE", "I-TYPE").
            The input is not modified.

    Returns:
        The words joined by single spaces, with ``<TYPE>`` emitted where a
        "B-TYPE" tag is seen and ``</TYPE>`` where the span ends. Leading and
        trailing whitespace is stripped.

    Tags are first normalised by ``sortTags``. The k-th tag of a word is
    matched against slot k of the list of currently open types. Tags that are
    not "O" and do not consist of exactly ``B``/``I``, one hyphen and a type
    are skipped.
    """
    temp_entities = sortTags(temp)
    # temp_list[k] is the entity type currently open at slot k ("" = none).
    temp_list = [""]
    pieces = []

    for word_position, entity in enumerate(temp_entities):
        for counter_tag, tag in enumerate(str(entity[1]).split()):
            # Grow the slot list so that slot `counter_tag` exists.
            if counter_tag >= len(temp_list):
                temp_list.append("")

            if tag == "O":
                # An outside word ends every open span, innermost first.
                # Nothing can be open before the first word.
                if word_position != 0:
                    _close_slots(
                        temp_list, pieces, range(len(temp_list) - 1, -1, -1)
                    )
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                continue
            kind, label = parts

            if kind == "B":
                # A new span replaces whatever was open in this slot.
                if temp_list[counter_tag] != "":
                    pieces.append("</" + str(temp_list[counter_tag]) + ">")
                temp_list[counter_tag] = label
                pieces.append("<" + str(label) + ">")
            elif kind == "I" and word_position != 0:
                # Find the first slot at or after this one that already holds
                # the same type; every slot skipped on the way is closed.
                stop = counter_tag
                while stop < len(temp_list) and temp_list[stop] != label:
                    stop += 1
                # Close the skipped slots innermost first.
                _close_slots(
                    temp_list, pieces, range(stop - 1, counter_tag - 1, -1)
                )

        pieces.append(str(entity[0]))

    # Close whatever is still open at the end of the input, innermost first
    # so the closing markers mirror the order in which spans were opened.
    _close_slots(temp_list, pieces, range(len(temp_list) - 1, -1, -1))
    return " ".join(pieces).strip()


def sortTags(entities):
    """Return a copy of *entities* with each word's tag string normalised.

    For every word (after the first) the function:
      1. pads the tags with extra "I-TYPE" entries while the previous word has
         more tags containing TYPE than this word does;
      2. sorts the tags in descending string order, which puts I- tags before
         B- tags;
      3. when neither this word nor the previous one carries an "O" tag,
         moves I- tags so they follow the order of the previous word's tags.
    The first word only gets step 2.

    Args:
        entities: sequence of ``[word, tags]`` pairs (``tags`` is a
            space-separated string). It is left unmodified.

    Returns:
        A new list of lists with element 1 replaced by the normalised,
        space-joined tag string.
    """
    # Work on a copy so the caller's data is not rewritten in place.
    temp_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        previous_tags = []

        if position != 0:
            # The previous word's tags were already normalised in the prior
            # iteration and do not change while this word is processed.
            previous_tags = temp_entities[position - 1][1].split()

            # `tags` may grow inside this loop; appended I- tags are visited
            # too, hence the explicit index instead of a for-loop.
            idx = 0
            while idx < len(tags):
                tag = tags[idx]
                if tag[0:2] == "I-":
                    label = tag.split("-")[1]
                    count_here = sum(1 for word in tags if label in word)
                    count_prev = sum(
                        1 for word in previous_tags if label in word
                    )
                    if count_prev > count_here:
                        tags.append("I-" + label)
                idx += 1

        # Descending order: tags starting with "I" come before those with "B".
        tags.sort(reverse=True)

        if position != 0:
            ordered = []
            if "O" not in tags and "O" not in previous_tags:
                for index, prev in enumerate(previous_tags):
                    j = 0
                    while j < len(tags):
                        if tags[j][0:2] == "I-" and tags[j][2:] == prev[2:]:
                            # Continue the span at the same position it had
                            # in the previous word.
                            ordered.insert(index, tags.pop(j))
                            break
                        elif tags[j][0:2] == "B-":
                            # I- tags precede B- tags, so none remain.
                            break
                        j += 1
            # Whatever was not matched keeps its sorted order at the end.
            ordered += tags
            tags = ordered

        entity[1] = " ".join(tags).strip()

    return temp_entities