Spaces:
Running
Running
| # By Wasim Khatib | |
| # Version 2.0 | |
| # This function takes a list of annotated entities in this format: [["صرح","O"], | |
| # ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"], | |
| # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"], | |
| # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]] | |
| # It then returns XML-style text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG> | |
| # جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE> | |
| # This function assumes the input is correct: each tag must start with B- or I- and must not be empty. | |
| # I- tags that have no matching B- tag are ignored. | |
| import numpy as np | |
def IBO_to_XML(temp):
    """Render IOB-tagged words as text with inline XML-style entity tags.

    Args:
        temp: Sequence of ``[word, tags]`` pairs. ``tags`` is a
            whitespace-separated string whose items are ``"O"``,
            ``"B-<TYPE>"`` or ``"I-<TYPE>"``; the i-th item on a word is
            matched against nesting level i. The pairs are first normalised
            by ``sortTags``.

    Returns:
        The words separated by single spaces with ``<TYPE>`` / ``</TYPE>``
        markers inserted around entities, stripped of surrounding whitespace.
        Levels still open after the last word are closed innermost-first so
        the closing markers nest correctly.
    """
    temp_entities = sortTags(temp)

    # Output fragments; every fragment carries its own leading space.
    pieces = []
    # open_levels[i] holds the entity type currently open at nesting level i,
    # or "" when nothing is open at that level.
    open_levels = [""]

    for word_position, entity in enumerate(temp_entities):
        for level, tag in enumerate(str(entity[1]).split()):
            # Grow the level table on demand.
            if level >= len(open_levels):
                open_levels.append("")

            parts = tag.split("-")

            if tag == "O":
                # Outside any entity: close everything, innermost first.
                # Nothing can be open before the first word, so skip it there.
                if word_position != 0:
                    for j in range(len(open_levels) - 1, -1, -1):
                        if open_levels[j] != "":
                            pieces.append(" </" + open_levels[j] + ">")
                            open_levels[j] = ""
            elif len(parts) == 2 and parts[0] == "B":
                # A new entity starts at this level: close whatever was open
                # here, then open the new type.
                if open_levels[level] != "":
                    pieces.append(" </" + open_levels[level] + ">")
                open_levels[level] = parts[1]
                pieces.append(" <" + open_levels[level] + ">")
            elif len(parts) == 2 and parts[0] == "I" and word_position != 0:
                # Continuation: scan from this level for an open entity of the
                # same type, closing every non-matching level passed on the way.
                for j in range(level, len(open_levels)):
                    if open_levels[j] == parts[1]:
                        break
                    if open_levels[j] != "":
                        pieces.append(" </" + open_levels[j] + ">")
                        open_levels[j] = ""
            # NOTE(review): a level whose index is >= the number of tags on
            # this word is left open until a later "O" tag or the end of the
            # input; confirm that this is intended.

        pieces.append(" " + str(entity[0]))

    # Close whatever is still open. Walk from the innermost level outwards so
    # nested entities end as "</INNER> </OUTER>" rather than being interleaved.
    for j in range(len(open_levels) - 1, -1, -1):
        if open_levels[j] != "":
            pieces.append(" </" + open_levels[j] + ">")

    return "".join(pieces).strip()
def sortTags(entities):
    """Normalise the tag string of every ``[word, tags]`` pair.

    For each word after the first, the tags are adjusted relative to the
    (already normalised) tags of the previous word:

    1. For every ``I-<TYPE>`` tag, extra ``I-<TYPE>`` tags are appended while
       the previous word has more tags containing ``<TYPE>`` than this word.
    2. The tags are sorted in descending string order, which places ``I-``
       tags before ``B-`` tags.
    3. Unless either word carries an ``"O"`` tag, ``I-`` tags are reordered to
       follow the order of the matching types on the previous word; the
       remaining tags are kept after them.

    Args:
        entities: Sequence of ``[word, tags]`` pairs, where ``tags`` is a
            whitespace-separated string of tags. It is not modified.

    Returns:
        A new list of ``[word, tags]`` lists with normalised tag strings.
    """
    # Work on a copy so the caller's data is left untouched (and so that
    # tuple pairs are accepted as well as lists).
    temp_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        previous_tags = None

        if position != 0:
            previous_tags = temp_entities[position - 1][1].split()

            # Index-based loop on purpose: tags appended below are visited
            # too, so padding repeats until the counts match.
            i = 0
            while i < len(tags):
                tag = tags[i]
                if tag[0:2] == "I-":
                    tag_type = tag.split("-")[1]
                    count_here = sum(1 for word in tags if tag_type in word)
                    count_before = sum(
                        1 for word in previous_tags if tag_type in word
                    )
                    # The previous word carries more tags of this type, so
                    # add another continuation tag for it.
                    if count_before > count_here:
                        tags.append("I-" + tag_type)
                i += 1

        # Descending order puts "I-" tags ahead of "B-" tags.
        tags.sort(reverse=True)

        if position != 0:
            sorted_tags = []
            # Align with the previous word only when neither word is outside
            # an entity.
            if "O" not in tags and "O" not in previous_tags:
                for index, previous in enumerate(previous_tags):
                    for j, current in enumerate(tags):
                        if current[0:2] == "I-" and current[2:] == previous[2:]:
                            # Move the continuation tag to the slot of the
                            # previous word's matching tag.
                            sorted_tags.insert(index, tags.pop(j))
                            break
                        elif current[0:2] == "B-":
                            # "B-" tags come last after sorting; stop scanning.
                            break
            sorted_tags += tags
            tags = sorted_tags

        entity[1] = " ".join(tags).strip()

    return temp_entities