# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"],
# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It then returns the XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
# This function assumes the input is correct: every tag is "O" or starts with B- or I-, and no tag is empty.
# I- tags that have no matching B- tag are ignored.
import numpy as np
def IBO_to_XML(temp):
    """Convert IOB-tagged tokens into text with inline XML-style entity tags.

    ``temp`` is a list of ``[word, tags]`` pairs, where ``tags`` is a
    space-separated string such as ``"O"``, ``"B-ORG"`` or ``"I-OCC B-ORG"``.
    The pairs are first normalised with ``sortTags``; after that, the position
    of a tag inside the string is used as its nesting level.

    Returns one string: the words separated by single spaces, with ``<TYPE>``
    placed before a word tagged ``B-TYPE`` and ``</TYPE>`` placed where that
    entity ends. Leading and trailing whitespace is stripped.

    Tags that are neither ``"O"`` nor of the form ``B-TYPE`` / ``I-TYPE``
    (exactly one ``-``) are skipped, and ``I-`` tags on the very first word
    are ignored because no entity can be open yet.
    """
    pieces = []
    # open_tags[level] is the entity type currently open at that nesting
    # level, or "" when nothing is open there.
    open_tags = [""]

    for word_position, entity in enumerate(sortTags(temp)):
        for level, tag in enumerate(str(entity[1]).split()):
            # Grow the stack the first time a deeper nesting level is seen.
            if level >= len(open_tags):
                open_tags.append("")

            if tag == "O":
                # "O" ends every open entity, innermost first. An "O" on the
                # first word is ignored.
                if word_position != 0:
                    for j in reversed(range(len(open_tags))):
                        if open_tags[j] != "":
                            pieces.append(" </" + str(open_tags[j]) + ">")
                            open_tags[j] = ""
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                # Not a well-formed B-/I- tag; skip it.
                continue
            prefix, entity_type = parts

            if prefix == "B":
                # A new entity starts at this level: close whatever was open
                # here, then open the new one.
                if open_tags[level] != "":
                    pieces.append(" </" + str(open_tags[level]) + ">")
                open_tags[level] = entity_type
                pieces.append(" <" + str(entity_type) + ">")
            elif prefix == "I" and word_position != 0:
                # Continuation: close open entities from this level onward
                # until the one being continued is found.
                for j in range(level, len(open_tags)):
                    if open_tags[j] == entity_type:
                        break
                    if open_tags[j] != "":
                        pieces.append(" </" + str(open_tags[j]) + ">")
                        open_tags[j] = ""

        # The word itself follows any tags opened or closed for it.
        pieces.append(" " + str(entity[0]))

    # Close whatever is still open at end of input. This must run innermost
    # first (highest level first), like the "O" branch above; closing in
    # forward order would emit mis-nested output such as
    # "<OCC> a <ORG> b </OCC> </ORG>".
    for j in reversed(range(len(open_tags))):
        if open_tags[j] != "":
            pieces.append(" </" + str(open_tags[j]) + ">")

    return "".join(pieces).strip()
def sortTags(entities):
    """Normalise the tag string of every token so nested tags line up.

    ``entities`` is a sequence of ``[word, tags]`` rows, where ``tags`` is a
    space-separated string of ``"O"``, ``B-TYPE`` and ``I-TYPE`` tags.

    For every row after the first:

    * for each ``I-TYPE`` tag, while the previous row mentions ``TYPE`` more
      often than this row does, another ``I-TYPE`` is appended;
    * the tags are sorted in descending order, so ``I-`` tags come before
      ``B-`` tags;
    * unless this row or the previous one contains ``"O"``, the ``I-`` tags
      are reordered to follow the order of the previous row's tags, with the
      remaining tags kept after them.

    The first row is only sorted. "Previous row" always means the row as
    already rewritten by this function.

    Returns a new list of rows (each a ``list``) with the rewritten tag
    strings. The rows passed in are not modified.
    """
    # Work on per-row copies: the rewritten tag string is stored back into
    # the row, and that must not leak into the caller's data.
    temp_entities = [list(entity) for entity in entities]

    for temp_counter, entity in enumerate(temp_entities):
        tags = entity[1].split()

        if temp_counter != 0:
            previous_tags = temp_entities[temp_counter - 1][1].split()

            # NOTE: ``tags`` grows while it is being iterated. That is
            # intentional: appended I- tags are visited too, so appending
            # repeats until this row mentions the type as often as the
            # previous row does.
            for tag in tags:
                if tag[0:2] == "I-":
                    entity_type = tag.split("-")[1]
                    # Substring matching counts B- and I- tags of the type.
                    count_here = sum(1 for word in tags if entity_type in word)
                    count_previous = sum(
                        1 for word in previous_tags if entity_type in word
                    )
                    if count_previous > count_here:
                        tags.append("I-" + entity_type)

        # Descending order puts I- tags ahead of B- tags.
        tags.sort(reverse=True)

        if temp_counter != 0:
            remaining = tags
            aligned = []
            # Rows involving "O" are left in plain sorted order.
            if "O" not in remaining and "O" not in previous_tags:
                for slot, previous_tag in enumerate(previous_tags):
                    j = 0
                    while j < len(remaining):
                        candidate = remaining[j]
                        if (
                            candidate[0:2] == "I-"
                            and candidate[2:] == previous_tag[2:]
                        ):
                            # Continue the previous row's entity in its slot.
                            aligned.insert(slot, remaining.pop(j))
                            break
                        if candidate[0:2] == "B-":
                            # I- tags sort first, so none are left to match.
                            break
                        j += 1
            # Whatever was not matched keeps its sorted order at the end.
            tags = aligned + remaining

        entity[1] = " ".join(tags).strip()

    return temp_entities
|