Spaces:

SinaLab
/

wojood-api

Running

File size: 6,496 Bytes

# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"],
# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It then returns the XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
# This function assumes the input is correct: each tag must start with B- or I- and must not be empty.
# I- tags that have no matching B- tag are ignored.
import numpy as np


def _close_open_tags(open_labels, pieces):
    """Append a closing tag for every open label, innermost slot first, and free the slots."""
    for slot in range(len(open_labels) - 1, -1, -1):
        if open_labels[slot] != "":
            pieces.append("</" + str(open_labels[slot]) + ">")
            open_labels[slot] = ""


def IBO_to_XML(temp):
    """Convert a sequence of IOB-tagged words into a flat XML-like string.

    Args:
        temp: list of ``[word, tags]`` pairs, where ``tags`` is a
            space-separated string of ``O``, ``B-<LABEL>`` or ``I-<LABEL>``
            tags (several tags on one word express nested entities).
            The list is passed through ``sortTags`` first, which rewrites
            the tag strings in place.

    Returns:
        The words separated by single spaces, with ``<LABEL>`` emitted before
        the first word of each entity and ``</LABEL>`` after its last word.
        An empty input yields an empty string.

    Example:
        ``[["a", "B-OCC"], ["b", "I-OCC B-ORG"]]`` becomes
        ``"<OCC> a <ORG> b </ORG> </OCC>"``.

    Tags that are neither ``O`` nor of the form ``B-x`` / ``I-x`` are skipped.
    ``I-`` tags on the very first word are ignored because nothing can be open.
    """
    entities = sortTags(temp)

    # Output tokens (words and tags); joined with single spaces at the end.
    pieces = []
    # open_labels[k] is the label currently open at nesting slot k, or "" if
    # that slot is free. Slot k corresponds to the k-th tag of a word.
    open_labels = [""]

    for word_position, entity in enumerate(entities):
        for slot, tag in enumerate(str(entity[1]).split()):
            # Grow the slot table when a word carries more tags than seen so far.
            if slot >= len(open_labels):
                open_labels.append("")

            if tag == "O":
                # An outside word ends every open entity (nothing can be open
                # before the first word).
                if word_position != 0:
                    _close_open_tags(open_labels, pieces)
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                # Not of the form PREFIX-LABEL: ignore the tag.
                continue
            prefix, label = parts

            if prefix == "B":
                # A new entity starts in this slot; close whatever was open there.
                if open_labels[slot] != "":
                    pieces.append("</" + open_labels[slot] + ">")
                open_labels[slot] = label
                pieces.append("<" + label + ">")
            elif prefix == "I" and word_position != 0:
                # Continuation: look from this slot onwards for the entity being
                # continued, closing every other open entity met on the way.
                for candidate in range(slot, len(open_labels)):
                    if open_labels[candidate] == label:
                        break
                    if open_labels[candidate] != "":
                        pieces.append("</" + open_labels[candidate] + ">")
                        open_labels[candidate] = ""

        pieces.append(str(entity[0]))

    # Close what is still open at the end of the input. This must go
    # innermost-first so nested entities produce properly nested XML.
    _close_open_tags(open_labels, pieces)
    return " ".join(pieces).strip()


def sortTags(entities):
    """Normalise and order the tag string of every entity, in place.

    ``entities`` is a list of ``[word, tags]`` pairs where ``tags`` is a
    space-separated string. The second item of each pair is overwritten with
    the normalised tag string and the same list object is returned.

    For every entity after the first one:
      * each ``I-<LABEL>`` tag is duplicated until ``<LABEL>`` occurs in as
        many tags of this entity as in the (already normalised) previous one;
      * the tags are sorted in descending order, which puts ``I-`` before ``B-``;
      * unless either entity carries an ``O`` tag, the ``I-`` tags are moved to
        the front in the order of the previous entity's matching labels.

    The first entity only has its tags sorted in descending order.
    """
    for position, entity in enumerate(entities):
        current = entity[1].split()

        # The first entity has no predecessor to align with.
        if position == 0:
            current.sort(reverse=True)
            entities[position][1] = " ".join(current).strip()
            continue

        previous = entities[position - 1][1].split()

        # Pad missing I- tags. The list grows while it is being walked and the
        # freshly appended tags are examined as well, so use an explicit cursor.
        cursor = 0
        while cursor < len(current):
            candidate = current[cursor]
            cursor += 1
            if not candidate.startswith("I-"):
                continue
            label = candidate.split("-")[1]
            seen_here = sum(1 for item in current if label in item)
            seen_before = sum(1 for item in previous if label in item)
            if seen_before > seen_here:
                current.append("I-" + label)

        # Descending order: I- tags come before B- tags.
        current.sort(reverse=True)

        aligned = []
        if "O" not in current and "O" not in previous:
            # Pull out, in the previous entity's order, the I- tag continuing
            # each previous label; stop scanning at the first B- tag.
            for earlier in previous:
                for slot, candidate in enumerate(current):
                    if candidate.startswith("B-"):
                        break
                    if candidate.startswith("I-") and candidate[2:] == earlier[2:]:
                        aligned.append(current.pop(slot))
                        break

        # Whatever was not matched keeps its sorted order after the aligned tags.
        aligned += current
        entities[position][1] = " ".join(aligned).strip()

    return entities