Spaces:

SinaLab
/

wojood-api

Running

File size: 7,470 Bytes

cfe897e

# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated tokens in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It then returns a list of the distilled entities together with their tags and positions (start, end), such as
# [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
# [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
def distill_entities(entities):
    """Collapse per-token nested IOB tags into a list of entity spans.

    Args:
        entities: sequence of ``[word, tags]`` items where ``tags`` is a
            whitespace-separated string of ``O``, ``B-<LABEL>`` or
            ``I-<LABEL>`` tags. The sequence is first passed through
            ``sortTags``.

    Returns:
        A list of ``[text, label, start, end]`` entries sorted by ``start``.
        ``text`` is the space-joined words of the entity, ``start`` is the
        zero-based index of the token carrying the ``B-`` tag, and ``end``
        starts equal to ``start`` and is incremented once for every ``I-``
        token appended to the entity.
    """
    finished = []
    # One slot per tag column; a slot is [text, label, start, end] and an
    # empty label means "no entity currently open in this column".
    open_slots = [["", "", 0, 0]]

    def emit(slot):
        # Record a snapshot of the slot in the output list.
        finished.append([slot[0].strip(), slot[1], slot[2], slot[3]])

    def emit_and_clear(slot, position):
        # Record the slot, then mark it as free again.
        emit(slot)
        slot[0] = ""
        slot[1] = ""
        slot[2] = position
        slot[3] = position

    for position, entity in enumerate(sortTags(entities)):
        for column, tag in enumerate(str(entity[1]).split()):
            # Grow the slot list so every tag column has a slot.
            if column >= len(open_slots):
                open_slots.append(["", "", 0, 0])

            if tag == "O":
                # An outside tag closes every open entity, except on the very
                # first token where nothing can be open yet.
                if position != 0:
                    for slot in open_slots:
                        if slot[1] != "":
                            emit_and_clear(slot, position)
                continue

            pieces = tag.split("-")
            if len(pieces) != 2:
                # Anything that is not exactly "<prefix>-<label>" is ignored.
                continue

            if pieces[0] == "B":
                # A begin tag replaces whatever was open in this column.
                slot = open_slots[column]
                if slot[1] != "":
                    emit(slot)
                slot[0] = str(entity[0]) + " "
                slot[1] = pieces[1]
                slot[2] = position
                slot[3] = position
            elif pieces[0] == "I" and position != 0:
                # Look from this column onwards for an open entity with the
                # same label that has not already consumed this token; every
                # non-matching open entity met on the way is closed.
                label = tag[2:]
                for slot in open_slots[column:]:
                    if slot[1] == label and slot[3] != position:
                        slot[0] += str(entity[0]) + " "
                        slot[3] += 1
                        break
                    if slot[1] != "":
                        emit_and_clear(slot, position)

    # Entities still open after the last token are emitted as they are.
    for slot in open_slots:
        if slot[1] != "":
            emit(slot)

    return sorted(finished, key=lambda item: item[2])

def sortTags(entities):
    """Return the tokens with each tag string padded and reordered.

    For every token after the first, the tags are compared with the
    (already processed) tags of the preceding token:

    * for each ``I-<LABEL>`` tag, extra ``I-<LABEL>`` tags are added until
      the number of tags containing ``<LABEL>`` is no smaller than in the
      preceding token;
    * the tags are sorted in descending string order;
    * if neither token carries an ``O`` tag, the ``I-`` tags are moved to
      the front in the order of the matching labels of the preceding token.

    The first token's tags are only sorted in descending order.

    Args:
        entities: sequence of ``[word, tags]`` items where ``tags`` is a
            whitespace-separated tag string.

    Returns:
        A new list holding a copy of every item with its tag string
        replaced by the normalised one. The input sequence and its items
        are left untouched, so repeated calls on the same data give the
        same result.
    """
    # Copy each token: the tag string is rewritten below and the caller's
    # data must not change as a side effect.
    normalised = [list(entity) for entity in entities]

    for position, entity in enumerate(normalised):
        tags = entity[1].split()
        # The preceding token has already been normalised at this point.
        previous_tags = normalised[position - 1][1].split() if position else []

        if position:
            # The list is deliberately scanned while it grows: a tag appended
            # here is examined again later, so padding continues until the
            # counts match.
            cursor = 0
            while cursor < len(tags):
                tag = tags[cursor]
                cursor += 1
                if tag[:2] != "I-":
                    continue
                label = tag.split("-")[1]
                # NOTE: this is substring containment, not label equality.
                count_here = sum(1 for other in tags if label in other)
                count_before = sum(
                    1 for other in previous_tags if label in other
                )
                if count_before > count_here:
                    tags.append("I-" + label)

        # Descending order puts "I-" tags ahead of "B-" tags.
        tags.sort(reverse=True)

        if position:
            aligned = []
            if "O" not in tags and "O" not in previous_tags:
                for reference in previous_tags:
                    for index, candidate in enumerate(tags):
                        if (
                            candidate[:2] == "I-"
                            and candidate[2:] == reference[2:]
                        ):
                            aligned.append(tags.pop(index))
                            break
                        if candidate[:2] == "B-":
                            # Only "I-" tags ahead of the first "B-" tag are
                            # candidates for this reference.
                            break
            # Whatever was not matched keeps its sorted order at the end.
            tags = aligned + tags

        entity[1] = " ".join(tags)

    return normalised