Spaces:

SinaLab
/

wojood-api

Running

App Files Files Community

wojood-api / IBO_to_XML.py

TymaaHammouda

Update entities

6125895 about 2 months ago

raw

history blame contribute delete

6.5 kB

	# By Wasim Khatib
	# Version 2.0
	# This function takes a list of annotated entities in this format: [["صرح","O"],
	# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
	# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
	# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
	# It then returns the text as XML in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
	# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
	# This function assumes the input is correct: each tag must start with B- or I- and must not be empty.
	# I- tags that do not have a matching B- tag are ignored.
	import numpy as np


	def _close_levels(open_types, levels):
	    """Return closing-tag strings for the given levels and mark them closed."""
	    closing = []
	    for level in levels:
	        if open_types[level] != "":
	            closing.append(" </" + str(open_types[level]) + ">")
	            open_types[level] = ""
	    return closing


	def IBO_to_XML(temp):
	    """Convert IOB-annotated words into text with inline XML-style entity tags.

	    Args:
	        temp: Sequence of ``[word, tags]`` items. ``tags`` is a
	            space-separated string such as ``"O"``, ``"B-ORG"`` or
	            ``"I-OCC B-ORG"``; the position of a tag inside the string is
	            treated as its nesting level. The items are first passed through
	            ``sortTags``.

	    Returns:
	        A single string: the words separated by spaces, with ``<TYPE>`` and
	        ``</TYPE>`` markers around entities. Tags that are not ``O`` and do
	        not have the exact form ``B-TYPE`` / ``I-TYPE`` are ignored, as is an
	        ``I-`` tag on the very first word.
	    """
	    # Output fragments are collected and joined once at the end instead of
	    # growing a string with "+=" on every step.
	    pieces = []
	    # open_types[level] is the entity type currently open at that nesting
	    # level, or "" when nothing is open there.
	    open_types = [""]

	    for word_position, entity in enumerate(sortTags(temp)):
	        for level, tag in enumerate(str(entity[1]).split()):
	            if level >= len(open_types):
	                open_types.append("")

	            parts = tag.split("-")
	            if tag == "O":
	                # Outside any entity: close everything still open,
	                # innermost level first.
	                if word_position != 0:
	                    pieces.extend(
	                        _close_levels(open_types, reversed(range(len(open_types))))
	                    )
	            elif len(parts) == 2 and parts[0] == "B":
	                # A new entity starts at this level: close whatever was open
	                # here, then open the new type.
	                # NOTE(review): deeper levels that are still open are not
	                # closed here; confirm whether that can occur in practice.
	                if open_types[level] != "":
	                    pieces.append(" </" + str(open_types[level]) + ">")
	                open_types[level] = parts[1]
	                pieces.append(" <" + str(open_types[level]) + ">")
	            elif len(parts) == 2 and parts[0] == "I" and word_position != 0:
	                # Continuation: look from this level downwards for the open
	                # entity of the same type; every level passed on the way is
	                # closed.
	                mismatched = []
	                for candidate in range(level, len(open_types)):
	                    if open_types[candidate] == parts[1]:
	                        break
	                    mismatched.append(candidate)
	                # Emit the closing tags innermost-first so they nest properly
	                # (the original emitted them outermost-first).
	                pieces.extend(_close_levels(open_types, reversed(mismatched)))

	        pieces.append(" " + str(entity[0]))

	    # End of input: close whatever is still open, innermost level first, so
	    # the result is properly nested (the original closed outermost-first).
	    pieces.extend(_close_levels(open_types, reversed(range(len(open_types)))))
	    return "".join(pieces).strip()


	def sortTags(entities):
	    """Return a copy of ``entities`` with each word's tags put in nesting order.

	    For every word after the first:

	    1. ``I-`` tags are topped up: while the previous word's (already sorted)
	       tags mention an entity type more often than this word's tags do,
	       another ``I-<type>`` is added.
	    2. The tags are sorted in reverse alphabetical order, which places ``O``
	       before ``I-`` tags and ``I-`` tags before ``B-`` tags.
	    3. Unless ``O`` occurs in this or the previous word's tags, ``I-`` tags
	       are moved to the front in the order of the previous word's tags, so a
	       continued entity keeps its position; the remaining tags follow.

	    The first word only gets step 2.

	    Args:
	        entities: Sequence of ``[word, tags]`` items (lists or tuples), where
	            ``tags`` is a space-separated string of tags.

	    Returns:
	        A new list of lists with element 1 replaced by the sorted tag string.
	        The input and its items are not modified.
	    """
	    sorted_entities = []
	    # Sorted tags of the previously processed word; None while on the first.
	    previous_tags = None

	    for entity in entities:
	        tags = entity[1].split()

	        if previous_tags is not None:
	            # Index loop on purpose: tags appended below are visited as well,
	            # so an I- tag keeps being added until its count catches up with
	            # the previous word.
	            position = 0
	            while position < len(tags):
	                tag = tags[position]
	                if tag[0:2] == "I-":
	                    entity_type = tag.split("-")[1]
	                    # Substring test: a type is also counted inside longer
	                    # tag names that contain it.
	                    current_count = sum(1 for item in tags if entity_type in item)
	                    previous_count = sum(
	                        1 for item in previous_tags if entity_type in item
	                    )
	                    if previous_count > current_count:
	                        tags.append("I-" + entity_type)
	                position += 1

	        # Reverse alphabetical order: "O", then "I-..." tags, then "B-..." tags.
	        tags.sort(reverse=True)

	        if previous_tags is not None:
	            aligned = []
	            if "O" not in tags and "O" not in previous_tags:
	                for previous_tag in previous_tags:
	                    for index, tag in enumerate(tags):
	                        if tag[0:2] == "B-":
	                            # I- tags come first, so nothing after this matches.
	                            break
	                        if tag[0:2] == "I-" and tag[2:] == previous_tag[2:]:
	                            # At most one tag is moved per previous tag, so
	                            # appending keeps the previous word's order.
	                            aligned.append(tags.pop(index))
	                            break
	            tags = aligned + tags

	        # Copy the item so the caller's data is left untouched; this also
	        # makes tuple items work.
	        sorted_entity = list(entity)
	        sorted_entity[1] = " ".join(tags)
	        sorted_entities.append(sorted_entity)
	        previous_tags = tags

	    return sorted_entities