Spaces:

sayehghp
/

vicca

Sleeping

App Files Files Community

vicca / Entity_Extract /EntityExtractorv2.py

sayehghp

Add application file

0f8411f 3 months ago

raw

history blame contribute delete

4.77 kB

	import spacy
	import re
	# from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
	import nltk

	# Pipeline whose entity spans extract_medical_terms() reads via `doc.ents`
	# (presumably the scispaCy biomedical model -- confirm it is installed).
	nlp = spacy.load("en_core_sci_md")
	# Second pipeline, used for part-of-speech tags in remove_single_adj() and
	# for sentence splitting plus POS tags in extract_associated_adjectives().
	nlp2 = spacy.load("en_core_web_sm")

	def check_word_in_sentence(word_list, sentence):
	    """Return the entries of ``word_list`` that occur as tokens of ``sentence``.

	    The sentence is lower-cased and tokenized with NLTK's ``word_tokenize``;
	    an entry counts as present only when it equals one whole token. Matches
	    are returned in the order they appear in ``word_list``.
	    """
	    sentence_tokens = nltk.tokenize.word_tokenize(sentence.lower())
	    return [candidate for candidate in word_list if candidate in sentence_tokens]

	def remove_special_characters(string):
	    """Strip every character that is not a letter, whitespace, or ``.``/``!``/``?``.

	    Digits and all other punctuation are removed; end-of-sentence marks are
	    kept so the text can still be split into sentences afterwards.

	    Args:
	        string: Text to clean.

	    Returns:
	        The cleaned text, or ``None`` when ``string`` is not something
	        ``re.sub`` can process with a text pattern (for example ``None``
	        or a number).
	    """
	    pattern = r"[^a-zA-Z\s.!?]+"

	    try:
	        return re.sub(pattern, "", string)
	    except TypeError:
	        # Non-text input: keep the best-effort contract of yielding None,
	        # but no longer swallow unrelated errors such as KeyboardInterrupt.
	        return None

	def check_negation(sentence, entity, negations):
	    """Report whether a negation word appears just before ``entity``.

	    The sentence is split on whitespace and only the first occurrence of
	    ``entity`` (as a whole token) is considered.

	    Args:
	        sentence: Sentence text to inspect.
	        entity: Single token to locate in the sentence.
	        negations: Collection of negation words to look for.

	    Returns:
	        ``True`` if any of the up-to-three tokens immediately preceding the
	        entity is in ``negations``; ``False`` otherwise, including when the
	        entity is not a token of the sentence.
	    """
	    tokens = sentence.split()
	    try:
	        entity_index = tokens.index(entity)
	    except ValueError:
	        # The entity is not present as a standalone token.
	        return False

	    # Up to three tokens directly before the entity, clamped at sentence start.
	    preceding = tokens[max(entity_index - 3, 0):entity_index]
	    return any(token in negations for token in preceding)

	def clean_text(text):
	    """Normalize raw text: strip special characters, flatten lines, lower-case.

	    Args:
	        text: Raw input text.

	    Returns:
	        The text with special characters removed by
	        ``remove_special_characters``, line breaks turned into spaces, and
	        all letters lower-cased.
	    """
	    text = remove_special_characters(text)
	    # A newline separates words; deleting it outright would fuse the last
	    # word of one line with the first word of the next.
	    text = text.replace('\n', ' ')
	    return text.lower()

	def extract_medical_terms(text):
	    """Return the text of every ``ENTITY``-labelled span ``nlp`` finds in ``text``.

	    The module-level ``nlp`` pipeline parses the text; spans with any other
	    label are ignored. Terms are returned in document order.
	    """
	    wanted_labels = ["ENTITY"]
	    parsed = nlp(text)
	    return [span.text for span in parsed.ents if span.label_ in wanted_labels]

	def add_negations(medical_terms, text):
	    """Prefix negated medical terms with a negation word from their sentence.

	    ``text`` is lower-cased and split into sentences on ``.``, ``?`` and
	    ``!``. In each sentence that contains a negation word, every term that
	    occurs in the sentence and whose first word is preceded by a negation
	    (as decided by ``check_negation``) is replaced by
	    ``"<negation> <term>"``.

	    Args:
	        medical_terms: List of term strings; updated in place.
	        text: Text the terms were extracted from.

	    Returns:
	        The same ``medical_terms`` list, after the in-place updates.
	    """
	    # list of negation words
	    negations = ['not', 'never', 'no', 'neither', 'none', 'nobody', 'nowhere', 'nothing', 'without', "don't", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "haven't", "hasn't", "hadn't", "shouldn't", "wouldn't", "couldn't", "doesn't", "didn't", "may not", "might not", "need not", "mustn't", "shall not", "will not", "ought not to", "nevermore", "rarely", "scarcely"]

	    for sentence in re.split('[.?!]', text.lower()):
	        # Negation words present anywhere in this sentence, in list order.
	        neg = check_word_in_sentence(negations, sentence)
	        if not neg:
	            continue

	        # Index-based replacement: looking the term up again with
	        # list.index() is O(n) and can hit a different, equal entry.
	        for position, term in enumerate(medical_terms):
	            words = term.split()
	            # A blank term has no first word to locate; skip it.
	            if not words or term not in sentence:
	                continue
	            if check_negation(sentence, words[0], negations):
	                # NOTE: neg[0] is the first matching entry of `negations`,
	                # not necessarily the word found right before the term.
	                medical_terms[position] = neg[0] + ' ' + term

	    return medical_terms

	def remove_single_adj(medical_terms):
	    """Drop terms that are only an adjective, or a determiner plus an adjective.

	    Each term is lower-cased and tagged with the module-level ``nlp2``
	    pipeline. A term is discarded when it is a single ``ADJ`` token or
	    exactly a ``DET`` token followed by an ``ADJ`` token.

	    Args:
	        medical_terms: List of term strings; filtered in place.

	    Returns:
	        The same ``medical_terms`` list with the bare-adjective terms removed.
	    """
	    def _is_bare_adjective(term):
	        # True for "<ADJ>" or "<DET> <ADJ>" terms.
	        tags = [token.pos_ for token in nlp2(term.lower())]
	        return tags == ["ADJ"] or tags == ["DET", "ADJ"]

	    # Build the filtered list first: calling list.remove() while iterating the
	    # same list skips the element that follows each removal. Slice assignment
	    # keeps the caller's list object updated in place.
	    medical_terms[:] = [term for term in medical_terms if not _is_bare_adjective(term)]
	    return medical_terms

	def extract_associated_adjectives(text, medical_terms):
	    """Map each medical term to the adjectives written directly before it.

	    ``text`` is parsed with the module-level ``nlp2`` pipeline. Within each
	    sentence, an adjective is associated with a term when the string
	    ``"<adjective> <term>"`` occurs in the sentence text.

	    Args:
	        text: Text to scan.
	        medical_terms: Iterable of term strings.

	    Returns:
	        Dict mapping every term to a list of lower-cased adjectives, without
	        duplicates, collected over all sentences. The list is empty when no
	        adjective precedes the term; the dict is empty when the parsed text
	        has no sentences.
	    """
	    doc = nlp2(text)

	    assoc_adjs = {}

	    for sent in doc.sents:
	        sentence_text = sent.text
	        # Collect the adjective tokens themselves, including one at the very
	        # start of the sentence.
	        adjs = [token.text for token in sent if token.pos_ == 'ADJ']

	        for term in medical_terms:
	            # Accumulate across sentences rather than overwriting, so a match
	            # in an earlier sentence is not lost to a later one.
	            found = assoc_adjs.setdefault(term, [])
	            for adj in adjs:
	                lowered = adj.lower()
	                if lowered not in found and adj + ' ' + term in sentence_text:
	                    found.append(lowered)
	    return assoc_adjs

	def replace_medical_terms(medical_terms, assoc_adjs):
	    """Prepend each term's associated adjectives to it, editing the list in place.

	    A term with a non-empty entry in ``assoc_adjs`` becomes the
	    space-joined adjectives followed by the term; every other term is left
	    untouched. The same list object is returned.
	    """
	    for position, base_term in enumerate(medical_terms):
	        modifiers = assoc_adjs.get(base_term)
	        if not modifiers:
	            # Unknown term or no adjectives recorded for it.
	            continue
	        medical_terms[position] = " ".join(modifiers) + ' ' + base_term
	    return medical_terms

	def medical_term(reftext):
	    """Run the whole extraction pipeline on ``reftext`` and return its terms.

	    Steps, in order: clean the text, extract entity terms, drop
	    bare-adjective terms, attach associated adjectives, then prefix
	    negations.
	    """
	    cleaned = clean_text(reftext)
	    terms = remove_single_adj(extract_medical_terms(cleaned))
	    adjectives_by_term = extract_associated_adjectives(cleaned, terms)
	    terms = replace_medical_terms(terms, adjectives_by_term)
	    return add_negations(terms, cleaned)