# vicca / Entity_Extract / EntityExtractorv2.py
# Uploaded by sayehghp
# Commit message: "Add application file"
# Commit: 0f8411f
import spacy
import re
# from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
import nltk
# Both spaCy pipelines are loaded once, at import time, and shared by the
# functions below.
# `nlp` is the pipeline used by extract_medical_terms() to find entities
# (presumably the scispaCy biomedical model -- confirm it is installed).
nlp = spacy.load("en_core_sci_md")
# `nlp2` is the pipeline used for part-of-speech tags and sentence splitting
# in remove_single_adj() and extract_associated_adjectives().
nlp2 = spacy.load("en_core_web_sm")
def check_word_in_sentence(word_list, sentence):
    """Return the entries of ``word_list`` that appear as tokens of ``sentence``.

    The sentence is lower-cased and tokenized with NLTK's ``word_tokenize``.
    An entry is reported only when it is equal to one of the resulting
    tokens. The result follows the order of ``word_list``.

    NOTE(review): entries that contain a space (multi-word phrases) are
    unlikely to ever equal a single token -- confirm whether phrase matching
    is expected here.
    """
    tokens = nltk.tokenize.word_tokenize(sentence.lower())
    return [candidate for candidate in word_list if candidate in tokens]
def remove_special_characters(string):
    """Delete every character that is not an ASCII letter, whitespace or ``.!?``.

    Digits, commas, hyphens, apostrophes and all other symbols are removed;
    the sentence terminators ``.``, ``!`` and ``?`` are kept.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text, or ``None`` when ``string`` is not something
        ``re.sub`` accepts with a ``str`` pattern (for example ``None``,
        ``bytes`` or a number).
    """
    # Anything outside letters / whitespace / end-of-sentence marks is dropped.
    pattern = r"[^a-zA-Z\s.!?]+"
    try:
        return re.sub(pattern, "", string)
    except TypeError:
        # Only the "wrong input type" failure is tolerated; unrelated
        # exceptions (KeyboardInterrupt, SystemExit, ...) are no longer
        # swallowed. None is returned explicitly to keep the old contract.
        return None
def check_negation(sentence, entity, negations):
    """Tell whether a negation word sits just before ``entity`` in ``sentence``.

    The sentence is split on whitespace and the first token equal to
    ``entity`` is located. The function then looks at up to three tokens
    immediately preceding it.

    Args:
        sentence: The sentence to inspect.
        entity: A single token to look for (exact, case-sensitive match).
        negations: Collection of negation words to test membership against.

    Returns:
        ``True`` if one of the (at most) three tokens before the first
        occurrence of ``entity`` is in ``negations``; ``False`` otherwise,
        including when ``entity`` is not a token of the sentence.
    """
    tokens = sentence.split()
    try:
        entity_index = tokens.index(entity)
    except ValueError:
        # list.index raises ValueError only: the entity is not a token here.
        return False
    # Window of one to three tokens directly in front of the entity; max()
    # keeps the slice from wrapping around when the entity is near the start.
    preceding = tokens[max(0, entity_index - 3):entity_index]
    return any(token in negations for token in preceding)
def clean_text(text):
    """Normalize raw report text before it is handed to the NLP pipelines.

    Steps: strip special characters with remove_special_characters(), remove
    line breaks, and lower-case the result.

    Args:
        text: The raw text (expected to be a ``str``).

    Returns:
        The cleaned, single-line, lower-cased text.
    """
    text = remove_special_characters(text)
    # A line break that is the only separator between two words must become
    # a space; deleting it outright would fuse the words together
    # ("effusion\nno" -> "effusionno").
    text = re.sub(r"(?<=\S)\n+(?=\S)", " ", text)
    # Any remaining newline already has whitespace next to it, so it can be
    # dropped without joining words.
    text = text.replace('\n', '')
    return text.lower()
def extract_medical_terms(text):
    """Return the text of every entity labelled ``ENTITY`` that ``nlp`` finds.

    ``text`` is processed with the module-level ``nlp`` pipeline and the
    surface strings of the matching entities are returned in document order.
    """
    wanted_labels = ["ENTITY"]
    parsed = nlp(text)
    return [entity.text for entity in parsed.ents if entity.label_ in wanted_labels]
def add_negations(medical_terms, text):
    """Prefix negated medical terms with the negation word found in their sentence.

    ``text`` is lower-cased and split into sentences on ``.``, ``?`` and
    ``!``. For each sentence that contains at least one negation word, every
    term that occurs in the sentence and whose first word is preceded by a
    negation (see check_negation()) is replaced by ``"<negation> <term>"``.
    The negation used is the first one reported by check_word_in_sentence()
    for that sentence.

    Args:
        medical_terms: List of term strings; it is modified in place.
        text: The report text the terms were extracted from.

    Returns:
        The same ``medical_terms`` list, with negated terms rewritten.
    """
    negations = [
        'not', 'never', 'no', 'neither', 'none', 'nobody', 'nowhere',
        'nothing', 'without', "don't", "can't", "won't", "isn't", "aren't",
        "wasn't", "weren't", "haven't", "hasn't", "hadn't", "shouldn't",
        "wouldn't", "couldn't", "doesn't", "didn't", "may not", "might not",
        "need not", "mustn't", "shall not", "will not", "ought not to",
        "nevermore", "rarely", "scarcely",
    ]
    sentences = re.split('[.?!]', text.lower())
    for sentence in sentences:
        neg = check_word_in_sentence(negations, sentence)
        if not neg:
            # No negation word in this sentence: nothing can be negated.
            continue
        # enumerate() gives the slot of the term being examined, so the
        # replacement always lands on that slot even when equal values exist
        # elsewhere in the list.
        for position, term in enumerate(medical_terms):
            words = term.split()
            # An empty / whitespace-only term has no first word to test and
            # would otherwise match every sentence.
            if not words or term not in sentence:
                continue
            if check_negation(sentence, words[0], negations):
                medical_terms[position] = neg[0] + ' ' + term
    return medical_terms
def remove_single_adj(medical_terms):
    """Drop terms that are only an adjective, or a determiner plus an adjective.

    Each term is lower-cased and tagged with the module-level ``nlp2``
    pipeline. A term is discarded when it consists of a single ``ADJ`` token,
    or of exactly two tokens tagged ``DET`` then ``ADJ``.

    Args:
        medical_terms: List of term strings; it is modified in place.

    Returns:
        The same ``medical_terms`` list with the rejected terms removed.
    """
    kept = []
    for term in medical_terms:
        idoc = nlp2(term.lower())
        lone_adjective = len(idoc) == 1 and idoc[0].pos_ == "ADJ"
        det_then_adjective = (
            len(idoc) == 2 and idoc[0].pos_ == "DET" and idoc[1].pos_ == "ADJ"
        )
        if not (lone_adjective or det_then_adjective):
            kept.append(term)
    # Filter into a new list and write it back in one step: calling
    # list.remove() while iterating the same list skips the element that
    # follows each removal. Slice assignment keeps the caller's list object.
    medical_terms[:] = kept
    return medical_terms
def extract_associated_adjectives(text, medical_terms):
    """Collect, per medical term, the adjectives that directly precede it.

    ``text`` is parsed with the module-level ``nlp2`` pipeline. In every
    sentence, each token tagged ``ADJ`` is a candidate; a candidate is
    associated with a term when the string ``"<adjective> <term>"`` occurs in
    that sentence's text.

    Args:
        text: The report text.
        medical_terms: Iterable of term strings to look for.

    Returns:
        A dict mapping each term to a list of lower-cased adjectives, in
        order of first appearance and without repeats. The list is empty
        when no adjective was found. The dict is empty when the parsed text
        yields no sentences.
    """
    doc = nlp2(text)
    assoc_adjs = {}
    for sent in doc.sents:
        sentence_text = sent.text
        # Collect the adjectives themselves (the ADJ-tagged tokens), not the
        # token that happens to stand in front of an adjective.
        adjs = [token.text for token in sent if token.pos_ == 'ADJ']
        for term in medical_terms:
            # setdefault keeps what earlier sentences contributed; plain
            # assignment here would let the last sentence erase them.
            found = assoc_adjs.setdefault(term, [])
            for adj in adjs:
                lowered = adj.lower()
                if adj + ' ' + term in sentence_text and lowered not in found:
                    found.append(lowered)
    return assoc_adjs
def replace_medical_terms(medical_terms, assoc_adjs):
    """Prepend each term's associated adjectives to the term, in place.

    For every term that has a non-empty adjective list in ``assoc_adjs``, the
    list entry is replaced by the adjectives joined with spaces, followed by
    a space and the term. Terms without adjectives are left as they are.
    The (mutated) ``medical_terms`` list is returned.
    """
    for position, original in enumerate(medical_terms):
        modifiers = assoc_adjs.get(original)
        if not modifiers:
            # Term unknown to the mapping, or known with no adjectives.
            continue
        medical_terms[position] = " ".join(modifiers) + ' ' + original
    return medical_terms
def medical_term(reftext):
    """Run the full extraction pipeline on ``reftext`` and return its terms.

    The text is cleaned, entities are extracted, adjective-only terms are
    dropped, preceding adjectives are attached to the remaining terms, and
    finally negated terms are prefixed with their negation word.
    """
    cleaned = clean_text(reftext)
    terms = remove_single_adj(extract_medical_terms(cleaned))
    adjectives_by_term = extract_associated_adjectives(cleaned, terms)
    terms = replace_medical_terms(terms, adjectives_by_term)
    return add_negations(terms, cleaned)