File size: 4,769 Bytes
0f8411f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | import spacy
import re
# from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
import nltk
# Both pipelines are loaded once at import time and shared by every function
# in this module.
# `nlp` supplies the named entities consumed by extract_medical_terms.
nlp = spacy.load("en_core_sci_md")
# `nlp2` supplies part-of-speech tags (remove_single_adj) and sentence
# boundaries (extract_associated_adjectives).
nlp2 = spacy.load("en_core_web_sm")
def check_word_in_sentence(word_list, sentence):
    """Return the entries of ``word_list`` that occur as tokens of ``sentence``.

    The sentence is lower-cased and tokenized with NLTK's ``word_tokenize``;
    each candidate is compared against whole tokens. The result follows the
    order of ``word_list`` (duplicates in ``word_list`` are kept).
    """
    sentence_tokens = nltk.tokenize.word_tokenize(sentence.lower())
    return [candidate for candidate in word_list if candidate in sentence_tokens]
def remove_special_characters(string):
    """Strip every character that is not a letter, whitespace, or ``.``/``!``/``?``.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text, or ``None`` when ``string`` is not text that
        ``re.sub`` can process with a ``str`` pattern (for example ``None``
        or a ``bytes`` object).
    """
    # Keep ASCII letters, whitespace and the sentence terminators . ! ?
    pattern = r"[^a-zA-Z\s.!?]+"
    try:
        return re.sub(pattern, "", string)
    except TypeError:
        # Non-text input: keep the historical "return None" contract, but only
        # for the error re.sub raises on a bad argument type, so unrelated
        # failures (KeyboardInterrupt, MemoryError, ...) are no longer hidden.
        return None
def check_negation(sentence, entity, negations, window=3):
    """Tell whether a negation word appears shortly before ``entity``.

    The sentence is split on whitespace and the first token equal to
    ``entity`` is located; the ``window`` tokens immediately before it are
    then checked for membership in ``negations``.

    Args:
        sentence: Sentence text to inspect.
        entity: Single token to look for (matched against whole tokens).
        negations: Container of negation tokens.
        window: How many tokens before the entity to inspect (default 3).

    Returns:
        ``True`` if any inspected token is in ``negations``; ``False`` if
        none is, or if ``entity`` is not a token of the sentence.
    """
    tokens = sentence.split()
    try:
        entity_index = tokens.index(entity)
    except ValueError:
        # The entity is not a whitespace-delimited token of this sentence.
        return False
    # max(0, ...) keeps the slice from wrapping around to the end of the list
    # when the entity sits fewer than `window` tokens from the start.
    preceding = tokens[max(0, entity_index - window):entity_index]
    return any(token in negations for token in preceding)
def clean_text(text):
    """Normalize ``text`` for the extraction pipeline.

    Special characters are stripped with ``remove_special_characters``, newline
    characters are deleted, and the result is lower-cased.
    """
    stripped = remove_special_characters(text)
    # Newlines are removed outright (no space is inserted in their place).
    single_line = stripped.replace('\n', '')
    return single_line.lower()
def extract_medical_terms(text):
    """Return the text of every entity labelled ``ENTITY`` found in ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    entities are returned in document order.
    """
    wanted_labels = ["ENTITY"]
    parsed = nlp(text)
    return [entity.text for entity in parsed.ents if entity.label_ in wanted_labels]
def add_negations(medical_terms, text):
    """Prefix negated medical terms with the negation word found in their sentence.

    ``text`` is lower-cased and split on ``.``, ``?`` and ``!``. For each
    sentence that contains a negation token, every term that occurs in the
    sentence and whose first word is closely preceded by a negation (see
    ``check_negation``) is rewritten as ``"<negation> <term>"``.

    Args:
        medical_terms: List of term strings; it is modified in place.
        text: Report text the terms were extracted from.

    Returns:
        The same ``medical_terms`` list, after in-place replacement.
    """
    negations = [
        'not', 'never', 'no', 'neither', 'none', 'nobody', 'nowhere',
        'nothing', 'without', "don't", "can't", "won't", "isn't", "aren't",
        "wasn't", "weren't", "haven't", "hasn't", "hadn't", "shouldn't",
        "wouldn't", "couldn't", "doesn't", "didn't", "may not", "might not",
        "need not", "mustn't", "shall not", "will not", "ought not to",
        "nevermore", "rarely", "scarcely",
    ]
    for sentence in re.split('[.?!]', text.lower()):
        found = check_word_in_sentence(negations, sentence)
        if not found:
            continue
        # enumerate() gives the slot of the term being examined, so the
        # replacement always lands on that element rather than on the first
        # element that happens to compare equal.
        for position, term in enumerate(medical_terms):
            words = term.split()
            # An empty/whitespace-only term has no first word to anchor on.
            if not words or term not in sentence:
                continue
            if check_negation(sentence, words[0], negations):
                # found[0] is the first match in `negations` order, which is
                # not necessarily the negation closest to the term.
                medical_terms[position] = found[0] + ' ' + term
    return medical_terms
def _is_bare_adjective(term):
    """Return True if ``term`` parses as a lone adjective or determiner + adjective."""
    parsed = nlp2(term.lower())
    if len(parsed) == 1:
        return parsed[0].pos_ == "ADJ"
    if len(parsed) == 2:
        return parsed[0].pos_ == "DET" and parsed[1].pos_ == "ADJ"
    return False


def remove_single_adj(medical_terms):
    """Drop terms that are only an adjective (optionally preceded by a determiner).

    Each term is lower-cased and tagged with the module-level ``nlp2``
    pipeline. Terms that parse as exactly one ``ADJ`` token, or as a ``DET``
    token followed by an ``ADJ`` token, are removed.

    Args:
        medical_terms: List of term strings; it is modified in place.

    Returns:
        The same ``medical_terms`` list with the bare adjectives removed.
    """
    # Build the filtered list first and assign it back through a slice:
    # calling list.remove() while iterating the same list skips the element
    # that follows every removal, so consecutive adjectives were missed.
    medical_terms[:] = [
        term for term in medical_terms if not _is_bare_adjective(term)
    ]
    return medical_terms
def extract_associated_adjectives(text, medical_terms):
    """Collect, per medical term, the modifier words found directly before it.

    ``text`` is parsed with the module-level ``nlp2`` pipeline. In every
    sentence, candidate words are gathered and a candidate is attached to a
    term when ``"<candidate> <term>"`` occurs in the sentence text. Matches
    are accumulated across all sentences (lower-cased, without duplicates,
    in order of first appearance).

    Args:
        text: Report text to analyse.
        medical_terms: Iterable of term strings to look up.

    Returns:
        Dict mapping each term to its list of modifiers; the list is empty
        when nothing matched. The dict is empty if the text has no sentences.
    """
    doc = nlp2(text)
    assoc_adjs = {}
    for sent in doc.sents:
        tokens = list(sent)
        # NOTE(review): this keeps the token *preceding* each ADJ token, not
        # the adjective itself (and so never the sentence's first token).
        # Preserved as-is; confirm whether that is the intended candidate.
        adjs = [
            previous.text
            for previous, current in zip(tokens, tokens[1:])
            if current.pos_ == 'ADJ'
        ]
        sentence_text = sent.text
        for term in medical_terms:
            # setdefault (rather than plain assignment) keeps matches from
            # earlier sentences; assigning a fresh list per sentence meant
            # only the final sentence's result survived.
            collected = assoc_adjs.setdefault(term, [])
            for adj in adjs:
                if adj + ' ' + term not in sentence_text:
                    continue
                lowered = adj.lower()
                if lowered not in collected:
                    collected.append(lowered)
    return assoc_adjs
def replace_medical_terms(medical_terms, assoc_adjs):
    """Prepend each term's associated modifiers to it, in place.

    Args:
        medical_terms: List of term strings; it is modified in place.
        assoc_adjs: Mapping from term to a list of modifier strings.

    Returns:
        The same ``medical_terms`` list. Terms with no entry, or with an
        empty entry, in ``assoc_adjs`` are left unchanged.
    """
    for position, term in enumerate(medical_terms):
        if term not in assoc_adjs:
            continue
        modifiers = assoc_adjs[term]
        if not modifiers:
            continue
        prefix = " ".join(modifiers)
        medical_terms[position] = prefix + ' ' + term
    return medical_terms
def medical_term(reftext):
    """Run the full extraction pipeline on ``reftext`` and return its terms.

    Steps, in order: clean the text, extract entity terms, drop bare
    adjectives, attach associated modifiers, then prefix negated terms.
    """
    cleaned = clean_text(reftext)
    terms = remove_single_adj(extract_medical_terms(cleaned))
    modifiers = extract_associated_adjectives(cleaned, terms)
    terms = replace_medical_terms(terms, modifiers)
    return add_negations(terms, cleaned)
|