| | import spacy |
| | import re |
| | |
| | import nltk |
| |
|
# Pipelines are loaded once at import time and shared by every function below.
# `nlp` (en_core_sci_md) supplies the entities read in extract_medical_terms.
nlp = spacy.load(name="en_core_sci_md")
# `nlp2` (en_core_web_sm) supplies the part-of-speech tags and sentence
# boundaries read in remove_single_adj and extract_associated_adjectives.
nlp2 = spacy.load(name="en_core_web_sm")
| |
|
def check_word_in_sentence(word_list, sentence):
    """Return the entries of ``word_list`` that occur as tokens of ``sentence``.

    The sentence is lower-cased and split with NLTK's ``word_tokenize``; an
    entry matches only when it is equal to one whole token.

    Args:
        word_list: Candidate words to look for.
        sentence: Text to search.

    Returns:
        A new list with the matching entries, in ``word_list`` order.
    """
    # NOTE(review): entries containing a space (e.g. "may not") can only match
    # if the tokenizer emits them as a single token -- presumably it never
    # does; confirm against the NLTK tokenizer in use.
    sentence_tokens = nltk.tokenize.word_tokenize(sentence.lower())
    return [candidate for candidate in word_list if candidate in sentence_tokens]
| |
|
def remove_special_characters(string):
    """Delete every character that is not an ASCII letter, whitespace, or one of ``. ! ?``.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text, or ``None`` when ``string`` is not something the
        str pattern can be applied to (for example ``None`` or ``bytes``).
    """
    pattern = r"[^a-zA-Z\s.!?]+"
    try:
        return re.sub(pattern, "", string)
    except TypeError:
        # re.sub raises TypeError for non-str input. Only that case is
        # treated as "nothing to clean"; anything else (KeyboardInterrupt
        # included) propagates instead of being silently swallowed.
        return None
| |
|
def check_negation(sentence, entity, negations):
    """Tell whether a negation word appears just before ``entity`` in ``sentence``.

    The sentence is split on whitespace and only the first token equal to
    ``entity`` is considered. The up-to-three tokens immediately before it
    are compared against ``negations``.

    Args:
        sentence: Text to inspect.
        entity: Single token to locate in the sentence.
        negations: Collection of negation tokens.

    Returns:
        ``True`` if one of the (at most) three preceding tokens is in
        ``negations``; ``False`` otherwise, including when ``entity`` is not
        a token of the sentence.
    """
    tokens = sentence.split()
    try:
        entity_index = tokens.index(entity)
    except ValueError:
        # list.index signals "not found" with ValueError; nothing else is
        # expected here, so nothing else is caught.
        return False

    # Clamp at 0 so an entity near the start of the sentence does not wrap
    # around to tokens at the end of the list.
    preceding = tokens[max(0, entity_index - 3):entity_index]
    return any(token in negations for token in preceding)
| |
|
def clean_text(text):
    """Normalise raw text: strip special characters, flatten newlines, lower-case.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lower-cased text on a single line.
    """
    text = remove_special_characters(text)
    # Substitute a space rather than the empty string: deleting the newline
    # outright would glue the last word of one line to the first word of the
    # next ("fever\ncough" -> "fevercough").
    text = text.replace('\n', ' ')
    return text.lower()
| |
|
def extract_medical_terms(text):
    """Collect the text of every entity the module-level ``nlp`` pipeline finds.

    Args:
        text: Text to analyse.

    Returns:
        A list with the surface text of each entity whose label is
        ``"ENTITY"``, in document order.
    """
    wanted_labels = ["ENTITY"]
    return [span.text for span in nlp(text).ents if span.label_ in wanted_labels]
| |
|
def add_negations(medical_terms, text):
    """Prefix negated terms with a negation word found in their sentence.

    ``text`` is lower-cased and split into sentences on ``.``, ``?`` and
    ``!``. For each sentence that contains at least one negation word, every
    term that occurs in the sentence and whose first word is preceded (within
    three tokens) by a negation is rewritten as ``"<negation> <term>"``.

    Args:
        medical_terms: List of terms; updated in place.
        text: Text the terms were extracted from.

    Returns:
        The same ``medical_terms`` list, after the in-place updates.
    """
    negations = ['not', 'never', 'no', 'neither', 'none', 'nobody', 'nowhere', 'nothing', 'without', "don't", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "haven't", "hasn't", "hadn't", "shouldn't", "wouldn't", "couldn't", "doesn't", "didn't", "may not", "might not", "need not", "mustn't", "shall not", "will not", "ought not to", "nevermore", "rarely", "scarcely"]

    sentences = re.split('[.?!]', text.lower())
    for sentence in sentences:
        neg = check_word_in_sentence(negations, sentence)
        if not neg:
            continue

        # enumerate gives the slot of the term being examined; looking it up
        # again with list.index could hit an earlier element that happens to
        # equal the current term.
        for position, term in enumerate(medical_terms):
            # Sentences are lower-cased, so compare a lower-cased copy of the
            # term; the original spelling is kept in the result.
            needle = term.lower()
            words = needle.split()
            if not words:
                # Blank term: there is no first word to look for.
                continue
            if needle not in sentence:
                continue
            if check_negation(sentence, words[0], negations):
                # neg[0] is the first entry of `negations` present in the
                # sentence, not necessarily the one nearest to the term.
                medical_terms[position] = neg[0] + ' ' + term

    return medical_terms
| |
|
def _is_bare_adjective(term):
    """Return True when ``term`` is a lone adjective or a determiner + adjective."""
    idoc = nlp2(term.lower())
    if len(idoc) == 1:
        return idoc[0].pos_ == "ADJ"
    if len(idoc) == 2:
        return idoc[0].pos_ == "DET" and idoc[1].pos_ == "ADJ"
    return False


def remove_single_adj(medical_terms):
    """Drop terms that consist only of an adjective (optionally after a determiner).

    Each term is lower-cased and tagged with the module-level ``nlp2``
    pipeline; one-token ``ADJ`` terms and two-token ``DET ADJ`` terms are
    removed.

    Args:
        medical_terms: List of terms; filtered in place.

    Returns:
        The same ``medical_terms`` list with the bare adjectives removed.
    """
    # Build the surviving items first and then write them back in one step.
    # Calling list.remove while iterating the same list shifts the remaining
    # items left, so the element right after each removal is never examined.
    kept = [term for term in medical_terms if not _is_bare_adjective(term)]
    medical_terms[:] = kept
    return medical_terms
| |
|
def extract_associated_adjectives(text, medical_terms):
    """Map each term to the adjectives that directly precede it in ``text``.

    ``text`` is parsed with the module-level ``nlp2`` pipeline. Within each
    sentence, every token tagged ``ADJ`` is a candidate; it is associated
    with a term when ``"<adjective> <term>"`` occurs in the sentence text.

    Args:
        text: Text to analyse.
        medical_terms: Terms to look up.

    Returns:
        A dict mapping each term to a list of lower-cased adjectives, in
        order of first appearance and without duplicates. The dict is empty
        when the parsed text has no sentences.
    """
    doc = nlp2(text)
    assoc_adjs = {}

    for sent in doc.sents:
        sentence_text = sent.text
        # Collect the adjectives themselves, first token included, rather
        # than the word that precedes each adjective.
        adjectives = [token.text for token in sent if token.pos_ == 'ADJ']

        for term in medical_terms:
            # Accumulate across sentences; assigning a fresh list per
            # sentence would discard everything except the last sentence.
            found = assoc_adjs.setdefault(term, [])
            for adj in adjectives:
                lowered = adj.lower()
                if adj + ' ' + term in sentence_text and lowered not in found:
                    found.append(lowered)

    return assoc_adjs
| |
|
def replace_medical_terms(medical_terms, assoc_adjs):
    """Prepend each term's associated adjectives to the term.

    Args:
        medical_terms: List of terms; updated in place.
        assoc_adjs: Mapping from term to its list of adjectives.

    Returns:
        The same ``medical_terms`` list. Terms with a non-empty adjective
        list become ``"<adj> <adj> ... <term>"``; all others are untouched.
    """
    for position, name in enumerate(medical_terms):
        modifiers = assoc_adjs.get(name)
        if not modifiers:
            # Unknown term or no adjectives recorded: leave it as it is.
            continue
        medical_terms[position] = " ".join([*modifiers, name])
    return medical_terms
| |
|
def medical_term(reftext):
    """Run the full extraction pipeline on ``reftext``.

    Steps, in order: clean the text, extract entity terms, drop terms that
    are only an adjective, attach adjectives found in front of each term,
    and finally prefix negated terms with their negation word.

    Args:
        reftext: Raw input text.

    Returns:
        The list of processed terms.
    """
    cleaned = clean_text(reftext)
    terms = remove_single_adj(extract_medical_terms(cleaned))
    adjectives_by_term = extract_associated_adjectives(cleaned, terms)
    terms = replace_medical_terms(terms, adjectives_by_term)
    return add_negations(terms, cleaned)
| |
|