# File size: 4,769 Bytes
# 0f8411f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import spacy
import re
# from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
import nltk

# Both spaCy pipelines are loaded once, at import time, and shared by the
# functions in this module.
# `nlp` is the pipeline extract_medical_terms runs to obtain entities.
nlp = spacy.load("en_core_sci_md")
# `nlp2` is the pipeline remove_single_adj and extract_associated_adjectives
# run for part-of-speech tags and sentence boundaries.
nlp2 = spacy.load("en_core_web_sm")

def check_word_in_sentence(word_list, sentence):
    """Return the entries of ``word_list`` that appear as tokens of ``sentence``.

    The sentence is lower-cased and tokenized with
    ``nltk.tokenize.word_tokenize``; each candidate is compared to the
    resulting tokens exactly as given (candidates are not lower-cased).

    Args:
        word_list: Candidate words to look for.
        sentence: Sentence text to search.

    Returns:
        A new list holding the matching candidates, in ``word_list`` order.
    """
    sentence_tokens = nltk.tokenize.word_tokenize(sentence.lower())
    return [candidate for candidate in word_list if candidate in sentence_tokens]

def remove_special_characters(string):
    """Drop every character that is not an ASCII letter, whitespace, ``.``, ``!`` or ``?``.

    Digits, punctuation other than the three end-of-sentence marks, and
    non-ASCII characters are all removed.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text, or ``None`` when ``string`` is not something
        ``re.sub`` can search with a str pattern (for example ``None``,
        a number, or ``bytes``).
    """
    # Anything outside letters, whitespace and . ! ? is treated as "special".
    pattern = r"[^a-zA-Z\s.!?]+"

    try:
        return re.sub(pattern, "", string)
    except TypeError:
        # Only the "wrong input type" failure is tolerated; the historical
        # best-effort contract of returning None is kept, but unrelated
        # errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return None

def check_negation(sentence, entity, negations):
    """Tell whether a negation word occurs within three words before ``entity``.

    The sentence is split on whitespace and only the first token equal to
    ``entity`` is considered, so ``entity`` must be a single
    whitespace-delimited word for a match to be possible.

    Args:
        sentence: Sentence text to inspect.
        entity: The single word to locate in the sentence.
        negations: Collection of negation words to test membership against.

    Returns:
        ``True`` if any of the up-to-three tokens immediately preceding the
        first occurrence of ``entity`` is in ``negations``; ``False``
        otherwise, including when ``entity`` is not a token of the sentence.
    """
    tokens = sentence.split()
    try:
        entity_index = tokens.index(entity)
    except ValueError:
        # list.index signals "not found" with ValueError; nothing else is
        # expected here, so nothing else is caught.
        return False

    # Up to three tokens directly before the entity (fewer near the start).
    preceding = tokens[max(entity_index - 3, 0):entity_index]
    return any(token in negations for token in preceding)

def clean_text(text):
    """Normalize raw text: strip special characters, drop newlines, lower-case.

    Args:
        text: Raw text to normalize.

    Returns:
        The lower-cased text with special characters and ``\\n`` removed.
    """
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # two words separated only by a line break end up joined together.
    # NOTE(review): remove_special_characters can return None, in which case
    # the .replace call below raises AttributeError.
    return remove_special_characters(text).replace('\n', '').lower()

def extract_medical_terms(text):
    """Return the text of every entity labelled ``ENTITY`` found in ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    entities are returned in the order the pipeline reports them.

    Args:
        text: Text to analyse.

    Returns:
        A list of entity strings (possibly empty).
    """
    accepted_labels = ["ENTITY"]
    parsed = nlp(text)
    return [entity.text for entity in parsed.ents if entity.label_ in accepted_labels]

def add_negations(medical_terms, text):
    """Prefix negated medical terms with a negation word from their sentence.

    ``text`` is lower-cased and split into sentences on ``.``, ``?`` and
    ``!``. For every sentence that contains at least one negation word, each
    term that occurs in the sentence and whose first word is preceded (within
    three words, per ``check_negation``) by a negation is rewritten as
    ``"<negation> <term>"``.

    Args:
        medical_terms: List of term strings; it is modified in place.
        text: The report text the terms were extracted from.

    Returns:
        The same ``medical_terms`` list object, after in-place rewriting.
    """
    # NOTE(review): check_negation compares whitespace-split single tokens,
    # so the multi-word entries ("may not", "ought not to", ...) presumably
    # never match there - confirm whether they are still wanted.
    negations = [
        'not', 'never', 'no', 'neither', 'none', 'nobody', 'nowhere',
        'nothing', 'without', "don't", "can't", "won't", "isn't", "aren't",
        "wasn't", "weren't", "haven't", "hasn't", "hadn't", "shouldn't",
        "wouldn't", "couldn't", "doesn't", "didn't", "may not", "might not",
        "need not", "mustn't", "shall not", "will not", "ought not to",
        "nevermore", "rarely", "scarcely",
    ]

    for sentence in re.split('[.?!]', text.lower()):
        found_negations = check_word_in_sentence(negations, sentence)
        if not found_negations:
            continue

        # Index-based rewrite: looking the term up again with list.index
        # could hit an earlier equal entry instead of the current one.
        for position, term in enumerate(medical_terms):
            term_words = term.split()
            # An empty/blank term has no first word to anchor the check on.
            if not term_words or term not in sentence:
                continue
            if check_negation(sentence, term_words[0], negations):
                # The prefix is the first negation of the sentence in
                # `negations` order, not necessarily the one next to the term.
                medical_terms[position] = found_negations[0] + ' ' + term

    return medical_terms

def remove_single_adj(medical_terms):
    """Drop terms that are only an adjective, or a determiner plus an adjective.

    Each term is lower-cased and tagged with the module-level ``nlp2``
    pipeline. A term is discarded when it parses to exactly one token tagged
    ``ADJ``, or exactly two tokens tagged ``DET`` then ``ADJ``.

    Args:
        medical_terms: List of term strings; it is filtered in place.

    Returns:
        The same ``medical_terms`` list object, containing only kept terms.
    """
    def _is_bare_adjective(term):
        # True for "<ADJ>" or "<DET> <ADJ>" terms, False for everything else.
        parsed = nlp2(term.lower())
        if len(parsed) == 1:
            return parsed[0].pos_ == "ADJ"
        if len(parsed) == 2:
            return parsed[0].pos_ == "DET" and parsed[1].pos_ == "ADJ"
        return False

    # Build the filtered list first and write it back through a slice:
    # calling list.remove while looping over the same list makes the loop
    # skip the element that follows every removal.
    medical_terms[:] = [term for term in medical_terms if not _is_bare_adjective(term)]
    return medical_terms

def extract_associated_adjectives(text, medical_terms):
    """Collect, per medical term, the words found directly in front of it.

    ``text`` is parsed with the module-level ``nlp2`` pipeline. In each
    sentence a list of candidate words is built, and a candidate is
    associated with a term when ``"<candidate> <term>"`` is a substring of the
    sentence text. Matches are accumulated over all sentences (a later
    sentence no longer wipes out what an earlier one found) and each
    lower-cased word is recorded once per term, in first-seen order.

    Args:
        text: Report text to parse.
        medical_terms: Iterable of term strings to look for.

    Returns:
        A dict mapping each term to its list of lower-cased associated words.
        The dict is empty when the parsed text yields no sentences.
    """
    doc = nlp2(text)

    assoc_adjs = {}

    for sent in doc.sents:
        tokens = list(sent)
        # NOTE(review): this keeps the token *preceding* each ADJ-tagged
        # token, not the ADJ token itself - confirm that is the intent.
        candidates = [
            previous.text
            for previous, current in zip(tokens, tokens[1:])
            if current.pos_ == 'ADJ'
        ]
        sentence_text = sent.text

        for term in medical_terms:
            # setdefault keeps matches from earlier sentences and still
            # guarantees every term gets a (possibly empty) entry.
            collected = assoc_adjs.setdefault(term, [])
            for candidate in candidates:
                lowered = candidate.lower()
                if candidate + ' ' + term in sentence_text and lowered not in collected:
                    collected.append(lowered)
    return assoc_adjs

def replace_medical_terms(medical_terms, assoc_adjs):
    """Prepend each term's associated adjectives to the term, in place.

    Args:
        medical_terms: List of term strings; it is modified in place.
        assoc_adjs: Mapping from a term to its list of adjective strings.

    Returns:
        The same ``medical_terms`` list object. A term with a non-empty
        adjective list becomes ``"<adj1> <adj2> ... <term>"``; all other
        terms are left unchanged.
    """
    for position, term in enumerate(medical_terms):
        adjectives = assoc_adjs.get(term)
        # Missing entry and empty list are treated alike: nothing to prepend.
        if not adjectives:
            continue
        medical_terms[position] = " ".join(adjectives) + ' ' + term
    return medical_terms

def medical_term(reftext):
    """Run the full extraction pipeline on ``reftext`` and return its medical terms.

    Steps, in order: clean the text, extract entity terms, drop bare-adjective
    terms, attach associated adjectives, then prefix negated terms.

    Args:
        reftext: Raw report text.

    Returns:
        The list of processed medical-term strings.
    """
    cleaned = clean_text(reftext)
    terms = remove_single_adj(extract_medical_terms(cleaned))
    adjectives_by_term = extract_associated_adjectives(cleaned, terms)
    terms = replace_medical_terms(terms, adjectives_by_term)
    return add_negations(terms, cleaned)