File size: 12,955 Bytes
ae4fa62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# prac.py

import gradio as gr
import re
import fitz  # PyMuPDF                                            
import spacy
from config import (
    keywords,
    numeric_regex,
    exclude_brackets_regex,
    date_regex,
    table_regex,
    gender_regex,
    age_regex,
    author_pattern,
    exclude_words,
    key_sections,
    follow_up
)

# Load spaCy's small English model, used for PERSON named-entity recognition
# in extract_authors(). Requires `python -m spacy download en_core_web_sm`.
nlp = spacy.load("en_core_web_sm")

def normalize_text(text):
    """Normalize PDF-extracted text.

    Repairs common extraction artifacts (missing spaces around punctuation,
    words glued across line breaks, hyphenated words split by spaces) and
    collapses the result into a single whitespace-trimmed line.
    """
    # Space between a punctuation run and the following letter, e.g. "word.Next" -> "word. Next".
    text = re.sub(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3', text)
    # Remove '.' between two lowercase words, e.g. "et al. xxx" -> "et al xxx".
    text = re.sub(r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4', text)
    # Rewrite dotted numeric triples with dashes, e.g. "000.55.66" -> "000-55-66".
    text = re.sub(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5', text)
    # Space between a letter (plus optional dots) and a digit.
    text = re.sub(r'([a-z])([\.]*)([0-9])', r'\1\2 \3', text)
    # Insert '. ' where a lowercase/digit run is glued to an uppercase word,
    # e.g. "day threeHe continued" -> "day three. He continued".
    text = re.sub(r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4', text)
    # Replace a newline between a lowercase letter/digit and an uppercase letter
    # with '. ', e.g. "xxx5\nYyy" -> "xxx5. Yyy".
    # BUGFIX: the replacement was r'\1\. \3'; re.sub passes the unknown escape
    # '\.' through verbatim, injecting a literal backslash into the text.
    text = re.sub(r'([a-z0-9])([\n]+)([A-Z])', r'\1. \3', text)
    # Collapse runs of periods ("..", ". .") into a single '.'.
    text = re.sub(r'(\.)([\s]*)([\.]+)', r'\1', text)
    # Rejoin hyphenated words split by spaces, e.g. "trans - anethole" -> "trans-anethole".
    text = re.sub(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5', text)
    # Flatten to one line with single spaces between the original lines.
    return " ".join(line.strip() for line in text.splitlines())

def extract_sentences(text):
    """Break normalized text into sentences.

    Splits on whitespace following '.', '?' or '!', while avoiding splits
    after dotted abbreviations (e.g. "e.g.") and title-style tokens ("Dr.").
    """
    sentence_boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    return sentence_boundary.split(text)

def contains_valid_numeric(sentence):
    """Return True when the sentence holds at least one numeric value that
    is not exclusively inside brackets (per the config regexes)."""
    numeric_hits = numeric_regex.findall(sentence)
    if not numeric_hits:
        return False
    bracketed_hits = exclude_brackets_regex.findall(sentence)
    return len(numeric_hits) != len(bracketed_hits)

def matches_criteria(sentence, check_time_duration=False):
    """Decide whether a sentence qualifies for extraction.

    A sentence qualifies when it contains a valid (non-bracketed) numeric
    value together with at least one signal: gender, age, patient/participant
    counts, inclusion/exclusion criteria, co-morbidities, time durations, or
    any of the configured keyword groups. Sentences matching the date or
    table patterns are always rejected.

    When ``check_time_duration`` is True, only the time-duration criterion
    is evaluated and returned.
    """
    # Dates and table-like rows are never extracted.
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    # Time durations: numeric or spelled-out quantity followed by a time unit.
    time_duration_regex = re.compile(
        r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
        + r"(?:years|year|weeks|week|months|month|days|day)\b",
        re.IGNORECASE
    )
    contains_time_duration = bool(time_duration_regex.search(sentence))

    # In time-duration-only mode no other criterion matters, so return early
    # and skip all the remaining regex work.
    if check_time_duration:
        return contains_time_duration

    # Gender: whole-word match only.
    contains_gender = bool(gender_regex.search(sentence))

    # Age: 1-3 digit number (or en-dash range) followed by an age keyword.
    contains_age_and_numeric = bool(re.search(
        r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
        sentence, re.IGNORECASE
    ))

    # Patients/cases/subjects preceded by a count.
    contains_patients_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b",
        sentence, re.IGNORECASE
    ))

    # Participants/attendees/respondents/volunteers preceded by a count.
    contains_participants_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b",
        sentence, re.IGNORECASE
    ))

    # Inclusion and exclusion criteria preceded by a count.
    contains_inclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b",
        sentence, re.IGNORECASE
    ))
    contains_exclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b",
        sentence, re.IGNORECASE
    ))

    # Keyword-group checks; lower-case the sentence once instead of per group.
    sentence_lower = sentence.lower()
    contains_comorbidities = any(kw in sentence_lower for kw in keywords["Co-morbidities"])
    contains_remark = any(kw in sentence_lower for kw in keywords["Remark"])
    contains_intervention = any(kw in sentence_lower for kw in keywords["Intervention Groups"])
    contains_study_type = any(kw in sentence_lower for kw in keywords["Study Types"])
    contains_country = any(kw in sentence_lower for kw in keywords["Country"])
    # NOTE(review): keywords["Race/Ethnicity"] was computed here but never
    # used in the decision below; confirm whether it should be a criterion.

    # A valid numeric value is mandatory; any single content signal suffices.
    return (
        contains_valid_numeric(sentence) and (
            contains_gender
            or contains_age_and_numeric
            or contains_patients_and_numeric
            or contains_participants_and_numeric
            or contains_inclusion_and_numeric
            or contains_exclusion_and_numeric
            or contains_comorbidities
            or contains_time_duration
            or contains_remark
            or contains_intervention
            or contains_study_type
            or contains_country
        )
    )

def matches_keyword(sentence, user_keywords):
    """Case-insensitive substring check: does the sentence mention any
    of the user-specified keywords?"""
    lowered = sentence.lower()
    for keyword in user_keywords:
        if keyword.lower() in lowered:
            return True
    return False

def extract_authors(page):
    """Extract candidate author names from the region above the first
    key-section header (e.g. Abstract) on a PDF page."""
    full_text = page.get_text()

    # Locate each known section header; author names sit above the earliest one.
    found_positions = {}
    for section in key_sections:
        position = full_text.find(section)
        if position != -1:
            found_positions[section] = position

    if found_positions:
        text_to_search = full_text[:min(found_positions.values())]
    else:
        # No header found: fall back to scanning the whole page.
        text_to_search = full_text

    # Candidate names from the configured regex pattern.
    regex_candidates = re.findall(author_pattern, text_to_search)

    # Candidate names from spaCy PERSON entities.
    ner_candidates = [ent.text for ent in nlp(text_to_search).ents if ent.label_ == "PERSON"]

    # Merge both sources, de-duplicate, and drop configured non-name words.
    candidates = set(regex_candidates) | set(ner_candidates)
    return [name for name in candidates if name.lower() not in exclude_words]

def highlight_keywords(sentence, user_keywords):
    """Wrap each occurrence of a user keyword in the sentence with <mark> tags.

    Single-word keywords are matched on word boundaries; multi-word phrases
    are matched verbatim. Matching is case-insensitive.
    """
    if not user_keywords:
        return sentence

    # Split keywords by kind and regex-escape them.
    word_alts = [re.escape(kw) for kw in user_keywords if ' ' not in kw]
    phrase_alts = [re.escape(kw) for kw in user_keywords if ' ' in kw]

    alternatives = []
    if word_alts:
        # Word boundaries prevent matching inside longer words.
        alternatives.append(r'\b(?:' + '|'.join(word_alts) + r')\b')
    if phrase_alts:
        alternatives.append(r'(?:' + '|'.join(phrase_alts) + r')')

    if not alternatives:
        return sentence

    pattern = re.compile('|'.join(alternatives), re.IGNORECASE)
    return pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", sentence)

def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process a PDF file: extract sentences matching the criteria, optionally
    filter by user keywords, and wrap matches in <mark> tags.

    Returns (highlighted_sentences, authors_str).
    """
    doc = fitz.open(file_path)
    try:
        # Authors are assumed to appear on the first page.
        author_names = extract_authors(doc[0])
        authors_str = ', '.join(author_names)

        all_extracted_sentences = []
        for page in doc:
            text = normalize_text(page.get_text())
            for sentence in extract_sentences(text):
                if matches_criteria(sentence, check_time_duration):
                    all_extracted_sentences.append(sentence.strip())
    finally:
        # BUGFIX: close the document even when extraction raises, so the
        # file handle is never leaked.
        doc.close()

    if check_time_duration:
        # Time-duration mode skips keyword filtering.
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [s for s in all_extracted_sentences if matches_keyword(s, user_keywords)]

    highlighted_sentences = [highlight_keywords(s, user_keywords) for s in filtered_sentences]
    return highlighted_sentences, authors_str

def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Process raw input text: extract sentences matching the criteria,
    optionally filter by user keywords, and wrap matches in <mark> tags.
    """
    candidate_sentences = extract_sentences(normalize_text(input_text))
    matching = [s.strip() for s in candidate_sentences if matches_criteria(s, check_time_duration)]

    if check_time_duration:
        # Time-duration mode skips keyword filtering.
        selected = matching
    else:
        selected = [s for s in matching if matches_keyword(s, user_keywords)]

    highlighted = [highlight_keywords(s, user_keywords) for s in selected]
    return highlighted, "Authors not extracted from text input."

def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Handle user input from the Gradio interface: resolve the keyword list,
    process the file or text, and return HTML with authors and highlighted
    sentences (or a plain status message).
    """
    # Resolve the keyword list from the selected group and/or custom input.
    # BUGFIX: copy the config list — the original assigned the shared
    # keywords[group] list directly and then extend()-ed it, permanently
    # mutating the config across requests.
    user_keywords = []
    if keyword_group:
        user_keywords = list(keywords.get(keyword_group, []))
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if extracted_sentences:
        # Assemble the HTML with a single join rather than repeated +=.
        parts = [f"<p><b>Authors:</b> {authors_str}</p>"]
        parts.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
        return "".join(parts)

    return "No matching sentences found."

# Gradio Interface: file upload OR raw-text input, a keyword-group selector
# driven by config.keywords, and a free-form custom-keyword textbox.
# Output is rendered as HTML so <mark> highlighting works.
iface = gr.Interface(
    fn=handle_input,
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        # Radio choices mirror the keyword groups defined in config.keywords.
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        # Comma-separated extra keywords, merged with the selected group.
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # gr.Checkbox(
        #     label="Check Time Duration Criteria",
        #     value=False
        # )
    ],
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
        <div style='text-align: left;'>
            Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
            Supervisor: Dr. Tanmay Basu<br>
            Indian Institute of Science Education and Research<br>
        </div>
        <div style='text-align: center; margin-top: 10px;'>
            <b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
        </div>
    """,
    examples=None,  # You can add example files or texts if desired
    allow_flagging="never",
    cache_examples=True,
    # Custom CSS styling the <mark> tags emitted by highlight_keywords().
    css="""
        mark {
            background-color: blue;
            padding: 0;
            border-radius: 2px;
        }
        /* Optional: Adjust paragraph spacing */
        p {
            margin-bottom: 10px;
        }
    """
)

# share=True exposes a temporary public URL in addition to the local server.
iface.launch(share=True)