|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import re |
|
|
import fitz |
|
|
import spacy |
|
|
from config import ( |
|
|
keywords, |
|
|
numeric_regex, |
|
|
exclude_brackets_regex, |
|
|
date_regex, |
|
|
table_regex, |
|
|
gender_regex, |
|
|
age_regex, |
|
|
author_pattern, |
|
|
exclude_words, |
|
|
key_sections, |
|
|
follow_up |
|
|
) |
|
|
|
|
|
|
|
|
# Load the small English spaCy pipeline once at import time; used for
# PERSON named-entity recognition in extract_authors().
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
def normalize_text(text):
    """Normalize PDF-extracted text and collapse it onto one line.

    Repairs common extraction artifacts: punctuation glued to the next
    word, stray periods inside words, dotted dates, digits fused to
    letters, missing sentence breaks, and hyphenation split across
    lines.

    Args:
        text: Raw text, typically from PyMuPDF's page.get_text().

    Returns:
        A single whitespace-normalized string.
    """
    # "word.next" -> "word. next": space after punctuation glued to a word.
    text = re.sub(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3', text)
    # Drop a stray period sitting between two lowercase letters.
    text = re.sub(r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4', text)
    # Dotted numeric triples become dashes, e.g. "12.05.2021" -> "12-05-2021".
    text = re.sub(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5', text)
    # Insert a space between a letter (optionally followed by dots) and a digit.
    text = re.sub(r'([a-z])([\.]*)([0-9])', r'\1\2 \3', text)
    # A lowercase-to-uppercase jump inside a word marks a missed sentence break.
    text = re.sub(r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4', text)
    # A newline between lowercase/digit and uppercase is a sentence boundary.
    # BUG FIX: the original template r'\1\. \3' emitted a literal backslash
    # before the period (replacement templates pass '\.' through verbatim);
    # the intended template is r'\1. \3', matching the rule above.
    text = re.sub(r'([a-z0-9])([\n]+)([A-Z])', r'\1. \3', text)
    # Collapse runs of periods (possibly whitespace-separated) into one.
    text = re.sub(r'(\.)([\s]*)([\.]+)', r'\1', text)
    # Re-join hyphenated splits: "exam - ple" -> "exam-ple".
    text = re.sub(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5', text)

    return " ".join(line.strip() for line in text.splitlines())
|
|
|
|
|
def extract_sentences(text):
    """Split text into sentences.

    Splits on whitespace that follows '.', '?', or '!', while skipping
    boundaries that look like abbreviations or initials.
    """
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s'
    return re.split(sentence_boundary, text)
|
|
|
|
|
def contains_valid_numeric(sentence):
    """Return True when the sentence holds at least one numeric value
    that is not fully accounted for by bracketed (citation-style)
    numbers."""
    numeric_hits = numeric_regex.findall(sentence)
    if not numeric_hits:
        return False
    bracketed_hits = exclude_brackets_regex.findall(sentence)
    return len(numeric_hits) != len(bracketed_hits)
|
|
|
|
|
def matches_criteria(sentence, check_time_duration=False):
    """Check whether a sentence matches the extraction criteria.

    Sentences that look like dates or table rows are always rejected.
    When check_time_duration is True, only the time-duration test is
    applied; otherwise the sentence must contain a valid numeric value
    plus at least one demographic/clinical signal (gender, age,
    patients, participants, inclusion/exclusion, co-morbidities, time
    duration, remark, intervention, study type, or country keyword).

    Args:
        sentence: The sentence to evaluate.
        check_time_duration: Restrict matching to time-duration mentions.

    Returns:
        bool: True when the sentence passes the active criteria.
    """
    # Reject date-like lines and table-like rows outright.
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    # "<number or number-word> year(s)/month(s)/week(s)/day(s)".
    contains_time_duration = bool(re.search(
        r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
        r"(?:years|year|weeks|week|months|month|days|day)\b",
        sentence, re.IGNORECASE
    ))

    # Time-duration-only mode: skip every other criterion.
    if check_time_duration:
        return contains_time_duration

    # A sentence without a usable numeric value can never match.
    if not contains_valid_numeric(sentence):
        return False

    contains_gender = bool(gender_regex.search(sentence))

    # "<age> years / year-old / aged ..." (optionally an en-dash range).
    contains_age_and_numeric = bool(re.search(
        r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
        sentence, re.IGNORECASE
    ))

    # "<count> patients/cases/subjects".
    contains_patients_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b",
        sentence, re.IGNORECASE
    ))

    # "<count> participants/attendees/respondents/volunteers".
    contains_participants_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b",
        sentence, re.IGNORECASE
    ))

    # "<count> inclusion / eligibility criteria ..." and exclusion variants.
    contains_inclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b",
        sentence, re.IGNORECASE
    ))
    contains_exclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b",
        sentence, re.IGNORECASE
    ))

    # Lower-case once for all keyword-group membership tests.
    lowered = sentence.lower()
    contains_comorbidities = any(kw in lowered for kw in keywords["Co-morbidities"])
    contains_remark = any(kw in lowered for kw in keywords["Remark"])
    contains_intervention = any(kw in lowered for kw in keywords["Intervention Groups"])
    contains_study_type = any(kw in lowered for kw in keywords["Study Types"])
    contains_country = any(kw in lowered for kw in keywords["Country"])
    # NOTE(review): the original also computed keywords["Race/Ethnicity"]
    # membership but never used it in the final decision; that behavior is
    # preserved here — confirm whether race was meant to be a criterion.

    return (
        contains_gender
        or contains_age_and_numeric
        or contains_patients_and_numeric
        or contains_participants_and_numeric
        or contains_inclusion_and_numeric
        or contains_exclusion_and_numeric
        or contains_comorbidities
        or contains_time_duration
        or contains_remark
        or contains_intervention
        or contains_study_type
        or contains_country
    )
|
|
|
|
|
def matches_keyword(sentence, user_keywords):
    """Return True when any user keyword occurs in the sentence
    (case-insensitive substring match)."""
    lowered = sentence.lower()
    for keyword in user_keywords:
        if keyword.lower() in lowered:
            return True
    return False
|
|
|
|
|
def extract_authors(page):
    """Extract candidate author names from the text preceding the first
    known section header on the page.

    Combines regex matches (author_pattern) with spaCy PERSON entities,
    then filters out names found in the exclude_words list.
    """
    full_text = page.get_text()

    # Author names are expected before the earliest key-section header;
    # if no header is present, search the whole page.
    header_offsets = [
        pos for pos in (full_text.find(section) for section in key_sections)
        if pos != -1
    ]
    text_to_search = full_text[:min(header_offsets)] if header_offsets else full_text

    # Regex-based candidates.
    regex_names = re.findall(author_pattern, text_to_search)

    # NER-based candidates (PERSON entities only).
    ner_names = [ent.text for ent in nlp(text_to_search).ents if ent.label_ == "PERSON"]

    # Deduplicate and drop known non-name words.
    candidates = set(regex_names) | set(ner_names)
    return [name for name in candidates if name.lower() not in exclude_words]
|
|
|
|
|
def highlight_keywords(sentence, user_keywords):
    """Wrap every keyword occurrence in the sentence with <mark> tags.

    Matching is case-insensitive. Single-word keywords are matched on
    word boundaries (so "age" does not light up inside "average");
    multi-word phrases are matched verbatim.

    Args:
        sentence: The sentence to annotate.
        user_keywords: Keywords/phrases to highlight.

    Returns:
        The sentence with matches wrapped in <mark>...</mark>.
    """
    # BUG FIX: blank keywords previously produced an empty regex
    # alternative that matched the empty string, scattering empty
    # <mark></mark> pairs through the output — strip and drop them.
    cleaned = [kw.strip() for kw in (user_keywords or [])]
    cleaned = [kw for kw in cleaned if kw]
    if not cleaned:
        return sentence

    single_words = [re.escape(kw) for kw in cleaned if ' ' not in kw]
    phrases = [re.escape(kw) for kw in cleaned if ' ' in kw]

    patterns = []
    if single_words:
        # Word boundaries prevent partial-word highlights.
        patterns.append(r'\b(?:' + '|'.join(single_words) + r')\b')
    if phrases:
        patterns.append(r'(?:' + '|'.join(phrases) + r')')

    combined_pattern = re.compile('|'.join(patterns), re.IGNORECASE)

    def replacer(match):
        return f"<mark>{match.group(0)}</mark>"

    return combined_pattern.sub(replacer, sentence)
|
|
|
|
|
def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process a PDF file: extract authors from the first page, collect
    sentences matching the criteria from every page, optionally filter
    by user keywords, and highlight keyword occurrences.

    Args:
        file_path: Path to the PDF file.
        user_keywords: Keywords used for filtering and highlighting.
        check_time_duration: Restrict criteria to time-duration mentions.

    Returns:
        (highlighted_sentences, authors_str) tuple.
    """
    doc = fitz.open(file_path)
    try:
        authors_str = ', '.join(extract_authors(doc[0]))

        all_extracted_sentences = []
        for page in doc:
            text = normalize_text(page.get_text())
            all_extracted_sentences.extend(
                sentence.strip()
                for sentence in extract_sentences(text)
                if matches_criteria(sentence, check_time_duration)
            )
    finally:
        # BUG FIX: close the document even if extraction raises; the
        # original leaked the handle on any mid-processing exception.
        doc.close()

    if check_time_duration:
        # Duration mode keeps every criteria match regardless of keywords.
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [
            sentence for sentence in all_extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    highlighted_sentences = [
        highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences
    ]
    return highlighted_sentences, authors_str
|
|
|
|
|
def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Process raw input text: extract sentences matching the criteria,
    optionally filter by user keywords, and highlight keyword matches.

    Returns:
        (highlighted_sentences, placeholder_authors_message) tuple.
    """
    sentences = extract_sentences(normalize_text(input_text))
    extracted = [s.strip() for s in sentences if matches_criteria(s, check_time_duration)]

    # In time-duration mode, keep every criteria match; otherwise
    # require at least one user keyword per sentence.
    if check_time_duration:
        filtered = extracted
    else:
        filtered = [s for s in extracted if matches_keyword(s, user_keywords)]

    highlighted = [highlight_keywords(s, user_keywords) for s in filtered]
    return highlighted, "Authors not extracted from text input."
|
|
|
|
|
def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Handle a request from the Gradio interface: assemble the keyword
    list, process the PDF or raw text, and return HTML containing the
    authors line followed by the highlighted sentences.

    Returns:
        An HTML string, or a plain error message when input/keywords
        are missing or nothing matched.
    """
    user_keywords = []
    if keyword_group:
        # BUG FIX: copy the config list. The original aliased
        # keywords[keyword_group] and then extend()-ed it in place,
        # permanently polluting the shared group across requests.
        user_keywords = list(keywords.get(keyword_group, []))
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if extracted_sentences:
        # Authors header first, then one paragraph per sentence.
        parts = [f"<p><b>Authors:</b> {authors_str}</p>"]
        parts.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
        return "".join(parts)

    return "No matching sentences found."
|
|
|
|
|
|
|
|
# Gradio UI: a single-function interface wrapping handle_input().
iface = gr.Interface(
    fn=handle_input,
    # Components map positionally onto handle_input(file_path, input_text,
    # keyword_group, custom_keywords, ...); keep the order in sync.
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # NOTE(review): handle_input also accepts time_duration, but no
        # component is wired to it here, so it always defaults to False
        # — confirm whether a checkbox was intended.
    ],
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
    <div style='text-align: left;'>
    Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
    Supervisor: Dr. Tanmay Basu<br>
    Indian Institute of Science Education and Research<br>
    </div>
    <div style='text-align: center; margin-top: 10px;'>
    <b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
    </div>
    """,
    examples=None,
    allow_flagging="never",
    # NOTE(review): cache_examples=True has no effect while examples=None
    # — confirm whether example inputs were meant to be supplied.
    cache_examples=True,
    # Styles the <mark> tags emitted by highlight_keywords().
    css="""
    mark {
    background-color: blue;
    padding: 0;
    border-radius: 2px;
    }
    /* Optional: Adjust paragraph spacing */
    p {
    margin-bottom: 10px;
    }
    """
)

# share=True additionally exposes a public Gradio link alongside the
# local server.
iface.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|