# sumit4352's picture
# Upload 2 files
# ae4fa62 verified
# prac.py
import html
import re

import fitz # PyMuPDF
import gradio as gr
import spacy

from config import (
    keywords,
    numeric_regex,
    exclude_brackets_regex,
    date_regex,
    table_regex,
    gender_regex,
    age_regex,
    author_pattern,
    exclude_words,
    key_sections,
    follow_up
)
# Load spaCy's "en_core_web_sm" English pipeline once at import time;
# extract_authors() runs it to pick out PERSON entities.
nlp = spacy.load("en_core_web_sm")
def normalize_text(text):
    """Clean up raw extracted text so it can be split into sentences.

    A fixed sequence of regex substitutions repairs spacing and punctuation
    (the order matters: later rules operate on the output of earlier ones).
    Afterwards every line is stripped and the lines are joined with single
    spaces.

    Args:
        text: Raw text, e.g. the text of one PDF page.

    Returns:
        The normalized text on a single line.
    """
    substitutions = (
        # Space between a run of delimiters and the letter that follows it.
        (r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3'),
        # Replace a '.' between two lowercase letters with a space,
        # e.g. "et al. xxx".
        (r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4'),
        # Join three dot-separated numbers with hyphens, e.g. 000.55.66.
        (r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5'),
        # Space between a lowercase letter (plus any dots) and a digit.
        (r'([a-z])([\.]*)([0-9])', r'\1\2 \3'),
        # Put ". " between a lowercase/digit run and an uppercase letter fused
        # onto it, e.g. "drains removed by day threeHe continued to".
        (r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4'),
        # Put ". " between a lowercase letter/digit and an uppercase letter
        # separated only by newlines, e.g. "xxx5 \n Yyy".  The replacement
        # uses a plain '.': an escaped '\.' in a replacement template is kept
        # verbatim and would leak a literal backslash into the text.
        (r'([a-z0-9])([\n]+)([A-Z])', r'\1. \3'),
        # Collapse repeated '.' (optionally separated by whitespace).
        (r'(\.)([\s]*)([\.]+)', r'\1'),
        # Close up hyphenated words, e.g. "trans - anethole" -> "trans-anethole".
        (r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return " ".join(line.strip() for line in text.splitlines())
def extract_sentences(text):
    """Break *text* into sentence-like pieces.

    A split happens at a whitespace character that directly follows '.', '?'
    or '!', unless one of the two negative lookbehinds rejects the position
    (for instance a capital letter, a lowercase letter and a dot, as in "Dr.").
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    return boundary.split(text)
def contains_valid_numeric(sentence):
    """Report whether *sentence* has a usable numeric value.

    True when ``numeric_regex`` finds at least one match and the number of
    those matches differs from the number of ``exclude_brackets_regex``
    matches (both patterns come from ``config``).
    """
    numeric_hits = numeric_regex.findall(sentence)
    if not numeric_hits:
        return False
    bracketed_hits = exclude_brackets_regex.findall(sentence)
    return len(numeric_hits) != len(bracketed_hits)
# A number (digits or a spelled-out number up to twelve) followed by a time unit.
_TIME_DURATION_RE = re.compile(
    r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
    + r"(?:years|year|weeks|week|months|month|days|day)\b",
    re.IGNORECASE
)

# A number directly followed by a demographic / eligibility keyword.
# Compiled once at import time instead of on every call.
_NUMERIC_CONTEXT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Age: number (or en-dash range) + age-related word, e.g. "45 years".
        r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
        # Patients.
        r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b",
        # Participants.
        r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b",
        # Inclusion.
        r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b",
        # Exclusion.
        r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b",
    )
)

# Keyword groups (keys of config.keywords) matched as plain lowercase substrings.
# NOTE(review): "Race/Ethnicity" used to be looked up here but its result was
# never part of the returned expression; it is left out to keep results
# unchanged -- confirm whether it should be a criterion.
_SUBSTRING_KEYWORD_GROUPS = (
    "Co-morbidities",
    "Remark",
    "Intervention Groups",
    "Study Types",
    "Country",
)


def matches_criteria(sentence, check_time_duration=False):
    """Check if a sentence matches any of the defined extraction criteria.

    Args:
        sentence: The sentence to test.
        check_time_duration: When True, only the time-duration pattern
            (number + year/week/month/day) decides the result.

    Returns:
        False if ``date_regex`` finds a match anywhere in the sentence or
        ``table_regex`` matches at its start.  Otherwise, in time-duration
        mode, whether the sentence contains a time duration.  Otherwise True
        only if ``contains_valid_numeric`` accepts the sentence and at least
        one of these holds: a time duration, a ``gender_regex`` match, a
        number followed by an age/patient/participant/inclusion/exclusion
        keyword, or a keyword from one of the substring keyword groups.
    """
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    contains_time_duration = bool(_TIME_DURATION_RE.search(sentence))
    if check_time_duration:
        # Nothing else matters in this mode, so skip the remaining checks.
        return contains_time_duration

    # Every remaining criterion additionally requires a valid numeric value.
    if not contains_valid_numeric(sentence):
        return False

    if contains_time_duration or gender_regex.search(sentence):
        return True
    if any(pattern.search(sentence) for pattern in _NUMERIC_CONTEXT_PATTERNS):
        return True

    lowered = sentence.lower()  # lowercase once for all keyword groups
    return any(
        kw in lowered
        for group in _SUBSTRING_KEYWORD_GROUPS
        for kw in keywords[group]
    )
def matches_keyword(sentence, user_keywords):
    """Return True if any user keyword occurs in *sentence*, ignoring case.

    Matching is a plain substring test; an empty keyword list yields False.
    """
    for candidate in user_keywords:
        if candidate.lower() in sentence.lower():
            return True
    return False
def extract_authors(page):
    """Extract authors' names from the text above the first key section.

    The page text is cut at the earliest occurrence of any entry of
    ``key_sections`` (the whole text is used when none occurs).  Names are
    collected from ``author_pattern`` regex matches and from spaCy PERSON
    entities, de-duplicated, and filtered against ``exclude_words``.

    Args:
        page: Object exposing ``get_text()`` (a PDF page in ``process_file``).

    Returns:
        List of unique names in first-seen order (regex matches first, then
        spaCy entities), so repeated runs give the same ordering.
    """
    full_text = page.get_text()

    # Positions of the key sections that actually occur in the text.
    found_positions = [
        position
        for position in (full_text.find(section) for section in key_sections)
        if position != -1
    ]
    # Only the text above the earliest section is searched for names.
    text_to_search = full_text[:min(found_positions)] if found_positions else full_text

    # Find author names using regex
    author_matches = re.findall(author_pattern, text_to_search)

    # Use NLP to further refine author name extraction
    doc = nlp(text_to_search)
    nlp_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # dict.fromkeys drops duplicates while keeping first-seen order; a set
    # would make the order of the returned names vary between runs.
    unique_names = dict.fromkeys(author_matches + nlp_names)
    return [name for name in unique_names if name.lower() not in exclude_words]
def highlight_keywords(sentence, user_keywords):
    """Return *sentence* as HTML with user keywords wrapped in <mark> tags.

    Keywords without a space are matched as whole words; keywords containing
    a space are matched as plain substrings.  Matching is case-insensitive.
    Longer keywords take precedence over shorter ones starting at the same
    position, so "heart failure" is highlighted as a whole even when "heart"
    is also a keyword.

    The sentence comes from uploaded documents or user input and the result
    is rendered as HTML, so everything outside the inserted <mark> tags is
    HTML-escaped ('&', '<' and '>').

    Args:
        sentence: Plain-text sentence.
        user_keywords: Iterable of keyword strings; may be empty or None.
            Blank keywords are ignored.

    Returns:
        An HTML-safe string with matches wrapped in ``<mark>...</mark>``.
    """
    # A blank keyword would compile to a pattern matching the empty string
    # and scatter empty <mark></mark> pairs through the output.
    usable = [kw for kw in (user_keywords or []) if kw and kw.strip()]
    if not usable:
        return html.escape(sentence, quote=False)

    # Regex alternation picks the first alternative that matches at a given
    # position, so list longer keywords first.
    usable.sort(key=len, reverse=True)
    alternatives = [
        re.escape(kw) if ' ' in kw else r'\b' + re.escape(kw) + r'\b'
        for kw in usable
    ]
    combined_pattern = re.compile('|'.join(alternatives), re.IGNORECASE)

    # Match on the raw text, then escape each piece separately so the
    # <mark> tags themselves are not escaped.
    pieces = []
    cursor = 0
    for match in combined_pattern.finditer(sentence):
        pieces.append(html.escape(sentence[cursor:match.start()], quote=False))
        pieces.append(f"<mark>{html.escape(match.group(0), quote=False)}</mark>")
        cursor = match.end()
    pieces.append(html.escape(sentence[cursor:], quote=False))
    return "".join(pieces)
def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process the PDF file and extract sentences based on criteria,
    then filter by user keywords and highlight them.

    Args:
        file_path: Path of the document to open with PyMuPDF.
        user_keywords: Keywords used for filtering and highlighting.
        check_time_duration: Passed to ``matches_criteria``; when True the
            keyword filter is skipped and every extracted sentence is kept.

    Returns:
        Tuple ``(highlighted_sentences, authors_str)`` where ``authors_str``
        is the comma-separated list of names found on the first page (empty
        when the document has no pages).
    """
    all_extracted_sentences = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(file_path) as doc:
        # Guard against a document without pages before indexing doc[0].
        author_names = extract_authors(doc[0]) if len(doc) else []
        for page in doc:
            sentences = extract_sentences(normalize_text(page.get_text()))
            all_extracted_sentences.extend(
                sentence.strip()
                for sentence in sentences
                if matches_criteria(sentence, check_time_duration)
            )
    authors_str = ', '.join(author_names)

    if check_time_duration:
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [
            sentence
            for sentence in all_extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    # Highlight keywords in the filtered sentences
    highlighted_sentences = [
        highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences
    ]
    return highlighted_sentences, authors_str
def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Run the extraction pipeline on pasted text instead of a file.

    The text is normalized, split into sentences and filtered with
    ``matches_criteria``; unless ``check_time_duration`` is set, only
    sentences containing a user keyword are kept.  The survivors are passed
    through ``highlight_keywords``.  Returns the highlighted sentences plus a
    fixed message in place of an author list.
    """
    candidates = []
    for raw_sentence in extract_sentences(normalize_text(input_text)):
        if matches_criteria(raw_sentence, check_time_duration):
            candidates.append(raw_sentence.strip())

    if check_time_duration:
        selected = candidates
    else:
        selected = [s for s in candidates if matches_keyword(s, user_keywords)]

    marked_up = [highlight_keywords(s, user_keywords) for s in selected]
    return marked_up, "Authors not extracted from text input."
def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Handle user input from the Gradio interface,
    process the file or text, and return highlighted sentences with authors.

    Args:
        file_path: Path of an uploaded file; takes precedence over text.
        input_text: Text typed or pasted by the user.
        keyword_group: Key of ``keywords`` selecting a predefined group.
        custom_keywords: Comma-separated extra keywords.
        time_duration: Passed through as the time-duration mode flag.

    Returns:
        An HTML string (authors paragraph followed by one paragraph per
        sentence), or a plain message when keywords, input or matches are
        missing.
    """
    # Copy the configured group: extending the list returned by
    # keywords.get() would permanently add custom keywords to the shared
    # config and leak them into every later request.
    user_keywords = list(keywords.get(keyword_group, [])) if keyword_group else []
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if not extracted_sentences:
        return "No matching sentences found."

    # Author names come from the uploaded document, so escape them before
    # embedding in HTML.  Sentences are inserted as-is because they already
    # carry <mark> markup.
    paragraphs = [f"<p><b>Authors:</b> {html.escape(authors_str, quote=False)}</p>"]
    paragraphs.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
    return "".join(paragraphs)
# Gradio Interface

# HTML shown under the title: credits block followed by usage instructions.
_DESCRIPTION_HTML = """
<div style='text-align: left;'>
Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
Supervisor: Dr. Tanmay Basu<br>
Indian Institute of Science Education and Research<br>
</div>
<div style='text-align: center; margin-top: 10px;'>
<b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
</div>
"""

# Styling for the <mark> highlights and paragraph spacing in the output.
_CUSTOM_CSS = """
mark {
background-color: blue;
padding: 0;
border-radius: 2px;
}
/* Optional: Adjust paragraph spacing */
p {
margin-bottom: 10px;
}
"""

# One component per handle_input parameter, in order: file_path, input_text,
# keyword_group, custom_keywords.  No time-duration checkbox is exposed, so
# handle_input's time_duration keeps its default of False.
_INPUT_COMPONENTS = [
    gr.File(label="Upload PDF or Text File", type="filepath"),
    gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
    gr.Radio(
        choices=list(keywords.keys()),
        label="Information related to..."
    ),
    gr.Textbox(
        label="Enter Custom Keywords",
        placeholder="e.g., migraine, headache"
    ),
]

iface = gr.Interface(
    fn=handle_input,
    inputs=_INPUT_COMPONENTS,
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description=_DESCRIPTION_HTML,
    examples=None,  # example files or texts can be supplied here
    allow_flagging="never",
    cache_examples=True,
    css=_CUSTOM_CSS,
)

iface.launch(share=True)