File size: 12,955 Bytes

ae4fa62

# prac.py

import html
import re

import fitz  # PyMuPDF
import gradio as gr
import spacy

from config import (
    keywords,
    numeric_regex,
    exclude_brackets_regex,
    date_regex,
    table_regex,
    gender_regex,
    age_regex,
    author_pattern,
    exclude_words,
    key_sections,
    follow_up
)

# Load spaCy's "en_core_web_sm" English pipeline once at import time;
# extract_authors() runs it to pick out PERSON entities.
nlp = spacy.load("en_core_web_sm")

# Ordered (pattern, replacement) rewrite rules applied by normalize_text().
# The order matters: later rules rely on the spacing produced by earlier ones.
_NORMALIZATION_RULES = (
    # Insert a space between a run of delimiters and the letter that follows.
    (re.compile(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])'), r'\1\2 \3'),
    # Replace a '.' sitting between two lowercase letters with a space
    # (e.g. "et al. xxx") so it is not taken for a sentence end.
    (re.compile(r'([a-z])([\.])([\s]*)([a-z])'), r'\1 \3\4'),
    # Turn dotted number triples such as "000.55.66" into "000-55-66".
    (re.compile(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)'), r'\1-\3-\5'),
    # Insert a space between a lowercase letter (plus optional dots) and a digit.
    (re.compile(r'([a-z])([\.]*)([0-9])'), r'\1\2 \3'),
    # Split glued sentences: "... day threeHe continued" -> "... day three. He continued".
    (re.compile(r'(\s)([a-z0-9]+)([A-Z])([\w]+)'), r'\1\2. \3\4'),
    # A line break between a lowercase letter/digit and an uppercase letter is
    # treated as a sentence boundary. The replacement must be a plain '.', an
    # escaped "\." in a replacement template would emit a literal backslash.
    (re.compile(r'([a-z0-9])([\n]+)([A-Z])'), r'\1. \3'),
    # Collapse repeated dots (optionally separated by whitespace) into one.
    (re.compile(r'(\.)([\s]*)([\.]+)'), r'\1'),
    # Re-join hyphenated words split by spaces: "trans - anethole" -> "trans-anethole".
    (re.compile(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])'), r'\1\3\5'),
)


def normalize_text(text):
    """Repair punctuation and spacing artefacts in extracted text.

    The rules in ``_NORMALIZATION_RULES`` are applied in order, then every
    line is stripped and the lines are joined with single spaces so the
    result is one line of text ready for sentence splitting.

    Args:
        text: Raw text, e.g. the output of a PDF page's ``get_text()``.

    Returns:
        The cleaned, single-line text.
    """
    for pattern, replacement in _NORMALIZATION_RULES:
        text = pattern.sub(replacement, text)
    return " ".join(line.strip() for line in text.splitlines())

def extract_sentences(text):
    """Return the list of sentences found in ``text``.

    A boundary is a whitespace character that follows '.', '?' or '!',
    unless the preceding characters look like a dotted abbreviation
    (e.g. "e.g.") or a capitalised two-letter title (e.g. "Dr.").
    """
    sentence_boundary = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s'
    )
    return sentence_boundary.split(text)

def contains_valid_numeric(sentence):
    """Tell whether ``sentence`` holds at least one usable numeric value.

    A sentence qualifies when ``numeric_regex`` finds something and the
    number of those hits differs from the number of hits of
    ``exclude_brackets_regex``.
    """
    numeric_hits = numeric_regex.findall(sentence)
    if not numeric_hits:
        return False
    bracketed_hits = exclude_brackets_regex.findall(sentence)
    return len(numeric_hits) != len(bracketed_hits)

# Number (or en-dash range) followed by an age-related unit, e.g. "45 years".
_AGE_NUMERIC_REGEX = re.compile(
    r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
    re.IGNORECASE,
)

# A count immediately followed by a population / eligibility noun.
_COUNTED_NOUN_REGEXES = (
    # Patients
    re.compile(r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b", re.IGNORECASE),
    # Participants
    re.compile(r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b", re.IGNORECASE),
    # Inclusion criteria
    re.compile(r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b", re.IGNORECASE),
    # Exclusion criteria
    re.compile(r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b", re.IGNORECASE),
)

# A digit count or a spelled-out number up to twelve, followed by a time unit.
_TIME_DURATION_REGEX = re.compile(
    r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
    r"(?:years|year|weeks|week|months|month|days|day)\b",
    re.IGNORECASE,
)

# Keyword groups from ``keywords`` that qualify a sentence by plain substring
# match against the lower-cased sentence.
_SUBSTRING_KEYWORD_GROUPS = (
    "Co-morbidities",
    "Remark",
    "Intervention Groups",
    "Study Types",
    "Country",
    "Race/Ethnicity",
)


def matches_criteria(sentence, check_time_duration=False):
    """Decide whether ``sentence`` should be extracted.

    Sentences that contain a date (``date_regex``) or start like a table
    reference (``table_regex``) are always rejected.

    Args:
        sentence: The sentence to test.
        check_time_duration: When true, the only criterion is whether the
            sentence contains a time duration ("3 weeks", "two years").

    Returns:
        ``True`` if the sentence qualifies. In the default mode the sentence
        must pass ``contains_valid_numeric`` and additionally match at least
        one of: gender, age, a counted population/eligibility noun, a time
        duration, or any keyword of the groups in
        ``_SUBSTRING_KEYWORD_GROUPS``.
    """
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    contains_time_duration = bool(_TIME_DURATION_REGEX.search(sentence))
    if check_time_duration:
        return contains_time_duration

    # Every remaining criterion also requires a valid numeric value.
    if not contains_valid_numeric(sentence):
        return False

    if (
        contains_time_duration
        or gender_regex.search(sentence)
        or _AGE_NUMERIC_REGEX.search(sentence)
        or any(regex.search(sentence) for regex in _COUNTED_NOUN_REGEXES)
    ):
        return True

    # Race/Ethnicity used to be computed here but was never consulted, so
    # such sentences were silently dropped; it is now a criterion like the
    # other keyword groups.
    lowered = sentence.lower()
    return any(
        kw in lowered
        for group in _SUBSTRING_KEYWORD_GROUPS
        for kw in keywords[group]
    )

def matches_keyword(sentence, user_keywords):
    """Return True when any user keyword occurs in ``sentence``.

    The comparison is a case-insensitive substring test.
    """
    lowered_sentence = sentence.lower()
    for candidate in user_keywords:
        if candidate.lower() in lowered_sentence:
            return True
    return False

def extract_authors(page):
    """Extract candidate author names from the top of a page.

    Only the text that precedes the earliest heading listed in
    ``key_sections`` is searched; if none of those headings occurs, the whole
    page text is used. Candidates come from ``author_pattern`` matches and
    from spaCy PERSON entities.

    Args:
        page: An object exposing ``get_text()`` (a PyMuPDF page in this file).

    Returns:
        A de-duplicated list of names in first-seen order (regex matches
        first, then NLP entities), without entries whose lower-cased form is
        in ``exclude_words``.
    """
    full_text = page.get_text()

    # Offsets of the key sections that actually occur on the page.
    section_offsets = [
        offset
        for offset in (full_text.find(section) for section in key_sections)
        if offset != -1
    ]
    # Authors are expected above the first section heading.
    text_to_search = full_text[:min(section_offsets)] if section_offsets else full_text

    author_matches = re.findall(author_pattern, text_to_search)

    doc = nlp(text_to_search)
    nlp_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # dict.fromkeys de-duplicates while keeping a stable order; a set would
    # make the author order vary from run to run.
    unique_names = dict.fromkeys(author_matches + nlp_names)
    return [name for name in unique_names if name.lower() not in exclude_words]

def highlight_keywords(sentence, user_keywords):
    """Return ``sentence`` as HTML with every keyword wrapped in ``<mark>``.

    Matching is case-insensitive. Keywords without a space must match as
    whole words; keywords containing a space match wherever they occur.
    Longer keywords are tried first so that "pain relief" wins over "pain".

    The sentence comes from uploaded documents or free text and is rendered
    as HTML, so every part of it is HTML-escaped ('&', '<', '>'); only the
    ``<mark>`` tags added here are real markup.

    Args:
        sentence: Plain-text sentence.
        user_keywords: Iterable of keyword strings; may be empty or None.

    Returns:
        An HTML-safe string with the matches highlighted.
    """
    # Empty keywords would produce an alternative that matches everywhere.
    cleaned_keywords = [kw for kw in (user_keywords or []) if kw and kw.strip()]
    if not cleaned_keywords:
        return html.escape(sentence, quote=False)

    # De-duplicate in order, then sort longest first (stable for ties).
    ordered_keywords = sorted(dict.fromkeys(cleaned_keywords), key=len, reverse=True)

    alternatives = []
    for kw in ordered_keywords:
        escaped = re.escape(kw)
        if ' ' in kw:
            alternatives.append(escaped)
        else:
            alternatives.append(r'\b' + escaped + r'\b')
    combined_pattern = re.compile('|'.join(alternatives), re.IGNORECASE)

    # Escape the text between matches and the matched text separately so the
    # <mark> tags themselves are never escaped.
    pieces = []
    cursor = 0
    for match in combined_pattern.finditer(sentence):
        pieces.append(html.escape(sentence[cursor:match.start()], quote=False))
        pieces.append(f"<mark>{html.escape(match.group(0), quote=False)}</mark>")
        cursor = match.end()
    pieces.append(html.escape(sentence[cursor:], quote=False))
    return "".join(pieces)

def process_file(file_path, user_keywords, check_time_duration=False):
    """Extract and highlight matching sentences from a document file.

    Every page is normalised, split into sentences and filtered with
    ``matches_criteria``. Unless ``check_time_duration`` is set, the
    sentences are further restricted to those containing a user keyword.
    Author names are taken from the first page.

    Args:
        file_path: Path of the file to open with PyMuPDF.
        user_keywords: Keywords used for filtering and highlighting.
        check_time_duration: Forwarded to ``matches_criteria``; when true the
            keyword filter is skipped.

    Returns:
        A ``(highlighted_sentences, authors_str)`` tuple, where
        ``authors_str`` is a comma-separated string of author names.
    """
    all_extracted_sentences = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(file_path) as doc:
        # A document without pages has no first page to read authors from.
        author_names = extract_authors(doc[0]) if doc.page_count > 0 else []

        for page in doc:
            text = normalize_text(page.get_text())
            all_extracted_sentences.extend(
                sentence.strip()
                for sentence in extract_sentences(text)
                if matches_criteria(sentence, check_time_duration)
            )

    authors_str = ', '.join(author_names)

    if check_time_duration:
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [
            sentence
            for sentence in all_extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    highlighted_sentences = [
        highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences
    ]
    return highlighted_sentences, authors_str

def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Run the extraction pipeline on raw text: normalise, split into
    sentences, keep those meeting the criteria (and, unless
    check_time_duration is set, containing a user keyword), and highlight
    the keywords. Returns the highlighted sentences plus a fixed notice,
    since authors are not extracted from text input.
    """
    highlighted = []
    for raw_sentence in extract_sentences(normalize_text(input_text)):
        if not matches_criteria(raw_sentence, check_time_duration):
            continue
        candidate = raw_sentence.strip()
        # The keyword filter is bypassed in time-duration mode.
        if check_time_duration or matches_keyword(candidate, user_keywords):
            highlighted.append(highlight_keywords(candidate, user_keywords))

    return highlighted, "Authors not extracted from text input."

def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Gradio callback: process the uploaded file or pasted text and return the
    result as an HTML string.

    Args:
        file_path: Path of the uploaded file; takes precedence over input_text.
        input_text: Text pasted by the user.
        keyword_group: Name of a group in ``keywords`` whose entries are used.
        custom_keywords: Comma-separated extra keywords.
        time_duration: When true, select sentences by time duration only and
            allow an empty keyword list.

    Returns:
        HTML with the authors line followed by one paragraph per highlighted
        sentence, or a plain message when there is nothing to show.
    """
    # Copy the configured group: extending the list held in ``keywords``
    # would leak one request's custom keywords into all later requests.
    user_keywords = list(keywords.get(keyword_group, [])) if keyword_group else []
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if not extracted_sentences:
        return "No matching sentences found."

    # Author names come from document text, so escape them before embedding
    # them in HTML. The sentences already carry their <mark> markup.
    paragraphs = [f"<p><b>Authors:</b> {html.escape(authors_str, quote=False)}</p>"]
    paragraphs.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
    return "".join(paragraphs)

# Gradio interface: wires handle_input() to a web form.
# The four input components below are passed positionally to handle_input as
# file_path, input_text, keyword_group and custom_keywords. Its fifth
# parameter, time_duration, keeps its default (False) because the checkbox
# that would supply it is commented out.
iface = gr.Interface(
    fn=handle_input,
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        # One radio choice per keyword group defined in config.keywords.
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # Disabled: would feed handle_input's time_duration parameter.
        # gr.Checkbox(
        #     label="Check Time Duration Criteria",
        #     value=False
        # )
    ],
    # handle_input returns an HTML string (or a plain message), rendered as HTML.
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
        <div style='text-align: left;'>
            Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
            Supervisor: Dr. Tanmay Basu<br>
            Indian Institute of Science Education and Research<br>
        </div>
        <div style='text-align: center; margin-top: 10px;'>
            <b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
        </div>
    """,
    examples=None,  # No example inputs are configured; add files or texts here if desired
    allow_flagging="never",
    # NOTE(review): cache_examples=True has nothing to cache while examples is None - confirm it is intended.
    cache_examples=True,
    # Custom CSS: styles the <mark> tags emitted by highlight_keywords() and
    # the spacing of the <p> elements built in handle_input().
    css="""
        mark {
            background-color: blue;
            padding: 0;
            border-radius: 2px;
        }
        /* Optional: Adjust paragraph spacing */
        p {
            margin-bottom: 10px;
        }
    """
)

# Starts the app as soon as this module is executed or imported.
# NOTE(review): share=True requests a publicly reachable Gradio link - confirm this is wanted.
iface.launch(share=True)