Upload 2 files
Browse files
config.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re

# Keyword groups used for sentence classification.
# Matching semantics differ per group (see main.py): Gender/Age/Outcomes/
# Assessment Tools are compiled into whole-word case-insensitive regexes,
# while the other groups are tested as lowercase substrings.  Entries padded
# with spaces (" man ") are a crude way to force whole-word behavior in
# substring checks.
keywords = {
    "Gender": ["male", "female", " man ", "woman", " men ", " men,", "women", "boy", "girl", "males", "females"],
    "Age": [" age ", "age,", " aged ", "years old", "year-old", "year olds", "elderly", "adults", "young", "youth"],
    "Patients": ["patient", "patients", "case", "cases", "subject", "subjects", "individual", "individuals"],
    "Participants": ["participant", "participants", "attendee", "attendees", "respondent", "respondents"],
    "Inclusion Criteria": ["inclusion", "eligibility criteria", "study inclusion", "included"],
    "Exclusion Criteria": ["exclusion", "not eligible", "study exclusion", "excluded"],
    "Study Types": [
        "Case Report", "Case Series", "Cross-sectional Study", "Case-Control Study", "Cohort Study", "Randomized Controlled Clinical Trial",
        "Non-Randomized Controlled Trial", "Pilot Study", "Feasibility Study", "Longitudinal Study", "Retrospective Study", "Prospective Study",
        "Observational Study", "Experimental Study", "Interventional Study", "Descriptive Study", "Analytical Study", "Quasi-Experimental Study",
        "Epidemiological Study", "Ecological Study", "Systematic Review", "Meta-Analysis", "Mixed-Methods Study", "Narrative Review", "Scoping Review",
        "Rapid Review", "Umbrella Review", "Diagnostic Accuracy Study", "Validation Study", "Genome-Wide Association Study (GWAS)",
        "Gene-Environment Interaction Study", "Linkage Study", "Sensitivity/Specificity Study", "Cost-Effectiveness Study", "Health Technology Assessment",
        "Quality Improvement Study", "Translational Research", "Implementation Science Study", "Psychometric Study", "Community-Based Participatory Research (CBPR)",
        "In Vitro Study", "In Vivo Study", "Simulation Study", "Phenomenological Study", "Ethnographic Study", "Grounded Theory Study", "Narrative Study",
        "Case Study", "Pragmatic Trial", "Cluster Randomized Trial", "Adaptive Trial", "Phase 1 Clinical Trial", "Phase 2 Clinical Trial", "Phase 3 Clinical Trial",
        "Phase 4 Clinical Trial", "Real-World Evidence Study", "Comparative Effectiveness Study", "Proof-of-Concept Study", "Dose-Response Study", "Cross-Over Study",
        "Nested Study", "Multicenter Study", "Delphi Study", "Pragmatic Clinical Trial", "Registry-Based Study", "Historical Cohort Study",
        "Nested Case-Control Study", " double-blind ", "double blind", "placebo-controlled", "placebo controlled", "Cross-sectional analysis"
    ],
    # Deduplicated: the original listed "comorbidities" three times (once
    # space-padded); "comorbidities" already matches any superstring in a
    # substring check, so dropping the duplicates preserves behavior.
    "Co-morbidities": ["comorbidities", "co-morbidities", "comor-bidities"],
    "Country": ["Afghanistan", "Australia", "Brazil", "Canada", "China", "France", "Germany", "India", "Japan", "Mexico", "Nigeria", "Russia",
                "South Africa", "United Kingdom", "United States", "Prefer Not to Answer"],
    "Race/Ethnicity": ["white", "Black", "African American", "Asian", "Native Hawaiian", "Other Pacific Islander", "American Indian",
                       "Alaska Native", "Other Race", "Two or More Races", "Hispanic", "latino", "Not Hispanic or latino"],
    "Follow-Up": ["years", "year", "weeks", "week", "months", "month", "days", "day"],
    "Remark": [
        "displayed", "exhibited", "revealed", "indicated", "illustrated", "Showed",
        "noticed", "perceived", "detected", "discerned", "identified", "Observed",
        "progress", "enhancement", "advancement", "growth", "betterment", "Improvement",
        "proved", "showcased", "conveyed", "validated", "Demonstrated",
        "similar", "equivalent", "parallel", "analogous", "akin", "Comparable",
        "more secure", "less risky", "protected", "shielded", "guarded", "Safer",
        "chosen", "picked", "opted", "designated", "elected", "Selected"
    ],
    "Intervention Groups": [
        # BUG FIX: "intervention grorup" (typo) could never match real text;
        # the singular form "intervention group" was clearly intended.
        "intervention group", "intervention groups", "treatment groups", "treatment group", "control groups", "control group", "placebo group",
        "placebo groups"
    ],
    "Outcomes": [
        "results", "findings", "observations", "conclusion", "outcome", "clinical outcome", "results:",
        "efficacy", "effectiveness", "treatment response", "pain reduction", "symptom improvement",
        "disease progression", "treatment success", "remission rate", "response rate", "conclusion:",
        "adverse effects", "side effects", "complications", "recurrence", "recovery time", "result:",
        "statistical significance", "p-value", "confidence interval", "hazard ratio", "risk reduction"
    ],
    "Assessment Tools": [
        "Visual Analog Scale (VAS)", "WOMAC", "Western Ontario and McMaster Universities Osteoarthritis Index",
        "Numeric Rating Scale (NRS)", "McGill Pain Questionnaire (MPQ)", "Timed Up and Go Test (TUG)",
        "6-Minute Walk Test (6MWT)", "gait analysis", "joint range of motion", "functional independence measure",
        "SF-36", "EQ-5D", "Beck Depression Inventory", "Hospital Anxiety and Depression Scale (HADS)",
        "blood tests", "CRP", "C-reactive protein", "ESR", "TNF-α", "IL-6", "synovial fluid analysis",
        "X-ray", "Kellgren-Lawrence grade", "MRI", "magnetic resonance imaging", "musculoskeletal ultrasound",
        "cartilage thickness", "bone marrow lesion", "mental health scales", "quality of life assessments", "ELISA"
    ]
}
|
| 61 |
+
|
| 62 |
+
# Author name regex pattern
# Matches runs of capitalized words, optionally preceded by initials
# ("J. R.") and optionally followed by a trailing number (an affiliation
# superscript artifact from PDF extraction), e.g. "A. B. Smith",
# "Mary-Jane O'Neil 1".  Hyphens, apostrophes and internal periods are
# allowed inside name words.
author_pattern = r'\b(?:[A-Z]\.\s*)*[A-Z][a-zA-Z\.\-\']+(?:\s[A-Z][a-zA-Z\.\-\']+)*\b(?:\s[0-9]+)?'
|
| 64 |
+
|
| 65 |
+
# Words and patterns to exclude from author-name candidates.
# BUG FIX: stored lowercase, because the consumer (extract_authors in
# main.py) tests `name.lower() not in exclude_words` — the original
# mixed-case entries ("Aim", "India", "AM", ...) could never match a
# lowercased candidate, so the filter silently did nothing for them.
# (Set literals deduplicate the repeated "school"/"with"/"in" entries.)
exclude_words = {
    "aim", "this", "the", "article", "school", "topical", "with", "compress",
    "research", "capsi", "india", "australia", "and", "others", "january",
    "february", "march", "april", "may", "june", "july", "august",
    "september", "october", "november", "december", "monday", "tuesday",
    "wednesday", "thursday", "friday", "saturday", "sunday", "am", "pm",
    "university", "college", "institute",
    "of", "in", "on", "at", "by", "for", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "out", "over", "under", "again", "further",
    "then", "once", "here", "there", "when", "where", "why", "how", "all",
    "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now",
    "ginger", "migraine"
}
|
| 80 |
+
|
| 81 |
+
# Regex patterns
# Generic patterns used by the sentence filters in main.py.

# Standalone numbers: integers, decimals, percentages, dash ranges
# ("10-20%"), and digit groups separated by single spaces ("1 000").
numeric_regex = re.compile(r"\b(?:-?\d+\.?\d*%?|\d+-\d+%?|\d+(?: \d+)*%?)\b")

# Purely numeric bracketed references such as "(12/3)" or "[2,4-6]".
# BUG FIX: the original character classes were garbled to "[$$($$]" and
# "[$$)$$]" (a mangled "\[(" / "\])"), which matched literal '$' characters
# instead of opening/closing brackets.
exclude_brackets_regex = re.compile(r"[\[(]\s*[\d,/-]+\s*[\])]")

# Calendar dates: 12/31/2020, 12-31-2020, or "January 5, 2020".
date_regex = re.compile(
    r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2}-\d{1,2}-\d{2,4}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4})\b",
    re.IGNORECASE,
)

# A line consisting only of whitespace-separated integers (a likely table row).
table_regex = re.compile(r"^(?:\s*\d+\s+)+$")
|
| 86 |
+
|
| 87 |
+
# Build regex patterns for exact matches
# Each keyword group below is compiled once into a whole-word,
# case-insensitive alternation.

def _keyword_group_regex(group_name):
    """Compile a whole-word, case-insensitive alternation for one keyword group."""
    alternation = "|".join(re.escape(term) for term in keywords[group_name])
    return re.compile(rf'\b(?:{alternation})\b', re.IGNORECASE)

gender_regex = _keyword_group_regex("Gender")
age_regex = _keyword_group_regex("Age")
outcomes_regex = _keyword_group_regex("Outcomes")
assessment_tools_regex = _keyword_group_regex("Assessment Tools")
|
| 92 |
+
|
| 93 |
+
# Time duration regex pattern
# A count (digits or spelled-out one..twelve) followed by a follow-up unit
# (year/week/month/day), optionally pluralized, hyphenated, or the start of
# a range ("3 to 6 months").
_follow_up_units = "|".join(map(re.escape, keywords["Follow-Up"]))
follow_up = re.compile(
    rf'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*(?:{_follow_up_units})(?:\b|s\b|-| to \d+)\b',
    re.IGNORECASE
)
|
| 98 |
+
|
| 99 |
+
# Key sections for extraction
# Headers that mark the start of a paper's body text; in extract_authors
# (main.py) the text above the earliest header found on page 1 is treated
# as the author/title region.  "A B S T R A C T" covers PDFs that
# letter-space the heading.
key_sections = [
    "Summary", "Overview", "Synopsis", "Results", "Findings", "Observations", "Conclusion",
    "Assessment", "Evaluation", "Outcomes", "Measurements", "Test Results", "Analysis",
    "Abstract", "A B S T R A C T", "Background"
]
|
main.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# prac.py
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import re
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
import spacy
|
| 7 |
+
from config import (
|
| 8 |
+
keywords,
|
| 9 |
+
numeric_regex,
|
| 10 |
+
exclude_brackets_regex,
|
| 11 |
+
date_regex,
|
| 12 |
+
table_regex,
|
| 13 |
+
gender_regex,
|
| 14 |
+
age_regex,
|
| 15 |
+
author_pattern,
|
| 16 |
+
exclude_words,
|
| 17 |
+
key_sections,
|
| 18 |
+
follow_up
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Load spaCy's English model
# Used in extract_authors for PERSON entity recognition.
# NOTE(review): assumes `python -m spacy download en_core_web_sm` has been
# run in the deployment environment; spacy.load raises OSError otherwise
# — TODO confirm.
nlp = spacy.load("en_core_web_sm")
|
| 23 |
+
|
| 24 |
+
def normalize_text(text):
    """Normalize PDF-extracted text for sentence splitting.

    Applies a series of regex repairs for common PDF extraction artifacts
    (missing spaces after punctuation, spurious line breaks, words split
    around hyphens), then collapses the text onto a single line.
    """
    # Insert a space between trailing punctuation and the next letter.
    text = re.sub(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3', text)
    # Drop a '.' sandwiched between two lowercase letters (e.g. "et al. xxx").
    text = re.sub(r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4', text)
    # Rewrite three dot-separated numbers as a dashed range (e.g. "000.55.66").
    text = re.sub(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5', text)
    # Insert a space between a lowercase letter and a following digit.
    text = re.sub(r'([a-z])([\.]*)([0-9])', r'\1\2 \3', text)
    # Add a sentence break where a lowercase word runs straight into an
    # uppercase one (e.g. "day threeHe continued").
    text = re.sub(r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4', text)
    # Add a sentence break at "lowercase/digit + newline + Uppercase".
    # BUG FIX: the replacement was r'\1\. \3' — an unknown escape "\."
    # passes through re.sub templates verbatim, inserting a literal
    # backslash into the output text.  '\1. \3' is the intended form.
    text = re.sub(r'([a-z0-9])([\n]+)([A-Z])', r'\1. \3', text)
    # Collapse runs of periods (possibly whitespace-separated) into one.
    text = re.sub(r'(\.)([\s]*)([\.]+)', r'\1', text)
    # Re-join hyphenated words split by whitespace ("trans - anethole").
    text = re.sub(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5', text)
    # Flatten any remaining line breaks into single spaces.
    return " ".join(line.strip() for line in text.splitlines())
|
| 36 |
+
|
| 37 |
+
def extract_sentences(text):
    """Split normalized text into sentences.

    Splits on whitespace that follows '.', '?' or '!', while skipping
    abbreviation-like contexts (e.g. "e.g." or "Mr.") via the negative
    lookbehinds.
    """
    sentence_boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    return sentence_boundary.split(text)
|
| 40 |
+
|
| 41 |
+
def contains_valid_numeric(sentence):
    """Return True if the sentence contains at least one numeric token that
    is not merely a bracketed reference like "(12/3)"."""
    numeric_tokens = numeric_regex.findall(sentence)
    if not numeric_tokens:
        return False
    # Reject the sentence only when every numeric token is accounted for by
    # bracketed citation-style numbers.
    return len(numeric_tokens) != len(exclude_brackets_regex.findall(sentence))
|
| 46 |
+
|
| 47 |
+
# Compiled once at module load; the original recompiled this pattern on
# every call to matches_criteria.
_time_duration_regex = re.compile(
    r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
    r'(?:years|year|weeks|week|months|month|days|day)\b',
    re.IGNORECASE,
)


def _contains_any_keyword(sentence_lower, group):
    """Case-insensitive substring test of a config keyword group.

    BUG FIX: the original compared raw mixed-case keywords (e.g.
    "Case Report", "Showed") against the already-lowercased sentence, so
    capitalized entries could never match.  Lowercase both sides.
    """
    return any(kw.lower() in sentence_lower for kw in keywords[group])


def matches_criteria(sentence, check_time_duration=False):
    """Check if a sentence matches any of the defined keyword criteria.

    Sentences that look like dates or table rows are rejected outright.
    When check_time_duration is True, only the numeric-time-duration test
    is applied; otherwise the sentence must contain a valid numeric value
    AND at least one demographic / study-design signal.
    """
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    sentence_lower = sentence.lower()

    # Gender: whole-word match only.
    contains_gender = bool(gender_regex.search(sentence))

    # Age: numeric value (optionally an en-dash range) + age-related unit.
    contains_age_and_numeric = bool(re.search(
        r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
        sentence, re.IGNORECASE
    ))

    # Patients / participants: numeric count + noun.
    contains_patients_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b",
        sentence, re.IGNORECASE
    ))
    contains_participants_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b",
        sentence, re.IGNORECASE
    ))

    # Inclusion / exclusion criteria: numeric + keyword.
    contains_inclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b",
        sentence, re.IGNORECASE
    ))
    contains_exclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b",
        sentence, re.IGNORECASE
    ))

    # Time durations: numeric or spelled-out count + time unit.
    contains_time_duration = bool(_time_duration_regex.search(sentence))
    if check_time_duration:
        return contains_time_duration

    # Keyword-group substring matches.
    contains_comorbidities = _contains_any_keyword(sentence_lower, "Co-morbidities")
    contains_remark = _contains_any_keyword(sentence_lower, "Remark")
    contains_intervention = _contains_any_keyword(sentence_lower, "Intervention Groups")
    contains_study_type = _contains_any_keyword(sentence_lower, "Study Types")
    contains_country = _contains_any_keyword(sentence_lower, "Country")
    # NOTE(review): the original computed a Race/Ethnicity match but never
    # used it in the decision below; the dead computation was dropped here
    # — confirm whether it should be part of the disjunction.

    return (
        contains_valid_numeric(sentence) and (
            contains_gender
            or contains_age_and_numeric
            or contains_patients_and_numeric
            or contains_participants_and_numeric
            or contains_inclusion_and_numeric
            or contains_exclusion_and_numeric
            or contains_comorbidities
            or contains_time_duration
            or contains_remark
            or contains_intervention
            or contains_study_type
            or contains_country
        )
    )
|
| 131 |
+
|
| 132 |
+
def matches_keyword(sentence, user_keywords):
    """Return True when the sentence contains any user keyword
    (case-insensitive substring match)."""
    lowered = sentence.lower()
    for keyword in user_keywords:
        if keyword.lower() in lowered:
            return True
    return False
|
| 135 |
+
|
| 136 |
+
def extract_authors(page):
    """Extract candidate author names from the region above the first body
    section header on a PDF page.

    Combines a capitalized-name regex (config.author_pattern) with spaCy
    PERSON entities, then drops known non-name words.

    Args:
        page: a PyMuPDF page object (anything with a get_text() method).

    Returns:
        A list of unique candidate author names.
    """
    full_text = page.get_text()

    # Locate each known section header; keep only those actually present.
    section_positions = {section: full_text.find(section) for section in key_sections}
    section_positions = {k: v for k, v in section_positions.items() if v != -1}

    # Only the text above the earliest header can contain the author list.
    if section_positions:
        cutoff_position = min(section_positions.values())
        text_to_search = full_text[:cutoff_position]
    else:
        text_to_search = full_text

    # Regex candidates: runs of capitalized words.
    author_matches = re.findall(author_pattern, text_to_search)

    # NLP candidates: spaCy named-entity PERSON spans.
    doc = nlp(text_to_search)
    nlp_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # BUG FIX: candidates are lowercased before the exclusion test, but
    # exclude_words may hold mixed-case entries ("Aim", "India", ...), so
    # the filter never removed them.  Compare against a lowercased copy.
    exclude_lower = {word.lower() for word in exclude_words}
    combined_names = set(author_matches + nlp_names)
    return [name for name in combined_names if name.lower() not in exclude_lower]
|
| 165 |
+
|
| 166 |
+
def highlight_keywords(sentence, user_keywords):
    """Wrap every occurrence of a user keyword in the sentence with
    <mark> tags.

    Single-word keywords match only on word boundaries; multi-word phrases
    match anywhere.  Matching is case-insensitive and the matched text's
    original casing is preserved in the output.
    """
    if not user_keywords:
        return sentence

    # Split keywords by kind and regex-escape them.
    word_alternatives = [re.escape(kw) for kw in user_keywords if ' ' not in kw]
    phrase_alternatives = [re.escape(kw) for kw in user_keywords if ' ' in kw]

    pattern_parts = []
    if word_alternatives:
        pattern_parts.append(r'\b(?:' + '|'.join(word_alternatives) + r')\b')
    if phrase_alternatives:
        pattern_parts.append(r'(?:' + '|'.join(phrase_alternatives) + r')')

    if not pattern_parts:
        return sentence

    highlighter = re.compile('|'.join(pattern_parts), re.IGNORECASE)
    return highlighter.sub(lambda m: f"<mark>{m.group(0)}</mark>", sentence)
|
| 201 |
+
|
| 202 |
+
def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process a PDF file: extract authors from the first page, collect
    sentences matching the extraction criteria, filter them by user
    keywords (skipped in time-duration mode), and highlight the keywords.

    Returns:
        (highlighted_sentences, authors_str)
    """
    doc = fitz.open(file_path)
    # BUG FIX: close the document even when extraction raises; the
    # original only closed it on the success path, leaking the handle.
    try:
        author_names = extract_authors(doc[0])
        authors_str = ', '.join(author_names)

        all_extracted_sentences = []
        for page in doc:
            text = normalize_text(page.get_text())
            sentences = extract_sentences(text)
            all_extracted_sentences.extend(
                sentence.strip()
                for sentence in sentences
                if matches_criteria(sentence, check_time_duration)
            )
    finally:
        doc.close()

    if check_time_duration:
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [
            sentence for sentence in all_extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    # Highlight keywords in the filtered sentences.
    highlighted_sentences = [highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences]
    return highlighted_sentences, authors_str
|
| 230 |
+
|
| 231 |
+
def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Process raw input text: collect sentences matching the extraction
    criteria, filter them by user keywords (skipped in time-duration mode),
    highlight the keywords, and return them with a placeholder author
    string (authors are only extracted from PDFs).
    """
    sentences = extract_sentences(normalize_text(input_text))
    extracted_sentences = [
        sentence.strip()
        for sentence in sentences
        if matches_criteria(sentence, check_time_duration)
    ]

    if check_time_duration:
        filtered_sentences = extracted_sentences
    else:
        filtered_sentences = [
            sentence for sentence in extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    # Highlight keywords in the filtered sentences.
    highlighted = [highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences]
    return highlighted, "Authors not extracted from text input."
|
| 249 |
+
|
| 250 |
+
def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Gradio callback: assemble the keyword set, process the uploaded file or
    pasted text, and return an HTML string with the authors and highlighted
    sentences (or a plain status message).
    """
    # BUG FIX: copy the configured list before extending it.  The original
    # called .extend() on the list returned by keywords.get(), mutating the
    # shared config dict so custom keywords leaked into keywords[group] and
    # persisted across requests.
    user_keywords = list(keywords.get(keyword_group, [])) if keyword_group else []
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if not extracted_sentences:
        return "No matching sentences found."

    # Combine authors and highlighted sentences into HTML, one paragraph
    # per sentence (join avoids quadratic string concatenation).
    parts = [f"<p><b>Authors:</b> {authors_str}</p>"]
    parts.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
    return "".join(parts)
|
| 280 |
+
|
| 281 |
+
# Gradio Interface
# Wires handle_input to a four-field UI.  The positional order of `inputs`
# must match handle_input's parameter order
# (file_path, input_text, keyword_group, custom_keywords).
iface = gr.Interface(
    fn=handle_input,
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # gr.Checkbox(
        #     label="Check Time Duration Criteria",
        #     value=False
        # )
    ],
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
    <div style='text-align: left;'>
    Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
    Supervisor: Dr. Tanmay Basu<br>
    Indian Institute of Science Education and Research<br>
    </div>
    <div style='text-align: center; margin-top: 10px;'>
    <b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
    </div>
    """,
    examples=None,  # You can add example files or texts if desired
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode) — confirm the pinned gradio version
    # still accepts it.
    allow_flagging="never",
    # NOTE(review): cache_examples=True has no effect while examples=None.
    cache_examples=True,
    # Add custom CSS to style the <mark> tag if necessary
    css="""
    mark {
        background-color: blue;
        padding: 0;
        border-radius: 2px;
    }
    /* Optional: Adjust paragraph spacing */
    p {
        margin-bottom: 10px;
    }
    """
)

# share=True additionally exposes a temporary public Gradio link.
iface.launch(share=True)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
|