Upload 2 files
Browse files
config.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re

# Keyword groups used for sentence classification.
# Matching semantics differ per group (see main.py): Gender/Age/Outcomes/
# Assessment Tools are compiled into whole-word case-insensitive regexes,
# while the other groups are tested as lowercase substrings.  Entries padded
# with spaces (" man ") are a crude way to force whole-word behavior in
# substring checks.
keywords = {
    "Gender": ["male", "female", " man ", "woman", " men ", " men,", "women", "boy", "girl", "males", "females"],
    "Age": [" age ", "age,", " aged ", "years old", "year-old", "year olds", "elderly", "adults", "young", "youth"],
    "Patients": ["patient", "patients", "case", "cases", "subject", "subjects", "individual", "individuals"],
    "Participants": ["participant", "participants", "attendee", "attendees", "respondent", "respondents"],
    "Inclusion Criteria": ["inclusion", "eligibility criteria", "study inclusion", "included"],
    "Exclusion Criteria": ["exclusion", "not eligible", "study exclusion", "excluded"],
    "Study Types": [
        "Case Report", "Case Series", "Cross-sectional Study", "Case-Control Study", "Cohort Study", "Randomized Controlled Clinical Trial",
        "Non-Randomized Controlled Trial", "Pilot Study", "Feasibility Study", "Longitudinal Study", "Retrospective Study", "Prospective Study",
        "Observational Study", "Experimental Study", "Interventional Study", "Descriptive Study", "Analytical Study", "Quasi-Experimental Study",
        "Epidemiological Study", "Ecological Study", "Systematic Review", "Meta-Analysis", "Mixed-Methods Study", "Narrative Review", "Scoping Review",
        "Rapid Review", "Umbrella Review", "Diagnostic Accuracy Study", "Validation Study", "Genome-Wide Association Study (GWAS)",
        "Gene-Environment Interaction Study", "Linkage Study", "Sensitivity/Specificity Study", "Cost-Effectiveness Study", "Health Technology Assessment",
        "Quality Improvement Study", "Translational Research", "Implementation Science Study", "Psychometric Study", "Community-Based Participatory Research (CBPR)",
        "In Vitro Study", "In Vivo Study", "Simulation Study", "Phenomenological Study", "Ethnographic Study", "Grounded Theory Study", "Narrative Study",
        "Case Study", "Pragmatic Trial", "Cluster Randomized Trial", "Adaptive Trial", "Phase 1 Clinical Trial", "Phase 2 Clinical Trial", "Phase 3 Clinical Trial",
        "Phase 4 Clinical Trial", "Real-World Evidence Study", "Comparative Effectiveness Study", "Proof-of-Concept Study", "Dose-Response Study", "Cross-Over Study",
        "Nested Study", "Multicenter Study", "Delphi Study", "Pragmatic Clinical Trial", "Registry-Based Study", "Historical Cohort Study",
        "Nested Case-Control Study", " double-blind ", "double blind", "placebo-controlled", "placebo controlled", "Cross-sectional analysis"
    ],
    # Deduplicated: the original listed "comorbidities" three times (once
    # space-padded); "comorbidities" already matches any superstring in a
    # substring check, so dropping the duplicates preserves behavior.
    "Co-morbidities": ["comorbidities", "co-morbidities", "comor-bidities"],
    "Country": ["Afghanistan", "Australia", "Brazil", "Canada", "China", "France", "Germany", "India", "Japan", "Mexico", "Nigeria", "Russia",
                "South Africa", "United Kingdom", "United States", "Prefer Not to Answer"],
    "Race/Ethnicity": ["white", "Black", "African American", "Asian", "Native Hawaiian", "Other Pacific Islander", "American Indian",
                       "Alaska Native", "Other Race", "Two or More Races", "Hispanic", "latino", "Not Hispanic or latino"],
    "Follow-Up": ["years", "year", "weeks", "week", "months", "month", "days", "day"],
    "Remark": [
        "displayed", "exhibited", "revealed", "indicated", "illustrated", "Showed",
        "noticed", "perceived", "detected", "discerned", "identified", "Observed",
        "progress", "enhancement", "advancement", "growth", "betterment", "Improvement",
        "proved", "showcased", "conveyed", "validated", "Demonstrated",
        "similar", "equivalent", "parallel", "analogous", "akin", "Comparable",
        "more secure", "less risky", "protected", "shielded", "guarded", "Safer",
        "chosen", "picked", "opted", "designated", "elected", "Selected"
    ],
    "Intervention Groups": [
        # BUG FIX: "intervention grorup" (typo) could never match real text;
        # the singular form "intervention group" was clearly intended.
        "intervention group", "intervention groups", "treatment groups", "treatment group", "control groups", "control group", "placebo group",
        "placebo groups"
    ],
    "Outcomes": [
        "results", "findings", "observations", "conclusion", "outcome", "clinical outcome", "results:",
        "efficacy", "effectiveness", "treatment response", "pain reduction", "symptom improvement",
        "disease progression", "treatment success", "remission rate", "response rate", "conclusion:",
        "adverse effects", "side effects", "complications", "recurrence", "recovery time", "result:",
        "statistical significance", "p-value", "confidence interval", "hazard ratio", "risk reduction"
    ],
    "Assessment Tools": [
        "Visual Analog Scale (VAS)", "WOMAC", "Western Ontario and McMaster Universities Osteoarthritis Index",
        "Numeric Rating Scale (NRS)", "McGill Pain Questionnaire (MPQ)", "Timed Up and Go Test (TUG)",
        "6-Minute Walk Test (6MWT)", "gait analysis", "joint range of motion", "functional independence measure",
        "SF-36", "EQ-5D", "Beck Depression Inventory", "Hospital Anxiety and Depression Scale (HADS)",
        "blood tests", "CRP", "C-reactive protein", "ESR", "TNF-α", "IL-6", "synovial fluid analysis",
        "X-ray", "Kellgren-Lawrence grade", "MRI", "magnetic resonance imaging", "musculoskeletal ultrasound",
        "cartilage thickness", "bone marrow lesion", "mental health scales", "quality of life assessments", "ELISA"
    ]
}
|
| 61 |
+
|
| 62 |
+
# Author name regex pattern
# Matches runs of capitalized words, optionally preceded by initials
# ("J. R.") and optionally followed by a trailing number (an affiliation
# superscript artifact from PDF extraction), e.g. "A. B. Smith",
# "Mary-Jane O'Neil 1".  Hyphens, apostrophes and internal periods are
# allowed inside name words.
author_pattern = r'\b(?:[A-Z]\.\s*)*[A-Z][a-zA-Z\.\-\']+(?:\s[A-Z][a-zA-Z\.\-\']+)*\b(?:\s[0-9]+)?'
|
| 64 |
+
|
| 65 |
+
# Words and patterns to exclude from author-name candidates.
# BUG FIX: stored lowercase, because the consumer (extract_authors in
# main.py) tests `name.lower() not in exclude_words` — the original
# mixed-case entries ("Aim", "India", "AM", ...) could never match a
# lowercased candidate, so the filter silently did nothing for them.
# (Set literals deduplicate the repeated "school"/"with"/"in" entries.)
exclude_words = {
    "aim", "this", "the", "article", "school", "topical", "with", "compress",
    "research", "capsi", "india", "australia", "and", "others", "january",
    "february", "march", "april", "may", "june", "july", "august",
    "september", "october", "november", "december", "monday", "tuesday",
    "wednesday", "thursday", "friday", "saturday", "sunday", "am", "pm",
    "university", "college", "institute",
    "of", "in", "on", "at", "by", "for", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "out", "over", "under", "again", "further",
    "then", "once", "here", "there", "when", "where", "why", "how", "all",
    "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now",
    "ginger", "migraine"
}
|
| 80 |
+
|
| 81 |
+
# Regex patterns
# Generic patterns used by the sentence filters in main.py.

# Standalone numbers: integers, decimals, percentages, dash ranges
# ("10-20%"), and digit groups separated by single spaces ("1 000").
numeric_regex = re.compile(r"\b(?:-?\d+\.?\d*%?|\d+-\d+%?|\d+(?: \d+)*%?)\b")

# Purely numeric bracketed references such as "(12/3)" or "[2,4-6]".
# BUG FIX: the original character classes were garbled to "[$$($$]" and
# "[$$)$$]" (a mangled "\[(" / "\])"), which matched literal '$' characters
# instead of opening/closing brackets.
exclude_brackets_regex = re.compile(r"[\[(]\s*[\d,/-]+\s*[\])]")

# Calendar dates: 12/31/2020, 12-31-2020, or "January 5, 2020".
date_regex = re.compile(
    r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2}-\d{1,2}-\d{2,4}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4})\b",
    re.IGNORECASE,
)

# A line consisting only of whitespace-separated integers (a likely table row).
table_regex = re.compile(r"^(?:\s*\d+\s+)+$")
|
| 86 |
+
|
| 87 |
+
# Build regex patterns for exact matches
# Each keyword group below is compiled once into a whole-word,
# case-insensitive alternation.

def _keyword_group_regex(group_name):
    """Compile a whole-word, case-insensitive alternation for one keyword group."""
    alternation = "|".join(re.escape(term) for term in keywords[group_name])
    return re.compile(rf'\b(?:{alternation})\b', re.IGNORECASE)

gender_regex = _keyword_group_regex("Gender")
age_regex = _keyword_group_regex("Age")
outcomes_regex = _keyword_group_regex("Outcomes")
assessment_tools_regex = _keyword_group_regex("Assessment Tools")
|
| 92 |
+
|
| 93 |
+
# Time duration regex pattern
# A count (digits or spelled-out one..twelve) followed by a follow-up unit
# (year/week/month/day), optionally pluralized, hyphenated, or the start of
# a range ("3 to 6 months").
_follow_up_units = "|".join(map(re.escape, keywords["Follow-Up"]))
follow_up = re.compile(
    rf'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*(?:{_follow_up_units})(?:\b|s\b|-| to \d+)\b',
    re.IGNORECASE
)
|
| 98 |
+
|
| 99 |
+
# Key sections for extraction
# Headers that mark the start of a paper's body text; in extract_authors
# (main.py) the text above the earliest header found on page 1 is treated
# as the author/title region.  "A B S T R A C T" covers PDFs that
# letter-space the heading.
key_sections = [
    "Summary", "Overview", "Synopsis", "Results", "Findings", "Observations", "Conclusion",
    "Assessment", "Evaluation", "Outcomes", "Measurements", "Test Results", "Analysis",
    "Abstract", "A B S T R A C T", "Background"
]
|
main.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# prac.py
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import re
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
import spacy
|
| 7 |
+
from config import (
|
| 8 |
+
keywords,
|
| 9 |
+
numeric_regex,
|
| 10 |
+
exclude_brackets_regex,
|
| 11 |
+
date_regex,
|
| 12 |
+
table_regex,
|
| 13 |
+
gender_regex,
|
| 14 |
+
age_regex,
|
| 15 |
+
author_pattern,
|
| 16 |
+
exclude_words,
|
| 17 |
+
key_sections,
|
| 18 |
+
follow_up
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Load spaCy's English model
# Used in extract_authors for PERSON entity recognition.
# NOTE(review): assumes `python -m spacy download en_core_web_sm` has been
# run in the deployment environment; spacy.load raises OSError otherwise
# — TODO confirm.
nlp = spacy.load("en_core_web_sm")
|
| 23 |
+
|
| 24 |
+
def normalize_text(text):
    """Normalize PDF-extracted text for sentence splitting.

    Applies a series of regex repairs for common PDF extraction artifacts
    (missing spaces after punctuation, spurious line breaks, words split
    around hyphens), then collapses the text onto a single line.
    """
    # Insert a space between trailing punctuation and the next letter.
    text = re.sub(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3', text)
    # Drop a '.' sandwiched between two lowercase letters (e.g. "et al. xxx").
    text = re.sub(r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4', text)
    # Rewrite three dot-separated numbers as a dashed range (e.g. "000.55.66").
    text = re.sub(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5', text)
    # Insert a space between a lowercase letter and a following digit.
    text = re.sub(r'([a-z])([\.]*)([0-9])', r'\1\2 \3', text)
    # Add a sentence break where a lowercase word runs straight into an
    # uppercase one (e.g. "day threeHe continued").
    text = re.sub(r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4', text)
    # Add a sentence break at "lowercase/digit + newline + Uppercase".
    # BUG FIX: the replacement was r'\1\. \3' — an unknown escape "\."
    # passes through re.sub templates verbatim, inserting a literal
    # backslash into the output text.  '\1. \3' is the intended form.
    text = re.sub(r'([a-z0-9])([\n]+)([A-Z])', r'\1. \3', text)
    # Collapse runs of periods (possibly whitespace-separated) into one.
    text = re.sub(r'(\.)([\s]*)([\.]+)', r'\1', text)
    # Re-join hyphenated words split by whitespace ("trans - anethole").
    text = re.sub(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5', text)
    # Flatten any remaining line breaks into single spaces.
    return " ".join(line.strip() for line in text.splitlines())
|
| 36 |
+
|
| 37 |
+
def extract_sentences(text):
    """Split normalized text into sentences.

    Splits on whitespace that follows '.', '?' or '!', while skipping
    abbreviation-like contexts (e.g. "e.g." or "Mr.") via the negative
    lookbehinds.
    """
    sentence_boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    return sentence_boundary.split(text)
|
| 40 |
+
|
| 41 |
+
def contains_valid_numeric(sentence):
    """Return True if the sentence contains at least one numeric token that
    is not merely a bracketed reference like "(12/3)"."""
    numeric_tokens = numeric_regex.findall(sentence)
    if not numeric_tokens:
        return False
    # Reject the sentence only when every numeric token is accounted for by
    # bracketed citation-style numbers.
    return len(numeric_tokens) != len(exclude_brackets_regex.findall(sentence))
|
| 46 |
+
|
| 47 |
+
# Compiled once at module load; the original recompiled this pattern on
# every call to matches_criteria.
_time_duration_regex = re.compile(
    r'\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s*'
    r'(?:years|year|weeks|week|months|month|days|day)\b',
    re.IGNORECASE,
)


def _contains_any_keyword(sentence_lower, group):
    """Case-insensitive substring test of a config keyword group.

    BUG FIX: the original compared raw mixed-case keywords (e.g.
    "Case Report", "Showed") against the already-lowercased sentence, so
    capitalized entries could never match.  Lowercase both sides.
    """
    return any(kw.lower() in sentence_lower for kw in keywords[group])


def matches_criteria(sentence, check_time_duration=False):
    """Check if a sentence matches any of the defined keyword criteria.

    Sentences that look like dates or table rows are rejected outright.
    When check_time_duration is True, only the numeric-time-duration test
    is applied; otherwise the sentence must contain a valid numeric value
    AND at least one demographic / study-design signal.
    """
    if date_regex.search(sentence) or table_regex.match(sentence):
        return False

    sentence_lower = sentence.lower()

    # Gender: whole-word match only.
    contains_gender = bool(gender_regex.search(sentence))

    # Age: numeric value (optionally an en-dash range) + age-related unit.
    contains_age_and_numeric = bool(re.search(
        r"\b(?:\d{1,3}(?:–\d{1,3})?)\s*(?:years?|year-old|year olds?|aged\b|ages\b)\b",
        sentence, re.IGNORECASE
    ))

    # Patients / participants: numeric count + noun.
    contains_patients_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:patient|patients|case|cases|subject|subjects)\b",
        sentence, re.IGNORECASE
    ))
    contains_participants_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:participant|participants|attendee|respondent|volunteer)\b",
        sentence, re.IGNORECASE
    ))

    # Inclusion / exclusion criteria: numeric + keyword.
    contains_inclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:inclusion|eligibility criteria|study inclusion)\b",
        sentence, re.IGNORECASE
    ))
    contains_exclusion_and_numeric = bool(re.search(
        r"\b(\d+)\s*(?:exclusion|study exclusion|not eligible)\b",
        sentence, re.IGNORECASE
    ))

    # Time durations: numeric or spelled-out count + time unit.
    contains_time_duration = bool(_time_duration_regex.search(sentence))
    if check_time_duration:
        return contains_time_duration

    # Keyword-group substring matches.
    contains_comorbidities = _contains_any_keyword(sentence_lower, "Co-morbidities")
    contains_remark = _contains_any_keyword(sentence_lower, "Remark")
    contains_intervention = _contains_any_keyword(sentence_lower, "Intervention Groups")
    contains_study_type = _contains_any_keyword(sentence_lower, "Study Types")
    contains_country = _contains_any_keyword(sentence_lower, "Country")
    # NOTE(review): the original computed a Race/Ethnicity match but never
    # used it in the decision below; the dead computation was dropped here
    # — confirm whether it should be part of the disjunction.

    return (
        contains_valid_numeric(sentence) and (
            contains_gender
            or contains_age_and_numeric
            or contains_patients_and_numeric
            or contains_participants_and_numeric
            or contains_inclusion_and_numeric
            or contains_exclusion_and_numeric
            or contains_comorbidities
            or contains_time_duration
            or contains_remark
            or contains_intervention
            or contains_study_type
            or contains_country
        )
    )
|
| 131 |
+
|
| 132 |
+
def matches_keyword(sentence, user_keywords):
    """Return True when the sentence contains any user keyword
    (case-insensitive substring match)."""
    lowered = sentence.lower()
    for keyword in user_keywords:
        if keyword.lower() in lowered:
            return True
    return False
|
| 135 |
+
|
| 136 |
+
def extract_authors(page):
    """Extract candidate author names from the region above the first body
    section header on a PDF page.

    Combines a capitalized-name regex (config.author_pattern) with spaCy
    PERSON entities, then drops known non-name words.

    Args:
        page: a PyMuPDF page object (anything with a get_text() method).

    Returns:
        A list of unique candidate author names.
    """
    full_text = page.get_text()

    # Locate each known section header; keep only those actually present.
    section_positions = {section: full_text.find(section) for section in key_sections}
    section_positions = {k: v for k, v in section_positions.items() if v != -1}

    # Only the text above the earliest header can contain the author list.
    if section_positions:
        cutoff_position = min(section_positions.values())
        text_to_search = full_text[:cutoff_position]
    else:
        text_to_search = full_text

    # Regex candidates: runs of capitalized words.
    author_matches = re.findall(author_pattern, text_to_search)

    # NLP candidates: spaCy named-entity PERSON spans.
    doc = nlp(text_to_search)
    nlp_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # BUG FIX: candidates are lowercased before the exclusion test, but
    # exclude_words may hold mixed-case entries ("Aim", "India", ...), so
    # the filter never removed them.  Compare against a lowercased copy.
    exclude_lower = {word.lower() for word in exclude_words}
    combined_names = set(author_matches + nlp_names)
    return [name for name in combined_names if name.lower() not in exclude_lower]
|
| 165 |
+
|
| 166 |
+
def highlight_keywords(sentence, user_keywords):
    """Wrap every occurrence of a user keyword in the sentence with
    <mark> tags.

    Single-word keywords match only on word boundaries; multi-word phrases
    match anywhere.  Matching is case-insensitive and the matched text's
    original casing is preserved in the output.
    """
    if not user_keywords:
        return sentence

    # Split keywords by kind and regex-escape them.
    word_alternatives = [re.escape(kw) for kw in user_keywords if ' ' not in kw]
    phrase_alternatives = [re.escape(kw) for kw in user_keywords if ' ' in kw]

    pattern_parts = []
    if word_alternatives:
        pattern_parts.append(r'\b(?:' + '|'.join(word_alternatives) + r')\b')
    if phrase_alternatives:
        pattern_parts.append(r'(?:' + '|'.join(phrase_alternatives) + r')')

    if not pattern_parts:
        return sentence

    highlighter = re.compile('|'.join(pattern_parts), re.IGNORECASE)
    return highlighter.sub(lambda m: f"<mark>{m.group(0)}</mark>", sentence)
|
| 201 |
+
|
| 202 |
+
def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process a PDF file: extract authors from the first page, collect
    sentences matching the extraction criteria, filter them by user
    keywords (skipped in time-duration mode), and highlight the keywords.

    Returns:
        (highlighted_sentences, authors_str)
    """
    doc = fitz.open(file_path)
    # BUG FIX: close the document even when extraction raises; the
    # original only closed it on the success path, leaking the handle.
    try:
        author_names = extract_authors(doc[0])
        authors_str = ', '.join(author_names)

        all_extracted_sentences = []
        for page in doc:
            text = normalize_text(page.get_text())
            sentences = extract_sentences(text)
            all_extracted_sentences.extend(
                sentence.strip()
                for sentence in sentences
                if matches_criteria(sentence, check_time_duration)
            )
    finally:
        doc.close()

    if check_time_duration:
        filtered_sentences = all_extracted_sentences
    else:
        filtered_sentences = [
            sentence for sentence in all_extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    # Highlight keywords in the filtered sentences.
    highlighted_sentences = [highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences]
    return highlighted_sentences, authors_str
|
| 230 |
+
|
| 231 |
+
def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Process raw input text: collect sentences matching the extraction
    criteria, filter them by user keywords (skipped in time-duration mode),
    highlight the keywords, and return them with a placeholder author
    string (authors are only extracted from PDFs).
    """
    sentences = extract_sentences(normalize_text(input_text))
    extracted_sentences = [
        sentence.strip()
        for sentence in sentences
        if matches_criteria(sentence, check_time_duration)
    ]

    if check_time_duration:
        filtered_sentences = extracted_sentences
    else:
        filtered_sentences = [
            sentence for sentence in extracted_sentences
            if matches_keyword(sentence, user_keywords)
        ]

    # Highlight keywords in the filtered sentences.
    highlighted = [highlight_keywords(sentence, user_keywords) for sentence in filtered_sentences]
    return highlighted, "Authors not extracted from text input."
|
| 249 |
+
|
| 250 |
+
def handle_input(file_path=None, input_text=None, keyword_group=None, custom_keywords=None, time_duration=False):
    """
    Gradio callback: assemble the keyword set, process the uploaded file or
    pasted text, and return an HTML string with the authors and highlighted
    sentences (or a plain status message).
    """
    # BUG FIX: copy the configured list before extending it.  The original
    # called .extend() on the list returned by keywords.get(), mutating the
    # shared config dict so custom keywords leaked into keywords[group] and
    # persisted across requests.
    user_keywords = list(keywords.get(keyword_group, [])) if keyword_group else []
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",") if kw.strip())

    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if not extracted_sentences:
        return "No matching sentences found."

    # Combine authors and highlighted sentences into HTML, one paragraph
    # per sentence (join avoids quadratic string concatenation).
    parts = [f"<p><b>Authors:</b> {authors_str}</p>"]
    parts.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
    return "".join(parts)
|
| 280 |
+
|
| 281 |
+
# Gradio Interface
# Wires handle_input to a four-field UI.  The positional order of `inputs`
# must match handle_input's parameter order
# (file_path, input_text, keyword_group, custom_keywords).
iface = gr.Interface(
    fn=handle_input,
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # gr.Checkbox(
        #     label="Check Time Duration Criteria",
        #     value=False
        # )
    ],
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
    <div style='text-align: left;'>
    Made by: Sumit Kumar (2311006), Ramavath Tharun (21219) <br>
    Supervisor: Dr. Tanmay Basu<br>
    Indian Institute of Science Education and Research<br>
    </div>
    <div style='text-align: center; margin-top: 10px;'>
    <b>Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.</b>
    </div>
    """,
    examples=None,  # You can add example files or texts if desired
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode) — confirm the pinned gradio version
    # still accepts it.
    allow_flagging="never",
    # NOTE(review): cache_examples=True has no effect while examples=None.
    cache_examples=True,
    # Add custom CSS to style the <mark> tag if necessary
    css="""
    mark {
        background-color: blue;
        padding: 0;
        border-radius: 2px;
    }
    /* Optional: Adjust paragraph spacing */
    p {
        margin-bottom: 10px;
    }
    """
)

# share=True additionally exposes a temporary public Gradio link.
iface.launch(share=True)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
|