# sumit4352's picture
# Upload 2 files
# ae4fa62 verified
import re
# Keywords
# Maps a category label to the literal strings searched for in that category.
# Several entries are deliberately padded with spaces or punctuation
# (" man ", "age,", " double-blind "), presumably to avoid substring hits such
# as "man" inside "woman" -- keep that padding when editing.
# The "Gender", "Age", "Outcomes", "Assessment Tools" and "Follow-Up" lists are
# also compiled into regexes further down in this file.
keywords = {
    "Gender": ["male", "female", " man ", "woman", " men ", " men,", "women", "boy", "girl", "males", "females"],
    "Age": [" age ", "age,", " aged ", "years old", "year-old", "year olds", "elderly", "adults", "young", "youth"],
    "Patients": ["patient", "patients", "case", "cases", "subject", "subjects", "individual", "individuals"],
    "Participants": ["participant", "participants", "attendee", "attendees", "respondent", "respondents"],
    "Inclusion Criteria": ["inclusion", "eligibility criteria", "study inclusion", "included"],
    "Exclusion Criteria": ["exclusion", "not eligible", "study exclusion", "excluded"],
    "Study Types": [
        "Case Report", "Case Series", "Cross-sectional Study", "Case-Control Study", "Cohort Study", "Randomized Controlled Clinical Trial",
        "Non-Randomized Controlled Trial", "Pilot Study", "Feasibility Study", "Longitudinal Study", "Retrospective Study", "Prospective Study",
        "Observational Study", "Experimental Study", "Interventional Study", "Descriptive Study", "Analytical Study", "Quasi-Experimental Study",
        "Epidemiological Study", "Ecological Study", "Systematic Review", "Meta-Analysis", "Mixed-Methods Study", "Narrative Review", "Scoping Review",
        "Rapid Review", "Umbrella Review", "Diagnostic Accuracy Study", "Validation Study", "Genome-Wide Association Study (GWAS)",
        "Gene-Environment Interaction Study", "Linkage Study", "Sensitivity/Specificity Study", "Cost-Effectiveness Study", "Health Technology Assessment",
        "Quality Improvement Study", "Translational Research", "Implementation Science Study", "Psychometric Study", "Community-Based Participatory Research (CBPR)",
        "In Vitro Study", "In Vivo Study", "Simulation Study", "Phenomenological Study", "Ethnographic Study", "Grounded Theory Study", "Narrative Study",
        "Case Study", "Pragmatic Trial", "Cluster Randomized Trial", "Adaptive Trial", "Phase 1 Clinical Trial", "Phase 2 Clinical Trial", "Phase 3 Clinical Trial",
        "Phase 4 Clinical Trial", "Real-World Evidence Study", "Comparative Effectiveness Study", "Proof-of-Concept Study", "Dose-Response Study", "Cross-Over Study",
        "Nested Study", "Multicenter Study", "Delphi Study", "Pragmatic Clinical Trial", "Registry-Based Study", "Historical Cohort Study",
        "Nested Case-Control Study", " double-blind ", "double blind", "placebo-controlled", "placebo controlled", "Cross-sectional analysis"
    ],
    "Co-morbidities": ["comorbidities", "co-morbidities", "comor-bidities", " comorbidities ", "comorbidities"],
    "Country": ["Afghanistan", "Australia", "Brazil", "Canada", "China", "France", "Germany", "India", "Japan", "Mexico", "Nigeria", "Russia",
                "South Africa", "United Kingdom", "United States", "Prefer Not to Answer"],
    "Race/Ethnicity": ["white", "Black", "African American", "Asian", "Native Hawaiian", "Other Pacific Islander", "American Indian",
                       "Alaska Native", "Other Race", "Two or More Races", "Hispanic", "latino", "Not Hispanic or latino"],
    # Time units only; the counts that precede them are handled by the
    # follow-up regex defined later in this file.
    "Follow-Up": ["years", "year", "weeks", "week", "months", "month", "days", "day"],
    # Each row lists near-synonyms followed by a capitalised head word.
    "Remark": [
        "displayed", "exhibited", "revealed", "indicated", "illustrated", "Showed",
        "noticed", "perceived", "detected", "discerned", "identified", "Observed",
        "progress", "enhancement", "advancement", "growth", "betterment", "Improvement",
        "proved", "exhibited", "showcased", "conveyed", "validated", "Demonstrated",
        "similar", "equivalent", "parallel", "analogous", "akin", "Comparable",
        "more secure", "less risky", "protected", "shielded", "guarded", "Safer",
        "chosen", "picked", "opted", "designated", "elected", "Selected"
    ],
    # Singular and plural spelling of every group label.
    "Intervention Groups": [
        "intervention group", "intervention groups", "treatment groups", "treatment group", "control groups", "control group", "placebo group",
        "placebo groups"
    ],
    "Outcomes": [
        "results", "findings", "observations", "conclusion", "outcome", "clinical outcome", "results:",
        "efficacy", "effectiveness", "treatment response", "pain reduction", "symptom improvement",
        "disease progression", "treatment success", "remission rate", "response rate", "conclusion:",
        "adverse effects", "side effects", "complications", "recurrence", "recovery time", "result:",
        "statistical significance", "p-value", "confidence interval", "hazard ratio", "risk reduction"
    ],
    "Assessment Tools": [
        "Visual Analog Scale (VAS)", "WOMAC", "Western Ontario and McMaster Universities Osteoarthritis Index",
        "Numeric Rating Scale (NRS)", "McGill Pain Questionnaire (MPQ)", "Timed Up and Go Test (TUG)",
        "6-Minute Walk Test (6MWT)", "gait analysis", "joint range of motion", "functional independence measure",
        "SF-36", "EQ-5D", "Beck Depression Inventory", "Hospital Anxiety and Depression Scale (HADS)",
        "blood tests", "CRP", "C-reactive protein", "ESR", "TNF-α", "IL-6", "synovial fluid analysis",
        "X-ray", "Kellgren-Lawrence grade", "MRI", "magnetic resonance imaging", "musculoskeletal ultrasound",
        "cartilage thickness", "bone marrow lesion", "mental health scales", "quality of life assessments", "ELISA"
    ]
}
# Author name regex pattern (kept as a plain string, not compiled here).
# The fragments below concatenate into a single pattern; matching is
# case-sensitive unless the caller passes its own flags.
author_pattern = (
    r"\b(?:[A-Z]\.\s*)*"             # zero or more leading initials, e.g. "J. K. "
    r"[A-Z][a-zA-Z\.\-\']+"          # first word: capital letter, then letters . - '
    r"(?:\s[A-Z][a-zA-Z\.\-\']+)*"   # further capitalised words, one whitespace apart
    r"\b(?:\s[0-9]+)?"               # optional trailing number after one whitespace
)
# Words and patterns to exclude
# Case-sensitive set of tokens, grouped by category purely for readability;
# the groups are merged into one flat set.
exclude_words = set().union(
    # Document boilerplate and topic-specific words
    ("Aim", "This", "Article", "Topical", "compress", "Research", "Capsi",
     "others", "Ginger", "Migraine"),
    # Institution words
    ("School", "University", "College", "Institute"),
    # Country names
    ("India", "Australia"),
    # Month names
    ("January", "February", "March", "April", "May", "June", "July",
     "August", "September", "October", "November", "December"),
    # Weekday names
    ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
     "Sunday"),
    # Clock markers
    ("AM", "PM"),
    # Lower-case function words and fragments
    ("the", "with", "and", "of", "in", "on", "at", "by", "for", "about",
     "against", "between", "into", "through", "during", "before", "after",
     "above", "below", "to", "from", "up", "down", "out", "over", "under",
     "again", "further", "then", "once", "here", "there", "when", "where",
     "why", "how", "all", "any", "both", "each", "few", "more", "most",
     "other", "some", "such", "no", "nor", "not", "only", "own", "same",
     "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
     "should", "now"),
)
# Regex patterns
# Numeric tokens. Alternatives, tried in order: an optionally signed integer or
# decimal, a digit range ("10-20"), or space-separated digit groups; each may
# carry a "%".
# NOTE: because of the closing \b, a final "%" is only kept when a word
# character follows it, so "50% of" yields "50".
numeric_regex = re.compile(r"\b(?:-?\d+\.?\d*%?|\d+-\d+%?|\d+(?: \d+)*%?)\b")
# Bracketed or parenthesised groups made only of digits, commas, slashes and
# hyphens, e.g. "[12]", "(3,4)", "[1-3]". The delimiter classes must contain the
# escaped square brackets; the previous "$$" placeholders matched "$" instead
# and never matched "[" or "]".
exclude_brackets_regex = re.compile(r"[\[(]\s*[\d,/-]+\s*[\])]")
# Dates written as d/m/y, d-m-y (1-2 digit day and month, 2-4 digit year) or
# "<Month name> d, yyyy"; month names are matched case-insensitively.
date_regex = re.compile(r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2}-\d{1,2}-\d{2,4}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4})\b", re.IGNORECASE)
# A whole line consisting solely of whitespace-separated integers. Every
# number, including the last, must be followed by whitespace, so "1 2 3\n"
# matches while a stripped "1 2 3" does not.
table_regex = re.compile(r"^(?:\s*\d+\s+)+$")
# Build regex patterns for exact matches
def _keyword_regex(terms):
    """Compile a case-insensitive alternation of the literal strings in *terms*.

    A ``\\b`` anchor is placed on a side of a term only when the term starts
    (or ends) with a word character on that side. Wrapping every term in
    ``\\b...\\b`` unconditionally makes terms that begin or end with a space or
    punctuation (" men,", "results:", "Visual Analog Scale (VAS)") require a
    word character immediately outside them, which ordinary text rarely
    provides, so those terms would effectively never match.

    Terms keep their list order inside the alternation, so earlier entries win
    when several could match at the same position. Empty strings are skipped.
    """
    alternatives = []
    for term in terms:
        if not term:
            continue
        # \w is evaluated by the regex engine itself so the notion of "word
        # character" is exactly the one \b uses.
        left = r"\b" if re.match(r"\w", term[0]) else ""
        right = r"\b" if re.match(r"\w", term[-1]) else ""
        alternatives.append(left + re.escape(term) + right)
    if not alternatives:
        # Nothing to look for: return a pattern that can never match rather
        # than one that matches the empty string everywhere.
        return re.compile(r"(?!)")
    return re.compile("(?:" + "|".join(alternatives) + ")", re.IGNORECASE)


gender_regex = _keyword_regex(keywords["Gender"])
age_regex = _keyword_regex(keywords["Age"])
outcomes_regex = _keyword_regex(keywords["Outcomes"])
assessment_tools_regex = _keyword_regex(keywords["Assessment Tools"])
# Time duration regex pattern
# Shape of a match, case-insensitive:
#   1. a count: digits, or a number word from "one" to "twelve"
#   2. optional whitespace
#   3. one of the units listed under keywords["Follow-Up"]
#   4. a tail: nothing, an extra "s", a hyphen, or " to <digits>",
#      closed by a word boundary
follow_up = re.compile(
    r"\b(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)"
    r"\s*(?:"
    + "|".join(re.escape(unit) for unit in keywords["Follow-Up"])
    + r")(?:\b|s\b|-| to \d+)\b",
    flags=re.IGNORECASE,
)
# Key sections for extraction
# Ordered list of section headings; the order is part of the value.
key_sections = [
    # Condensed descriptions
    "Summary",
    "Overview",
    "Synopsis",
    # Reported results
    "Results",
    "Findings",
    "Observations",
    "Conclusion",
    # Measurement and analysis
    "Assessment",
    "Evaluation",
    "Outcomes",
    "Measurements",
    "Test Results",
    "Analysis",
    # Front matter, including the letter-spaced heading variant
    "Abstract",
    "A B S T R A C T",
    "Background",
]