Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,509 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import re
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
import io
|
| 7 |
+
|
| 8 |
+
# Send INFO-and-above records to the root logger, stamped with time and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _spelling_hits(variant, text, year_pattern, window=200):
    """Return whole-word, case-insensitive matches of *variant* in *text*.

    A match is dropped when *year_pattern* is found in the *window*
    characters that follow it.
    """
    word = re.compile(r'\b' + variant + r'\b', re.I)
    return [
        match for match in word.finditer(text)
        if not year_pattern.search(text[match.end():match.end() + window])
    ]


def _format_examples(matches, text, limit=3, padding=20):
    """Render up to *limit* matches as bullet lines with *padding* chars of context."""
    lines = []
    for match in matches[:limit]:
        start, end = match.span()
        context = text[max(0, start - padding):end + padding]
        lines.append(f" • ...{context}...\n")
    return "".join(lines)


def check_US_UK_consistency(text):
    """
    Report words that appear in both their US and their UK spelling.

    For every (US, UK) pair the text is searched case-insensitively for whole
    words. A hit is ignored when an integer between 1900 and 2100 appears
    within 200 characters after it (presumably to skip titles quoted in
    reference lists -- confirm with the author). A pair is reported only when
    both spellings survive that filter.

    Args:
        text (str): The string to search through.

    Returns:
        list: One multi-line message per inconsistent pair, each showing up to
        three context snippets per spelling; an empty list if none is found.
    """
    spelling_pairs = [
        # UK side omits "analyses": it is also the plural of "analysis" in US English.
        ('analyz(?:e|ed|es|ing)', 'analys(?:e|ed|ing)'),
        ('(?:un)?catalyz(?:e|ed|es|ing)', '(?:un)?catalys(?:e|ed|es|ing)'),
        ('sulfur', 'sulphur'),
        ('aluminum', 'aluminium'),
        ('color(?:ed|ing|s|less)?', 'colour(?:ed|ing|s|less)?'),
        ('flavor(?:ed|ing|s)?', 'flavour(?:ed|ing|s)?'),
        ('liter', 'litre'),
        ('fiber', 'fibre'),
        ('meter', 'metre'),
        ('neighbor(?:ed|ing|s)?', 'neighbour(?:ed|ing|s)?'),
        ('(?:re)?organiz(?:e|ed|ing|es|ation)', '(?:re)?organis(?:e|ed|ing|es|ation)'),
        ('vapor', 'vapour'),
        ('behavior', 'behaviour'),
        ('realiz(?:e|ed|ing|es|ation)', 'realis(?:e|ed|ing|es|ation)'),
        ('synthetize(?:d|s)?', 'synthetise(?:d|s)?'),
        ('characteriz(?:e|ed|ing|es|ation)', 'characteris(?:e|ed|ing|es|ation)'),
        ('(?:re)?crystalliz(?:e|ed|ing|es|ation)', '(?:re)?crystallis(?:e|ed|ing|es|ation)'),
        ('polymeriz(?:e|ed|ing|es|ation)', 'polymeris(?:e|ed|ing|es|ation)'),
        ('oxidized', 'oxidised'),
        ('neutraliz(?:e|ed|ing|es|ation)', 'neutralis(?:e|ed|ing|es|ation)'),
        ('hydrolyzed', 'hydrolysed'),
        ('standardiz(?:e|ed|ing|es|ation)', 'standardis(?:e|ed|ing|es|ation)'),
        ('ioniz(?:e|ed|ing|es|ation)', 'ionis(?:e|ed|ing|es|ation)'),
        ('solubiliz(?:e|ed|ing|es|ation)', 'solubilis(?:e|ed|ing|es|ation)'),
        ('functionalized', 'functionalised'),
        ('electrolyzed', 'electrolysed'),
        ('homogeniz(?:e|ed|ing|es|ation)', 'homogenis(?:e|ed|ing|es|ation)'),
        ('lyophiliz(?:e|ed|ing|es|ation)', 'lyophilis(?:e|ed|ing|es|ation)'),
        ('polariz(?:e|ed|ing|es|ation)', 'polaris(?:e|ed|ing|es|ation)'),
        ('isomeriz(?:e|ed|ing|es|ation)', 'isomeris(?:e|ed|ing|es|ation)'),
        ('immobiliz(?:e|ed|ing|es|ation)', 'immobilis(?:e|ed|ing|es|ation)'),
        ('stabiliz(?:e|ed|ing|es|ation)', 'stabilis(?:e|ed|ing|es|ation)'),
        ('optimiz(?:e|ed|ing|es|ation)', 'optimis(?:e|ed|ing|es|ation)'),
        ('odor', 'odour'),
        ('galvaniz(?:e|ed|ing|es|ation)', 'galvanis(?:e|ed|ing|es|ation)'),
        # Only the inflected forms differ; "model(s)"/"label(s)" are spelled the
        # same in both variants and must not count as a US hit.
        ('(?:re)?model(?:ing|ed)', '(?:re)?modell(?:ing|ed)'),
        ('(?:re)?label(?:ing|ed)', '(?:re)?labell(?:ing|ed)'),
        ('gray', 'grey'),
    ]

    year_pattern = re.compile(r'\b(19\d{2}|20\d{2}|2100)\b')

    issues = []
    for us, uk in spelling_pairs:
        us_hits = _spelling_hits(us, text, year_pattern)
        uk_hits = _spelling_hits(uk, text, year_pattern)

        # Inconsistent only when both variants are actually used.
        if not (us_hits and uk_hits):
            continue

        issues.append(
            "Inconsistent UK/US spelling detected:\n\n"
            "US spelling examples:\n"
            + _format_examples(us_hits, text)
            + "\nUK spelling examples:\n"
            + _format_examples(uk_hits, text)
            + "\n→ Reminder: Maintain consistent spelling throughout the manuscript!"
        )

    return issues
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def transform_citations(text, journal_patterns=None):
    """
    Rewrite citations from "Journal Vol(Issue), Pages (Year)" to
    "Journal Year, Vol(Issue), Pages".

    Args:
        text (str): Text that may contain citations.
        journal_patterns (list[str] | None): Regex fragments, one per journal
            abbreviation. When None, a built-in list of chemistry journals is
            used. An empty list means "no journals" and leaves the text
            unchanged.

    Returns:
        str: The text with every recognised citation reordered. Punctuation
        that follows a citation is left as it was.
    """
    if journal_patterns is None:
        # Default patterns for common journals; every literal period is escaped.
        journal_patterns = [
            r'J\. Am\. Chem\. Soc\.',
            r'Chem\. Eur\. J\.',
            r'Angew\. Chem\. Int\. Ed\.',
            r'ACS Catal\.',
            r'Org\. Lett\.',
            r'Tetrahedron Lett\.',
            # Add more journal patterns as needed
        ]

    # An empty alternation would match an empty journal name.
    if not journal_patterns:
        return text

    # Named groups keep the lookup correct even when a caller-supplied journal
    # pattern contains capturing groups of its own.
    journal_group = '(?P<journal>' + '|'.join(journal_patterns) + ')'
    volume_group = r'(?P<volume>\d+(?:\(\d+\))?)'  # volume with optional "(issue)"
    pages_group = r'(?P<pages>\d+(?:[-–]\d+)?)'    # single page or hyphen/en-dash range
    year_group = r'\((?P<year>\d{4})\)'

    citation = re.compile(
        journal_group + r'\s+' + volume_group + r',\s*' + pages_group + r'\s*' + year_group
    )

    def replace_citation(match):
        # A period after the citation is not part of the match, so re.sub keeps
        # it; appending another one here would produce "..".
        return (
            f"{match.group('journal')} {match.group('year')}, "
            f"{match.group('volume')}, {match.group('pages')}"
        )

    return citation.sub(replace_citation, text)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def validate_citation(text):
    """
    Check "<Journal> <Year>, <Volume>" citations for a plausible year/volume pair.

    For each supported journal, Year - Volume must equal a fixed offset (the
    year before volume 1 appeared). Only four-digit years are considered, so a
    citation still in "Journal Vol, Pages (Year)" order is skipped instead of
    having its volume read as a year.

    Args:
        text (str): Text that may contain citations.

    Returns:
        list: One message per citation whose year and volume disagree; an
        empty list if every recognised citation is consistent.
    """
    # Journal abbreviation -> (publication year - volume number)
    journal_offsets = {
        "J. Am. Chem. Soc.": 1878,
        "Org. Lett.": 1998,
        "Chem. Eur. J.": 1994,
        "ACS Catal.": 2010,
        "Angew. Chem. Int. Ed.": 1961,
        "Tetrahedron Lett.": 1959,
    }

    # Longest names first so a longer abbreviation wins over any shorter one
    # it might contain.
    sorted_journals = sorted(journal_offsets, key=len, reverse=True)
    journals_regex = '|'.join(re.escape(name) for name in sorted_journals)

    # journal, four-digit year, volume (a trailing "(issue)" is simply not consumed)
    pattern = re.compile(f"({journals_regex})\\s+(\\d{{4}}),\\s*(\\d+)")

    results = []
    for match in pattern.finditer(text):
        journal = match.group(1)
        year = int(match.group(2))
        volume = int(match.group(3))

        # The journal always comes from the alternation built from the dict keys.
        offset = journal_offsets[journal]
        if year - volume != offset:
            citation = f"{journal} {year}, {volume}"
            results.append(f"{citation}: wrong year or volume (expected offset {offset})")

    return results
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# Rules applied before the name-pair rules, as (regex, message) in reporting order.
_RULES_BEFORE_NAMES = (
    (r'\b(\S+\s+\d+(?:\.\d+)?\s+oC\b)', "Use the ° symbol in °C, not a superscripted o: "),
    (r'\b\d+(?:\.\d+)?\s+%\s+yield\b', "No space between the numeric value and %: "),
    (r'\b\d+(?:\.\d+)?\s*mg/ml\b', "The volume is specified in mL, not ml: "),
    (r'\b\d+(?:\.\d+)?\s+ml\b', "The volume is specified in mL, not ml: "),
    (r'\b(?:one|two|three)(?!-)\s+neck(?:ed)?\b|\b(?:round|flat)(?!-)\s+bottom\b|\bpear(?!-)\s+shaped\b',
     "Hyphenate 'one-necked' and 'round-bottom' (e.g. one-necked round-bottom flask): "),
    # "m?L" accepts both L and mL; the former "[mL]L" matched "LL" instead of "L".
    (r'\b\d+(?:\.\d+)?[-]\s*m?L\s+round\b', "No hyphen around L and mL"),
    (r'\banti-bacterial\b', "Use 'antibacterial' without hyphen: "),
    (r'\bco-operation\b', "Use 'cooperation' without hyphen: "),
    (r'\bmicro-organism\b', "Use 'microorganism' without hyphen: "),
    (r'\bmulti-colored\b', "Use 'multicolored' without hyphen: "),
    (r'\bnon-polar\b', "Use 'nonpolar' without hyphen: "),
    (r'\bphoto-redox\b', "Use 'photoredox' without hyphen: "),
    (r'\bpre-cooled\b', "Use 'precooled' without hyphen: "),
    (r'\bsuper-acid\b', "Use 'superacid' without hyphen: "),
    (r'\bmembered-ring\b', "Use 'membered ring' without hyphen: "),
    (r'\bMembered-Ring\b', "Use 'Membered Ring' without hyphen: "),
    (r'\bBronsted acid\b', "Use ø in Brønsted: "),
    (r'X-Ray', "Always use lowercase r in X-ray (even when capitalized): "),
    (r'x ray', "Use X and hyphen in X-ray: "),
    (r'X ray', "Use hyphen in X-ray: "),
    (r'\(-\)-', "Use (–)- instead of "),
    (r'\b\d+(?:\.\d+)?mL\b', "Missing space between value and mL: "),
    # Accept both the micro sign (U+00B5) and Greek mu (U+03BC).
    (r'\b\d+(?:\.\d+)?[µμ]m\b', "Missing space between value and µm: "),
    (r'\b\d+(?:\.\d+)?mm\b', "Missing space between value and mm: "),
    (r'\b\d+(?:\.\d+)?cm\b', "Missing space between value and cm: "),
    (r'\b\d+(?:\.\d+)?mg\b', "Missing space between value and mg: "),
    (r'\b\d+(?:\.\d+)?min\b', "Missing space between value and min: "),
    (r'(?<!\[)\b\d+(?:\.\d+)?M\b', "Missing space if M means molar (concentration): "),
    (r'\b\d+(?:\.\d+)?mM\b', "Missing space between value and mM: "),
    (r'\b\d+(?:\.\d+)?[µμ]M\b', "Missing space between value and μM: "),
    (r'\b(?!1[45]N)(\d+(?:\.\d+)?)N\b', "Missing space if N means normal (concentration): "),
    (r'\b\d+(?:\.\d+)?K\b', "Missing space if K means Kelvin: "),
    (r'\b\d+,\d+(?=\s?(?:g|mg|mol|mmol|M|h|min|°C|mL)\b)',
     "Incorrect use of a comma instead of a decimal point"),
    (r',\s*\d+\.\s+\d+(?=\s?(?:g|mg|mol|mmol|h|min|°C|mL)\b)', "Unintended space? "),
    (r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* °C\b', "Use en dash (–) for temperature ranges: "),
    (r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* g\b', "Use en dash (–) for mass ranges: "),
    (r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* mg\b', "Use en dash (–) for mass ranges: "),
    (r'from\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b',
     "Do not use en dash in a 'from X—Y' construction. Use 'from X to Y' instead: "),
    (r'between\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b',
     "Do not use en dash in a 'between X—Y' construction. Use 'between X and Y' instead: "),
    (r'\b\d+\s+fold\b', "Hyphenate numeral and 'fold': "),
    (r'\b\d+(?:\.\d+)?°C\b', "Missing space between value and °C: "),
    (r'\b\d+(?:\.\d+)?° K\b', "Use K without °, e.g. 298 K: "),
    (r'\b\d+(?:\.\d+)?±\d+(?:\.\d+)?\b', "Missing spaces around the ± symbol: "),
    (r'\b\d+(?:\.\d+)?\s*uL\b', "Use μL instead of uL for microliters: "),
    (r'\b\d+(?:\.\d+)?\s*ug\b', "Use μg instead of ug for micrograms: "),
    (r'\b\d+(?:\.\d+)?\s*umol\b', "Use μmol instead of umol for micromol: "),
    (r'\b\d+(?:\.\d+)?\s*uM\b', "Use μM instead of uM for micromolar: "),
    (r'\b\d+(?:\.\d+)?ppm\b', "Missing space between value and ppm: "),
    (r'\b\d+(?:\.\d+)?bar\b', "Missing space between value and bar: "),
    (r'\b\d+(?:\.\d+)?mbar\b', "Missing space between value and mbar: "),
    (r'\b\d+(?:\.\d+)?\s*mol/l\b', "Use mol/L instead of mol/l: "),
    (r'\b\d+(?:\.\d+)?\s*g/l\b', "Use g/L instead of g/l: "),
    (r'\b\d+(?:\.\d+)?\s*mol·l–1\b', "Use mol·L⁻¹ instead of mol·l⁻¹: "),
    (r'\b\d+(?:\.\d+)?\s*g·l–1\b', "Use g·L⁻¹ instead of g·l⁻¹: "),
    (r'\b\d+(?:\.\d+)?\s*mhz\b', "Use MHz (capital H): "),
    (r'\b\d+(?:\.\d+)?\s*gr\b', "Use g instead of gr: "),
    (r'\b\d+(?:\.\d+)?\s*hrs?\b', "Use h instead of hr/hrs: "),
    # Misspelled stems. Python regexes take no /.../ delimiters, and each stem
    # below is chosen so that it cannot occur inside the correct spelling.
    (r'[Ee]natio', "Misspelling of enantio...: "),
    (r'[Aa]symetr', "Misspelling of asymmetr...: "),
    (r'[Pp]thal', "Misspelling of phthal...: "),
    (r'[Nn]apth', "Misspelling of naphth...: "),
    (r'[Ss]terosel', "Misspelling of stereosel...: "),
    (r'\s+(-?\d+(\.\d+)?)\s+eq\.(?!\s*\d)', "Use 'equiv' for equivalents and 'eq.' for equation: "),
    (r'\s+(-?\d+(\.\d+)?)\s+eq\)(?!\s*\d)', "Use 'equiv' for equivalents: "),
    (r'[Cc]alc[\'´](?:d|ed)', "Use Calcd or calcd instead of "),
    (r'treated with', "Check if 'reacted/washed/extracted with' etc. is more appropriate than "),
)

# Joined names that take an en dash, listed in their hyphenated (wrong) form.
_HYPHENATED_NAME_PAIRS = (
    'Diels-Alder',
    'Bednorz-Müller',
    'Beer-Lambert',
    'Bose-Einstein',
    'Debye-Hückel',
    'Fermi-Dirac',
    'Fischer-Tropsch',
    'Fisher-Johns',
    'Flory-Huggins',
    'Franck-Condon',
    'Friedel-Crafts',
    'Geiger-Müller',
    'Henderson-Hasselbalch',
    'Jahn-Teller',
    'Lee-Yang-Parr',
    'Lineweaver-Burk',
    'Mark-Houwink',
    'Meerwein-Ponndorf',
    'Michaelis-Menten',
    'Stern-Volmer',
    "van't Hoff-Le Bel",
    'Wolff-Kishner',
    'Young-Laplace',
    'Ziegler-Natta',
    'Baeyer-Villiger',
    'Schotten-Baumann',
    'Buchwald-Hartwig',
    'Kumada-Corriu',
    'Nozaki-Hiyama',
    'Suzuki-Miyaura',
    'Mizoroki-Heck',
    'Wittig-Horner',
    'Claisen-Schmidt',
    'Stille-Kelly',
    'Reformatsky-Claisen',
    'Sonogashira-Hagihara',
    'Grubbs-Hoveyda',
    'Hoveyda-Grubbs',
    'Petasis-Ferrier',
    'Mukaiyama-Michael',
    'Tsuji-Trost',
    'Horner-Wadsworth-Emmons',
    'Jorgensen-Hayashi',
    'Jørgensen-Hayashi',
    'Ullmann-Goldberg',
    'Chan-Lam',
    'Hiyama-Denmark',
    'Negishi-Brown',
    'Corey-Fuchs',
    'Wacker-Tsuji',
    'Stork-Danheiser',
    'Balz-Schiemann',
    'Barton-McCombie',
    'Knoevenagel-Doebner',
    'Gattermann-Koch',
    'Mukaiyama-Mannich',
    'Evans-Tishchenko',
)

# Name pairs whose message differs from the generic "Use en dash (–) for ..." form.
_NAME_PAIR_MESSAGES = {
    'Jorgensen-Hayashi': "Use en dash (–) and ø in Jørgensen–Hayashi: ",
    'Jørgensen-Hayashi': "Use en dash (–) in Jørgensen–Hayashi: ",
}

# Rules applied after the name-pair rules, as (regex, message) in reporting order.
_RULES_AFTER_NAMES = (
    (r'\b\d+(?:\.\d+)?\s*degree\b', "Use ° instead of 'degree': "),
    (r'\b\d+(?:\.\d+)?\s*percent\b', "Use % instead of 'percent': "),
    (r'\bvacuumed\b', "Use 'evacuated' or 'under vacuum' instead of 'vacuumed': "),
    (r'\b[Hh]eated\s+up\b', "Omit 'up', use 'heated': "),
    (r'\b[Cc]ooled\s+down\b', "Omit 'down', use 'cooled': "),
    (r'(\d)\s*x\s*(\d)', "Use × operator instead of letter x for multiplication "),
    (r'->', "Use → instead of ->"),
    (r' MIN ', "Use min for minutes"),
    (r'vaccuum', "Misspelling of vacuum "),
    (r'reduced vacuum', "Use 'reduced pressure' "),
    (r'(\d)×(\d)', "Leave space before and after × operator "),
    (r'mol×L', "Use mol·L "),
    (r'g×mol', "Use g·mol "),
    (r'J×K', "Use J·K "),
    (r'J×mol', "Use J·mol "),
    (r'g×L', "Use g·L "),
    # Was a second 'g×L' key that overwrote the g·L message above.
    (r'mg×mL', "Use mg·mL "),
    (r'mol\.L-1', "Use mol·L-1 "),
    (r'mol\.mL', "Use mol·mL "),
    (r'g\.mol', "Use g·mol "),
    (r'J\.K', "Use J·K "),
    (r'J\.mol', "Use J·mol "),
    (r'g\.L-1', "Use g·L-1 "),
    # Was a second 'g\.L-1' key that overwrote the g·L-1 message above.
    (r'mg\.mL-1', "Use mg·mL-1 "),
)


def _build_style_rules():
    """Compile every style rule once, preserving the reporting order."""
    name_rules = tuple(
        (
            re.escape(name),
            _NAME_PAIR_MESSAGES.get(
                name, f"Use en dash (–) for {name.replace('-', '–')}: "
            ),
        )
        for name in _HYPHENATED_NAME_PAIRS
    )
    all_rules = _RULES_BEFORE_NAMES + name_rules + _RULES_AFTER_NAMES
    return tuple((re.compile(pattern), message) for pattern, message in all_rules)


_STYLE_RULES = _build_style_rules()


def check_text(text):
    """
    Scan *text* for chemistry-manuscript style problems.

    Every rule is a regular expression paired with an advice message. Each
    match produces one entry made of the message followed by the matched text
    with up to 5 characters of context before and 10 after.

    Args:
        text (str): The text to check.

    Returns:
        list: Warning strings grouped by rule in rule order, and within a rule
        in order of appearance; an empty list if nothing matches.
    """
    results = []
    text_length = len(text)
    for regex, message in _STYLE_RULES:
        for match in regex.finditer(text):
            start = max(0, match.start() - 5)
            end = min(text_length, match.end() + 10)
            results.append(f"{message}...{text[start:end]}...")
    return results
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using fitz (PyMuPDF).

    Args:
        pdf_file: The uploaded PDF file object; its ``getvalue()`` method is
            called to obtain the raw bytes.

    Returns:
        str: The text of all pages, each followed by a single space, or an
        empty string if extraction fails (the error is logged and shown in
        the Streamlit UI).
    """
    try:
        # Wrap the uploaded bytes in an in-memory stream for fitz.
        pdf_bytes = io.BytesIO(pdf_file.getvalue())

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() + " " for page in doc)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the app.
        logging.exception("Error extracting text from PDF")
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def main():
    """Build the Streamlit page: a PDF-upload tab and a pasted-text tab.

    Either tab hands its text to analyze_content() when its button is pressed.
    """
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="🧪", layout="wide")

    st.title("Chemistry Text Analyzer")
    st.write("""
    This app analyzes chemistry text for common errors, inconsistencies, and formatting issues.
    Upload a PDF file or paste your text in the box below to analyze it.
    """)

    # Create tabs for different input methods
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")

        if analyze_pdf and uploaded_file is None:
            # Mirror the text tab: tell the user why nothing happened.
            st.warning("Please upload a PDF file to analyze.")
        elif analyze_pdf:
            with st.spinner("Extracting text from PDF..."):
                text_content = extract_text_from_pdf(uploaded_file)

            if text_content:
                st.success(f"Successfully extracted text from {uploaded_file.name}")
                st.write("---")
                analyze_content(text_content)
            else:
                st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        # Text input area
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")

        if analyze_text:
            # Whitespace-only input has nothing to analyze either.
            if not text_input or not text_input.strip():
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # The checks expect one continuous line of text, so flatten newlines.
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def analyze_content(text_content):
    """
    Analyze the text content and display results.

    Runs the style checks, the US/UK spelling check and the citation checks,
    each in its own expander, then reports the elapsed time.

    Args:
        text_content: The text to analyze
    """
    start_time = time.time()

    with st.spinner("Analyzing text..."):
        # Use expanders for each analysis type to keep the UI clean
        with st.expander("Text Format and Style Issues", expanded=True):
            text_issues = check_text(text_content)
            if text_issues:
                for issue in text_issues:
                    st.write(issue)
            else:
                st.write("No text format or style issues found.")

        # Language Issues - Separate expander, not nested
        with st.expander("Language Issues", expanded=True):
            language_issues = check_US_UK_consistency(text_content)
            if language_issues:
                for issue in language_issues:
                    st.markdown(issue)
            else:
                st.write("No US/UK spelling inconsistencies found.")

        with st.expander("Citation Analysis", expanded=True):
            # Normalise citations to "Journal Year, Volume, Pages" first ...
            transformed_text = transform_citations(text_content)
            if transformed_text != text_content:
                st.write("Citations were transformed to the proper format.")

            # ... and validate that normalised text. validate_citation() reads
            # "<Journal> <Year>, <Volume>", so the untransformed text would
            # have its volume taken for the year.
            citation_issues = validate_citation(transformed_text)
            if citation_issues:
                st.write("Citation issues found:")
                for issue in citation_issues:
                    st.write(issue)
            else:
                st.write("No citation issues found.")

    elapsed_time = time.time() - start_time
    st.write(f"Analysis completed in {elapsed_time:.2f} seconds.")
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
# Script entry point: build the Streamlit page only when run directly.
if __name__ == "__main__":
    main()
|