Matchball committed on
Commit
19e2b80
·
verified ·
1 Parent(s): d16df8b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +509 -0
app.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import re
3
+ import time
4
+ import logging
5
+ import fitz # PyMuPDF
6
+ import io
7
+
8
+ # Configure logging
9
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
+
11
+
12
def check_US_UK_consistency(text):
    """
    Report words that appear in both their US and UK spelling in the same text.

    For every (US, UK) stem pair the text is searched case-insensitively on
    word boundaries. A match is ignored when a year between 1900 and 2100
    appears within the 200 characters following it (presumably to skip
    reference-list entries, whose titles keep their original spelling).
    An issue is reported only when both variants of a pair survive that filter.

    Args:
        text (str): The string to search through.

    Returns:
        list: One formatted message per inconsistent pair (with up to three
        context snippets per variant), or an empty list if none were found.
    """
    # Stems are written so that only spellings that actually differ between
    # the two variants can match: "model"/"label" alone are identical in US
    # and UK English, and "-yze"/"-yse" verbs drop the "e" before "-ing".
    spelling_pairs = [
        ('analyz(?:e|ed|ing)', 'analys(?:e|ed|ing)'),
        ('(?:un)?catalyz(?:e|ed|es|ing)', '(?:un)?catalys(?:e|ed|es|ing)'),
        ('sulfur', 'sulphur'),
        ('aluminum', 'aluminium'),
        ('color(?:ed|ing|s|less)?', 'colour(?:ed|ing|s|less)?'),
        ('flavor(?:ed|ing|s)?', 'flavour(?:ed|ing|s)?'),
        ('liter', 'litre'),
        ('fiber', 'fibre'),
        ('meter', 'metre'),
        ('neighbor(?:ed|ing|s)?', 'neighbour(?:ed|ing|s)?'),
        ('(?:re)?organiz(?:e|ed|ing|es|ation)', '(?:re)?organis(?:e|ed|ing|es|ation)'),
        ('vapor', 'vapour'),
        ('behavior', 'behaviour'),
        ('realiz(?:e|ed|ing|es|ation)', 'realis(?:e|ed|ing|es|ation)'),
        ('synthetize(?:d|s)?', 'synthetise(?:d|s)?'),
        ('characteriz(?:e|ed|ing|es|ation)', 'characteris(?:e|ed|ing|es|ation)'),
        ('(?:re)?crystalliz(?:e|ed|ing|es|ation)', '(?:re)?crystallis(?:e|ed|ing|es|ation)'),
        ('polymeriz(?:e|ed|ing|es|ation)', 'polymeris(?:e|ed|ing|es|ation)'),
        ('oxidized', 'oxidised'),
        ('neutraliz(?:e|ed|ing|es|ation)', 'neutralis(?:e|ed|ing|es|ation)'),
        ('hydrolyzed', 'hydrolysed'),
        ('standardiz(?:e|ed|ing|es|ation)', 'standardis(?:e|ed|ing|es|ation)'),
        ('ioniz(?:e|ed|ing|es|ation)', 'ionis(?:e|ed|ing|es|ation)'),
        ('solubiliz(?:e|ed|ing|es|ation)', 'solubilis(?:e|ed|ing|es|ation)'),
        ('functionalized', 'functionalised'),
        ('electrolyzed', 'electrolysed'),
        ('homogeniz(?:e|ed|ing|es|ation)', 'homogenis(?:e|ed|ing|es|ation)'),
        ('lyophiliz(?:e|ed|ing|es|ation)', 'lyophilis(?:e|ed|ing|es|ation)'),
        ('polariz(?:e|ed|ing|es|ation)', 'polaris(?:e|ed|ing|es|ation)'),
        ('isomeriz(?:e|ed|ing|es|ation)', 'isomeris(?:e|ed|ing|es|ation)'),
        ('immobiliz(?:e|ed|ing|es|ation)', 'immobilis(?:e|ed|ing|es|ation)'),
        ('stabiliz(?:e|ed|ing|es|ation)', 'stabilis(?:e|ed|ing|es|ation)'),
        ('optimiz(?:e|ed|ing|es|ation)', 'optimis(?:e|ed|ing|es|ation)'),
        ('odor', 'odour'),
        ('galvaniz(?:e|ed|ing|es|ation)', 'galvanis(?:e|ed|ing|es|ation)'),
        ('(?:re)?model(?:ing|ed)', '(?:re)?modell(?:ing|ed)'),
        ('(?:re)?label(?:ing|ed)', '(?:re)?labell(?:ing|ed)'),
        ('gray', 'grey'),
    ]

    year_pattern = re.compile(r'\b(19\d{2}|20\d{2}|2100)\b')

    def find_valid_matches(stem):
        """Return matches of `stem` that are not followed by a year within 200 chars."""
        return [
            m for m in re.finditer(r'\b' + stem + r'\b', text, re.I)
            if not year_pattern.search(text[m.end():m.end() + 200])
        ]

    def format_examples(matches):
        """Render up to three matches with 20 characters of context on each side."""
        return "".join(
            f" • ...{text[max(0, m.start() - 20):m.end() + 20]}...\n"
            for m in matches[:3]
        )

    issues = []
    for us, uk in spelling_pairs:
        valid_us_matches = find_valid_matches(us)
        valid_uk_matches = find_valid_matches(uk)

        # Only a mix of both variants is a problem; either one alone is fine.
        if not (valid_us_matches and valid_uk_matches):
            continue

        issues.append(
            "Inconsistent UK/US spelling detected:\n\n"
            "US spelling examples:\n"
            + format_examples(valid_us_matches)
            + "\nUK spelling examples:\n"
            + format_examples(valid_uk_matches)
            + "\n→ Reminder: Maintain consistent spelling throughout the manuscript!"
        )

    return issues
107
+
108
+
109
+
110
def transform_citations(text, journal_patterns=None):
    """
    Rewrite citations from "Journal Vol(Issue), Pages (Year)" to
    "Journal Year, Vol(Issue), Pages".

    Args:
        text (str): The text containing citations.
        journal_patterns (list[str] | None): Regex fragments, one per journal
            abbreviation to recognise. When None, a built-in list of common
            chemistry journals is used.

    Returns:
        str: The text with every recognised citation rewritten; any text
        outside a citation (including a trailing period) is left untouched.
    """
    if journal_patterns is None:
        # Default patterns for common journals. The abbreviations match the
        # ones used by validate_citation ("ACS Catal.", "Org. Lett.", ...).
        journal_patterns = [
            r'J\. Am\. Chem\. Soc\.',
            r'Chem\. Eur\. J\.',
            r'Angew\. Chem\. Int\. Ed\.',
            r'ACS Catal\.',
            r'Org\. Lett\.',
            r'Tetrahedron Lett\.',
            # Add more journal patterns as needed
        ]

    # Named groups keep the replacement correct even if a caller-supplied
    # journal pattern contains its own capturing groups.
    journal_group = f"(?P<journal>{'|'.join(journal_patterns)})"
    volume_group = r'(?P<volume>\d+(?:\(\d+\))?)'  # volume with optional "(issue)"
    pages_group = r'(?P<pages>\d+(?:[-–]\d+)?)'    # single page or hyphen/en-dash range
    year_group = r'\((?P<year>\d{4})\)'

    pattern = re.compile(
        f"{journal_group}\\s+{volume_group},\\s*{pages_group}\\s*{year_group}"
    )

    def replace_citation(match):
        # A period following the citation is not part of the match, so it
        # stays in the text by itself and must not be appended again here.
        return (
            f"{match.group('journal')} {match.group('year')}, "
            f"{match.group('volume')}, {match.group('pages')}"
        )

    return pattern.sub(replace_citation, text)
151
+
152
+
153
def validate_citation(text):
    """
    Check "<Journal> <Year>, <Volume>" citations for a plausible year/volume pair.

    For each supported journal, Year - Volume must equal a fixed offset (the
    year before volume 1 appeared). Only four-digit years are considered, so
    old-style "Journal Vol, Pages (Year)" citations are not misread as a
    year/volume pair.

    Args:
        text (str): The text containing citations.

    Returns:
        list: One message per citation whose year and volume do not fit the
        journal's offset; an empty list if all citations are consistent.
    """
    # Dictionary mapping journals to their founding year offsets
    journal_offsets = {
        "J. Am. Chem. Soc.": 1878,
        "Org. Lett.": 1998,
        "Chem. Eur. J.": 1994,
        "ACS Catal.": 2010,
        "Angew. Chem. Int. Ed.": 1961,
        "Tetrahedron Lett.": 1959,
    }

    # Longest names first so a longer abbreviation wins over a shorter one
    # that happens to be its prefix.
    names_by_length = sorted(journal_offsets, key=len, reverse=True)
    journals_regex = '|'.join(re.escape(name) for name in names_by_length)
    pattern = re.compile(f"({journals_regex})\\s+(\\d{{4}}),\\s*(\\d+)")

    results = []
    for match in pattern.finditer(text):
        journal = match.group(1)
        year = int(match.group(2))
        volume = int(match.group(3))

        # The journal was matched from the dictionary keys, so the lookup
        # cannot fail.
        offset = journal_offsets[journal]
        if year - volume != offset:
            results.append(
                f"{journal} {year}, {volume}: wrong year or volume (expected offset {offset})"
            )

    return results
194
+
195
+
196
def check_text(text):
    """
    Scan the text for formatting and style problems common in chemistry manuscripts.

    Each entry of the pattern table maps a regular expression to the warning
    shown for every match. Patterns are applied in table order.

    Args:
        text (str): The text to check.

    Returns:
        list: One string per match, consisting of the warning message followed
        by the match with up to 5 characters of context before and 10 after it.
    """
    patterns = {
        r'\b(\S+\s+\d+(?:\.\d+)?\s+oC\b)': "Use the ° symbol in °C, not a superscripted o: ",
        r'\b\d+(?:\.\d+)?\s+%\s+yield\b': "No space between the numeric value and %: ",
        r'\b\d+(?:\.\d+)?\s*mg/ml\b': "The volume is specified in mL, not ml: ",
        r'\b\d+(?:\.\d+)?\s+ml\b': "The volume is specified in mL, not ml: ",
        r'\b(?:one|two|three)(?!-)\s+neck(?:ed)?\b|\b(?:round|flat)(?!-)\s+bottom\b|\bpear(?!-)\s+shaped\b': (
            "Hyphenate 'one-necked' and 'round-bottom' (e.g. one-necked round-bottom flask): "
        ),
        # m?L (not [mL]L, which would match "LL") so both "mL" and "L" are caught.
        r'\b\d+(?:\.\d+)?[-]\s*m?L\s+round\b': "No hyphen around L and mL",
        r'\banti-bacterial\b': "Use 'antibacterial' without hyphen: ",
        r'\bco-operation\b': "Use 'cooperation' without hyphen: ",
        r'\bmicro-organism\b': "Use 'microorganism' without hyphen: ",
        r'\bmulti-colored\b': "Use 'multicolored' without hyphen: ",
        r'\bnon-polar\b': "Use 'nonpolar' without hyphen: ",
        r'\bphoto-redox\b': "Use 'photoredox' without hyphen: ",
        r'\bpre-cooled\b': "Use 'precooled' without hyphen: ",
        r'\bsuper-acid\b': "Use 'superacid' without hyphen: ",
        r'\bmembered-ring\b': "Use 'membered ring' without hyphen: ",
        r'\bMembered-Ring\b': "Use 'Membered Ring' without hyphen: ",
        r'\bBronsted acid\b': "Use ø in Brønsted: ",
        r'X-Ray': "Always use lowercase r in X-ray (even when capitalized): ",
        r'x ray': "Use X and hyphen in X-ray: ",
        r'X ray': "Use hyphen in X-ray: ",
        r'\(-\)-': "Use (–)- instead of ",
        r'\b\d+(?:\.\d+)?mL\b': "Missing space between value and mL: ",
        r'\b\d+(?:\.\d+)?µm\b': "Missing space between value and µm: ",
        r'\b\d+(?:\.\d+)?mm\b': "Missing space between value and mm: ",
        r'\b\d+(?:\.\d+)?cm\b': "Missing space between value and cm: ",
        r'\b\d+(?:\.\d+)?mg\b': "Missing space between value and mg: ",
        r'\b\d+(?:\.\d+)?min\b': "Missing space between value and min: ",
        r'(?<!\[)\b\d+(?:\.\d+)?M\b': "Missing space if M means molar (concentration): ",
        r'\b\d+(?:\.\d+)?mM\b': "Missing space between value and mM: ",
        r'\b\d+(?:\.\d+)?μM\b': "Missing space between value and μM: ",
        # The lookahead skips the isotope labels 14N and 15N.
        r'\b(?!1[45]N)(\d+(?:\.\d+)?)N\b': "Missing space if N means normal (concentration): ",
        r'\b\d+(?:\.\d+)?K\b': "Missing space if K means Kelvin: ",
        r'\b\d+,\d+(?=\s?(?:g|mg|mol|mmol|M|h|min|°C|mL)\b)': "Incorrect use of a comma instead of a decimal point",
        r',\s*\d+\.\s+\d+(?=\s?(?:g|mg|mol|mmol|h|min|°C|mL)\b)': "Unintended space? ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* °C\b': "Use en dash (–) for temperature ranges: ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* g\b': "Use en dash (–) for mass ranges: ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* mg\b': "Use en dash (–) for mass ranges: ",
        r'from\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b':
            "Do not use en dash in a 'from X—Y' construction. Use 'from X to Y' instead: ",
        r'between\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b':
            "Do not use en dash in a 'between X—Y' construction. Use 'between X and Y' instead: ",
        r'\b\d+\s+fold\b': "Hyphenate numeral and 'fold': ",
        r'\b\d+(?:\.\d+)?°C\b': "Missing space between value and °C: ",
        r'\b\d+(?:\.\d+)?° K\b': "Use K without °, e.g. 298 K: ",
        r'\b\d+(?:\.\d+)?±\d+(?:\.\d+)?\b': "Missing spaces around the ± symbol: ",
        r'\b\d+(?:\.\d+)?\s*uL\b': "Use μL instead of uL for microliters: ",
        r'\b\d+(?:\.\d+)?\s*ug\b': "Use μg instead of ug for micrograms: ",
        r'\b\d+(?:\.\d+)?\s*umol\b': "Use μmol instead of umol for micromol: ",
        r'\b\d+(?:\.\d+)?\s*uM\b': "Use μM instead of uM for micromolar: ",
        r'\b\d+(?:\.\d+)?ppm\b': "Missing space between value and ppm: ",
        r'\b\d+(?:\.\d+)?bar\b': "Missing space between value and bar: ",
        r'\b\d+(?:\.\d+)?mbar\b': "Missing space between value and mbar: ",
        r'\b\d+(?:\.\d+)?\s*mol/l\b': "Use mol/L instead of mol/l: ",
        r'\b\d+(?:\.\d+)?\s*g/l\b': "Use g/L instead of g/l: ",
        r'\b\d+(?:\.\d+)?\s*mol·l–1\b': "Use mol·L⁻¹ instead of mol·l⁻¹: ",
        r'\b\d+(?:\.\d+)?\s*g·l–1\b': "Use g·L⁻¹ instead of g·l⁻¹: ",
        r'\b\d+(?:\.\d+)?\s*mhz\b': "Use MHz (capital H): ",
        r'\b\d+(?:\.\d+)?\s*gr\b': "Use g instead of gr: ",
        r'\b\d+(?:\.\d+)?\s*hrs?\b': "Use h instead of hr/hrs: ",
        # Common misspellings. Python regexes take no /.../ delimiters; the
        # patterns below match the misspelled stem anywhere in a word.
        r'[Ee]natio': "Misspelling of enantio...: ",
        r'[Aa]symetr': "Misspelling of asymmetr...: ",
        r'[Pp]thal': "Misspelling of phthal...: ",
        r'[Nn]apth': "Misspelling of naphth...: ",
        r'[Ss]terosel': "Misspelling of stereosel...: ",
        r'\s+(-?\d+(\.\d+)?)\s+eq\.(?!\s*\d)': "Use 'equiv' for equivalents and 'eq.' for equation: ",
        r'\s+(-?\d+(\.\d+)?)\s+eq\)(?!\s*\d)': "Use 'equiv' for equivalents: ",
        r'[Cc]alc[\'´](?:d|ed)': "Use Calcd or calcd instead of ",
        r'treated with': "Check if 'reacted/washed/extracted with' etc. is more appropriate than ",

        # Joining names
        r'Diels-Alder': "Use en dash (–) for Diels–Alder: ",
        r'Bednorz-Müller': "Use en dash (–) for Bednorz–Müller: ",
        r'Beer-Lambert': "Use en dash (–) for Beer–Lambert: ",
        r'Bose-Einstein': "Use en dash (–) for Bose–Einstein: ",
        r'Debye-Hückel': "Use en dash (–) for Debye–Hückel: ",
        r'Fermi-Dirac': "Use en dash (–) for Fermi–Dirac: ",
        r'Fischer-Tropsch': "Use en dash (–) for Fischer–Tropsch: ",
        r'Fisher-Johns': "Use en dash (–) for Fisher–Johns: ",
        r'Flory-Huggins': "Use en dash (–) for Flory–Huggins: ",
        r'Franck-Condon': "Use en dash (–) for Franck–Condon: ",
        r'Friedel-Crafts': "Use en dash (–) for Friedel–Crafts: ",
        r'Geiger-Müller': "Use en dash (–) for Geiger–Müller: ",
        r'Henderson-Hasselbalch': "Use en dash (–) for Henderson–Hasselbalch: ",
        r'Jahn-Teller': "Use en dash (–) for Jahn–Teller: ",
        r'Lee-Yang-Parr': "Use en dash (–) for Lee–Yang–Parr: ",
        r'Lineweaver-Burk': "Use en dash (–) for Lineweaver–Burk: ",
        r'Mark-Houwink': "Use en dash (–) for Mark–Houwink: ",
        r'Meerwein-Ponndorf': "Use en dash (–) for Meerwein–Ponndorf: ",
        r'Michaelis-Menten': "Use en dash (–) for Michaelis–Menten: ",
        r'Stern-Volmer': "Use en dash (–) for Stern–Volmer: ",
        r"van't Hoff-Le Bel": "Use en dash (–) for van't Hoff–Le Bel: ",
        r'Wolff-Kishner': "Use en dash (–) for Wolff–Kishner: ",
        r'Young-Laplace': "Use en dash (–) for Young–Laplace: ",
        r'Ziegler-Natta': "Use en dash (–) for Ziegler–Natta: ",
        r'Baeyer-Villiger': "Use en dash (–) for Baeyer–Villiger: ",
        r'Schotten-Baumann': "Use en dash (–) for Schotten–Baumann: ",
        r'Buchwald-Hartwig': "Use en dash (–) for Buchwald–Hartwig: ",
        r'Kumada-Corriu': "Use en dash (–) for Kumada–Corriu: ",
        r'Nozaki-Hiyama': "Use en dash (–) for Nozaki–Hiyama: ",
        r'Suzuki-Miyaura': "Use en dash (–) for Suzuki–Miyaura: ",
        r'Mizoroki-Heck': "Use en dash (–) for Mizoroki–Heck: ",
        r'Wittig-Horner': "Use en dash (–) for Wittig–Horner: ",
        r'Claisen-Schmidt': "Use en dash (–) for Claisen–Schmidt: ",
        r'Stille-Kelly': "Use en dash (–) for Stille–Kelly: ",
        r'Reformatsky-Claisen': "Use en dash (–) for Reformatsky–Claisen: ",
        r'Sonogashira-Hagihara': "Use en dash (–) for Sonogashira–Hagihara: ",
        r'Grubbs-Hoveyda': "Use en dash (–) for Grubbs–Hoveyda: ",
        r'Hoveyda-Grubbs': "Use en dash (–) for Hoveyda–Grubbs: ",
        r'Petasis-Ferrier': "Use en dash (–) for Petasis–Ferrier: ",
        r'Mukaiyama-Michael': "Use en dash (–) for Mukaiyama–Michael: ",
        r'Tsuji-Trost': "Use en dash (–) for Tsuji–Trost: ",
        r'Horner-Wadsworth-Emmons': "Use en dash (–) for Horner–Wadsworth–Emmons: ",
        r'Jorgensen-Hayashi': "Use en dash (–) and ø in Jørgensen–Hayashi: ",
        r'Jørgensen-Hayashi': "Use en dash (–) in Jørgensen–Hayashi: ",
        r'Ullmann-Goldberg': "Use en dash (–) for Ullmann–Goldberg: ",
        r'Chan-Lam': "Use en dash (–) for Chan–Lam: ",
        r'Hiyama-Denmark': "Use en dash (–) for Hiyama–Denmark: ",
        r'Negishi-Brown': "Use en dash (–) for Negishi–Brown: ",
        r'Corey-Fuchs': "Use en dash (–) for Corey–Fuchs: ",
        r'Wacker-Tsuji': "Use en dash (–) for Wacker–Tsuji: ",
        r'Stork-Danheiser': "Use en dash (–) for Stork–Danheiser: ",
        r'Balz-Schiemann': "Use en dash (–) for Balz–Schiemann: ",
        r'Barton-McCombie': "Use en dash (–) for Barton–McCombie: ",
        r'Knoevenagel-Doebner': "Use en dash (–) for Knoevenagel–Doebner: ",
        r'Gattermann-Koch': "Use en dash (–) for Gattermann–Koch: ",
        r'Mukaiyama-Mannich': "Use en dash (–) for Mukaiyama–Mannich: ",
        r'Evans-Tishchenko': "Use en dash (–) for Evans–Tishchenko: ",

        r'\b\d+(?:\.\d+)?\s*degree\b': "Use ° instead of 'degree': ",
        r'\b\d+(?:\.\d+)?\s*percent\b': "Use % instead of 'percent': ",
        r'\bvacuumed\b': "Use 'evacuated' or 'under vacuum' instead of 'vacuumed': ",
        r'\b[Hh]eated\s+up\b': "Omit 'up', use 'heated': ",
        r'\b[Cc]ooled\s+down\b': "Omit 'down', use 'cooled': ",

        r'(\d)\s*x\s*(\d)': "Use × operator instead of letter x for multiplication ",
        r'->': "Use → instead of ->",
        r' MIN ': "Use min for minutes",
        r'vaccuum': "Misspelling of vacuum ",
        r'reduced vacuum': "Use 'reduced pressure' ",
        r'(\d)×(\d)': "Leave space before and after × operator ",
        r'mol×L': "Use mol·L ",
        r'g×mol': "Use g·mol ",
        r'J×K': "Use J·K ",
        r'J×mol': "Use J·mol ",
        # Keys must be unique: a repeated key would silently replace the
        # g·L message with the mg·mL one.
        r'g×L': "Use g·L ",
        r'mg×mL': "Use mg·mL ",
        r'mol\.L-1': "Use mol·L-1 ",
        r'mol\.mL': "Use mol·mL ",
        r'g\.mol': "Use g·mol ",
        r'J\.K': "Use J·K ",
        r'J\.mol': "Use J·mol ",
        r'g\.L-1': "Use g·L-1 ",
        r'mg\.mL-1': "Use mg·mL-1 ",
    }

    results = []
    text_length = len(text)
    for pattern_str, message in patterns.items():
        for match in re.finditer(pattern_str, text):
            # Show the match with up to 5 chars before and 10 chars after it.
            start = max(0, match.start() - 5)
            end = min(text_length, match.end() + 10)
            results.append(f"{message}...{text[start:end]}...")

    return results
377
+
378
+
379
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using fitz (PyMuPDF).

    Args:
        pdf_file: The uploaded PDF file object; its getvalue() method must
            return the raw PDF bytes.

    Returns:
        str: The text of all pages, each page followed by a single space,
        or an empty string if the PDF could not be read (the error is
        logged and shown in the UI).
    """
    try:
        # Create a byte stream from the uploaded file
        pdf_bytes = io.BytesIO(pdf_file.getvalue())

        # The context manager closes the document even if a page fails to
        # extract, so the underlying resources are never leaked.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() + " " for page in doc)
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app, but
        # keep the traceback in the log for debugging.
        logging.exception("Error extracting text from PDF")
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
409
+
410
+
411
def main():
    """
    Build the Streamlit page: a PDF-upload tab and a text-input tab, each with
    its own button that runs the analysis on the provided content.
    """
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="🧪", layout="wide")

    st.title("Chemistry Text Analyzer")
    st.write(
        "This app analyzes chemistry text for common errors, inconsistencies, and formatting issues.\n"
        "Upload a PDF file or paste your text in the box below to analyze it."
    )

    # Create tabs for different input methods
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")

        if analyze_pdf:
            if uploaded_file is None:
                # Mirror the text tab: tell the user why nothing happened.
                st.warning("Please upload a PDF file to analyze.")
            else:
                with st.spinner("Extracting text from PDF..."):
                    text_content = extract_text_from_pdf(uploaded_file)

                if text_content:
                    st.success(f"Successfully extracted text from {uploaded_file.name}")
                    st.write("---")
                    analyze_content(text_content)
                else:
                    st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        # Text input area
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")

        if analyze_text:
            # Whitespace-only input carries nothing to analyze.
            if not text_input.strip():
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Replace newlines with spaces to match the original behavior
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)
451
+
452
+
453
def analyze_content(text_content):
    """
    Run all checks on the text and display the results in the Streamlit page.

    Three expanders are rendered: format/style issues, US/UK spelling
    consistency, and citation analysis. The elapsed time is shown at the end.

    Args:
        text_content: The text to analyze
    """
    start_time = time.time()

    with st.spinner("Analyzing text..."):
        # Use expanders for each analysis type to keep the UI clean
        with st.expander("Text Format and Style Issues", expanded=True):
            text_issues = check_text(text_content)
            if text_issues:
                for issue in text_issues:
                    st.write(issue)
            else:
                st.write("No text format or style issues found.")

        # Language Issues - Separate expander, not nested
        with st.expander("Language Issues", expanded=True):
            language_issues = check_US_UK_consistency(text_content)
            if language_issues:
                for issue in language_issues:
                    st.markdown(issue)
            else:
                st.write("No US/UK spelling inconsistencies found.")

        with st.expander("Citation Analysis", expanded=True):
            # Bring citations into "Journal Year, Volume, Pages" order first.
            transformed_text = transform_citations(text_content)
            if transformed_text != text_content:
                st.write("Citations were transformed to the proper format.")

            # Validate the transformed text: the validator expects
            # "Journal Year, Volume", which is what the transformation yields.
            citation_issues = validate_citation(transformed_text)
            if citation_issues:
                st.write("Citation issues found:")
                for issue in citation_issues:
                    st.write(issue)
            else:
                st.write("No citation issues found.")

    elapsed_time = time.time() - start_time
    st.write(f"Analysis completed in {elapsed_time:.2f} seconds.")
506
+
507
+
508
# Start the Streamlit UI only when this file is executed as a script.
if __name__ == "__main__":
    main()