File size: 24,178 Bytes
19e2b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ba8983
19e2b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0e3bb2
19e2b80
 
f0e3bb2
 
19e2b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92e301f
 
 
 
 
 
 
 
 
 
 
19e2b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97272d7
 
 
 
 
 
 
 
 
 
 
 
19e2b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
import streamlit as st
import re
import time
import logging
import fitz  # PyMuPDF
import io

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def check_US_UK_consistency(text):
    """
    Search the input text for inconsistent use of US and UK English spellings.

    A spelling pair is reported only when valid occurrences of BOTH the US
    and the UK variant are present.  An occurrence is discarded when a year
    (1900-2100) appears within 200 characters after it, since such matches
    are typically part of a literature citation rather than the author's
    own prose.

    Args:
        text (str): The string to search through.

    Returns:
        list: List of strings describing the inconsistencies found, or
              empty list if none.
    """
    issues = []

    spelling_pairs = [
        ('analyze(?:d|ing)?', 'analyse(?:d|ing)?'),
        ('(?:un)?catalyze(?:d|s|ing)?', '(?:un)?catalyse(?:d|s|ing)?'),
        ('sulfur', 'sulphur'),
        ('aluminum', 'aluminium'),
        ('color(?:ed|ing|s|less)?', 'colour(?:ed|ing|s|less)?'),
        ('flavor(?:ed|ing|s)?', 'flavour(?:ed|ing|s)?'),
        ('liter', 'litre'),
        ('fiber', 'fibre'),
        ('meter', 'metre'),
        ('neighbor(?:ed|ing|s)?', 'neighbour(?:ed|ing|s)?'),
        ('(?:re)?organiz(?:e|ed|ing|es|ation)', '(?:re)?organis(?:e|ed|ing|es|ation)'),
        ('vapor', 'vapour'),
        ('behavior', 'behaviour'),
        ('realiz(?:e|ed|ing|es|ation)', 'realis(?:e|ed|ing|es|ation)'),
        ('synthetize(?:d|s)?', 'synthetise(?:d|s)?'),
        ('characteriz(?:e|ed|ing|es|ation)', 'characteris(?:e|ed|ing|es|ation)'),
        ('(?:re)?crystalliz(?:e|ed|ing|es|ation)', '(?:re)?crystallis(?:e|ed|ing|es|ation)'),
        ('polymeriz(?:e|ed|ing|es|ation)', 'polymeris(?:e|ed|ing|es|ation)'),
        ('oxidized', 'oxidised'),
        ('neutraliz(?:e|ed|ing|es|ation)', 'neutralis(?:e|ed|ing|es|ation)'),
        ('hydrolyzed', 'hydrolysed'),
        ('standardiz(?:e|ed|ing|es|ation)', 'standardis(?:e|ed|ing|es|ation)'),
        ('ioniz(?:e|ed|ing|es|ation)', 'ionis(?:e|ed|ing|es|ation)'),
        ('solubiliz(?:e|ed|ing|es|ation)', 'solubilis(?:e|ed|ing|es|ation)'),
        ('functionalized', 'functionalised'),
        ('electrolyzed', 'electrolysed'),
        ('homogeniz(?:e|ed|ing|es|ation)', 'homogenis(?:e|ed|ing|es|ation)'),
        ('lyophiliz(?:e|ed|ing|es|ation)', 'lyophilis(?:e|ed|ing|es|ation)'),
        ('polariz(?:e|ed|ing|es|ation)', 'polaris(?:e|ed|ing|es|ation)'),
        ('isomeriz(?:e|ed|ing|es|ation)', 'isomeris(?:e|ed|ing|es|ation)'),
        ('immobiliz(?:e|ed|ing|es|ation)', 'immobilis(?:e|ed|ing|es|ation)'),
        ('stabiliz(?:e|ed|ing|es|ation)', 'stabilis(?:e|ed|ing|es|ation)'),
        ('optimiz(?:e|ed|ing|es|ation)', 'optimis(?:e|ed|ing|es|ation)'),
        ('odor', 'odour'),
        ('galvaniz(?:e|ed|ing|es|ation)', 'galvanis(?:e|ed|ing|es|ation)'),
        ('(?:re)?model(?:ing|ed|s)?', '(?:re)?modell(?:ing|ed|s)?'),
        ('(?:re)?label(?:ing|ed|s)?', '(?:re)?labell(?:ing|ed|s)?'),
        ('gray', 'grey'),
    ]

    # Years 1900-2099 plus 2100; a spelling match followed by one of these
    # is assumed to belong to a reference/citation and is ignored.
    year_pattern = re.compile(r'\b(19\d{2}|20\d{2}|2100)\b')

    def _valid_matches(word_pattern):
        # Case-insensitive whole-word matches, excluding any followed
        # within 200 characters by a year (likely a citation).
        return [
            m for m in re.finditer(r'\b' + word_pattern + r'\b', text, re.I)
            if not year_pattern.search(text[m.end():m.end() + 200])
        ]

    def _examples(matches):
        # Render up to three matches, each with ~20 chars of context.
        lines = []
        for match in matches[:3]:
            start, end = match.span()
            context = text[max(0, start - 20):end + 20]
            lines.append(f"  • ...{context}...\n")
        return ''.join(lines)

    for us, uk in spelling_pairs:
        valid_us_matches = _valid_matches(us)
        valid_uk_matches = _valid_matches(uk)

        # Only an inconsistency when BOTH variants appear.
        if valid_us_matches and valid_uk_matches:
            issue = (
                "Inconsistent UK/US spelling detected:\n\n"
                "US spelling examples:\n" + _examples(valid_us_matches) +
                "\nUK spelling examples:\n" + _examples(valid_uk_matches) +
                "\n→ Reminder: Maintain consistent spelling throughout the manuscript!"
            )
            issues.append(issue)

    return issues



def transform_citations(text, journal_patterns=None):
    """
    Transform all citations in a text from format "Journal Vol(Issue), Pages (Year)"
    to "Journal Year, Vol(Issue), Pages".

    Args:
        text (str): The text whose citations should be transformed.
        journal_patterns (list[str] | None): Regex patterns for journal
            names.  Defaults to a built-in list of common journals.

    Returns:
        str: The text with every matching citation rewritten.
    """
    if journal_patterns is None:
        # Default patterns for common journals.
        # Fixed: the previous patterns contained stray '\.' and unescaped
        # wildcard dots (e.g. 'ACS\. Catal.\.') that could not match the
        # standard abbreviations used by validate_citation.
        journal_patterns = [
            r'J\. Am\. Chem\. Soc\.',
            r'Chem\. Eur\. J\.',
            r'Angew\. Chem\. Int\. Ed\.',
            r'ACS Catal\.',
            r'Org\. Lett\.',
            r'Tetrahedron Lett\.',
            # Add more journal patterns as needed
        ]

    # Create pattern for full citation
    journal_group = f"({'|'.join(journal_patterns)})"
    # Volume with optional issue in parentheses, e.g. "10(5)"
    volume_group = r'(\d+(?:\(\d+\))?)'
    pages_group = r'(\d+(?:[-–]\d+)?)'
    year_group = r'\((\d{4})\)'

    # The optional trailing period is captured (and therefore consumed) so
    # it is carried over exactly once.  Previously it was peeked at but not
    # consumed, which duplicated the period in the output ("..").
    pattern = f"{journal_group}\\s+{volume_group},\\s*{pages_group}\\s*{year_group}(\\.?)"

    def replace_citation(match):
        journal, volume, pages, year, end_period = match.groups()
        return f"{journal} {year}, {volume}, {pages}{end_period}"

    # Replace all matching citations in the text
    return re.sub(pattern, replace_citation, text)


def validate_citation(text):
    """
    Validate citations of the form "<Journal> <Year>, <Volume>".

    For each supported journal the difference Year - Volume must equal a
    fixed per-journal offset (roughly the year before volume 1 appeared).
    Citations failing the check are reported.

    Args:
        text (str): The text containing citations to validate.

    Returns:
        list: One message per problematic citation; empty if all pass.
    """
    # Journal -> expected (year - volume) offset.
    journal_offsets = {
        "J. Am. Chem. Soc.": 1878,
        "Org. Lett.": 1998,
        "Chem. Eur. J.": 1994,
        "ACS Catal.": 2010,
        "Angew. Chem. Int. Ed.": 1961,
        "Tetrahedron Lett.": 1959,
    }

    # Longest names first so a name that is a prefix of another cannot
    # shadow the longer match in the alternation.
    escaped_names = [re.escape(name)
                     for name in sorted(journal_offsets, key=len, reverse=True)]
    citation_re = re.compile(
        '(' + '|'.join(escaped_names) + r')\s+(\d+),\s*(\d+)')

    results = []
    for journal, year_str, volume_str in citation_re.findall(text):
        year = int(year_str)
        volume = int(volume_str)
        citation = f"{journal} {year}, {volume}"

        offset = journal_offsets.get(journal)
        if offset is None:
            # Defensive: cannot normally happen, the regex is built from
            # the dict keys.
            results.append(f"{citation}: Journal not supported.")
        elif year - volume != offset:
            results.append(f"{citation}: wrong year or volume (expected offset {offset})")

    return results


def check_text(text):
    """
    Search the input text for common chemistry-manuscript formatting and
    style problems (unit spacing, hyphenation, en dashes in name pairs,
    common misspellings, ...) and show context around each match.

    Args:
        text (str): The text to check.

    Returns:
        list: One warning string per match, each containing the advisory
              message plus a few characters of surrounding context.
    """
    patterns = {
        r'\b(\S+\s+\d+(?:\.\d+)?\s+oC\b)': "Use the ° symbol in °C, not a superscripted o: ",
        r'\b\d+(?:\.\d+)?\s+%\s+yield\b': "No space between the numeric value and %: ",
        r'\b\d+(?:\.\d+)?\s*mg/ml\b': "The volume is specified in mL, not ml: ",
        r'\b\d+(?:\.\d+)?\s+ml\b': "The volume is specified in mL, not ml: ",
        r'\b(?:one|two|three)(?!-)\s+neck(?:ed)?\b|\b(?:round|flat)(?!-)\s+bottom\b|\bpear(?!-)\s+shaped\b': (
            "Hyphenate 'one-necked' and 'round-bottom, e.g. one-necked round-bottom flask): "
        ),
        r'\b\d+(?:\.\d+)?[-]\s*[mL]L\s+round\b': "No hyphen around L and mL",
        r'\banti-bacterial\b': "Use 'antibacterial' without hyphen: ",
        r'\bco-operation\b': "Use 'cooperation' without hyphen: ",
        r'\bmicro-organism\b': "Use 'microorganism' without hyphen: ",
        r'\bmulti-colored\b': "Use 'multicolored' without hyphen: ",
        r'\bnon-polar\b': "Use 'nonpolar' without hyphen: ",
        r'\bphoto-redox\b': "Use 'photoredox' without hyphen: ",
        r'\bpre-cooled\b': "Use 'precooled' without hyphen: ",
        r'\bsuper-acid\b': "Use 'superacid' without hyphen: ",
        r'\bmembered-ring\b': "Use 'membered ring' without hyphen: ",
        r'\bMembered-Ring\b': "Use 'Membered Ring' without hyphen: ",
        r'\bBronsted acid\b': "Use ø in Brønsted: ",
        r'X-Ray': "Always use lowercase r in X-ray (even when capitalized): ",
        r'x ray': "Use X and hyphen in X-ray: ",
        r'X ray': "Use hyphen in X-ray: ",
        r'\(-\)-': "Use (–)- instead of ",
        r'\b\d+(?:\.\d+)?mL\b': "Missing space between value and mL: ",
        r'\b\d+(?:\.\d+)?µm\b': "Missing space between value and µm: ",
        r'\b\d+(?:\.\d+)?mm\b': "Missing space between value and mm: ",
        r'\b\d+(?:\.\d+)?cm\b': "Missing space between value and cm: ",
        r'\b\d+(?:\.\d+)?mg\b': "Missing space between value and mg: ",
        r'\b\d+(?:\.\d+)?min\b': "Missing space between value and min: ",
        r'(?<!\[)\b\d+(?:\.\d+)?M\b': "Missing space if M means molar (concentration): ",
        r'\b\d+(?:\.\d+)?mM\b': "Missing space between value and mM: ",
        r'\b\d+(?:\.\d+)?μM\b': "Missing space between value and μM: ",
        r'\b(?!1[45]N)(\d+(?:\.\d+)?)N\b': "Missing space if N means normal (concentration): ",
        r'\b\d+(?:\.\d+)?K\b': "Missing space if K means Kelvin: ",
        r'\b\d+,\d+(?=\s?(?:g|mg|mol|mmol|M|h|min|°C|mL)\b)': "Incorrect use of a comma instead of a decimal point",
        r',\s*\d+\.\s+\d+(?=\s?(?:g|mg|mol|mmol|h|min|°C|mL)\b)': "Unintended space? ",
        r'(?<![a-zA-Z0-9])([‒−–-]\d+(?:\.\d+)?)\s*[‒−–-]{1,2}\s*([‒−–-]\d+(?:\.\d+)?)(?![a-zA-Z0-9])': "Use '–a.b to –c.d' for negative numeric ranges: ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* °C\b': "Use en dash (–) for temperature ranges: ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* g\b': "Use en dash (–) for mass ranges: ",
        r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* mg\b': "Use en dash (–) for mass ranges: ",
        r'from\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b':
            "Do not use en dash in a 'from  X—Y' construction. Use 'from X to Y' instead: ",
        r'between\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b':
            "Do not use en dash in a 'between X—Y' construction. Use 'between X and Y' instead: ",
        r'\b\d+\s+fold\b': "Hyphenate numeral and 'fold': ",
        r'\b\d+(?:\.\d+)?°C\b': "Missing space between value and °C: ",
        r'\b\d+(?:\.\d+)?° K\b': "Use K without °, e.g. 298 K: ",
        r'\b\d+(?:\.\d+)?±\d+(?:\.\d+)?\b': "Missing spaces around the ± symbol: ",
        r'\b\d+(?:\.\d+)?\s*uL\b': "Use μL instead of uL for microliters: ",
        r'\b\d+(?:\.\d+)?\s*ug\b': "Use μg instead of ug for micrograms: ",
        r'\b\d+(?:\.\d+)?\s*umol\b': "Use μmol instead of umol for micromol: ",
        r'\b\d+(?:\.\d+)?\s*uM\b': "Use μM instead of uM for micromolar: ",
        r'\b\d+(?:\.\d+)?ppm\b': "Missing space between value and ppm: ",
        r'\b\d+(?:\.\d+)?bar\b': "Missing space between value and bar: ",
        r'\b\d+(?:\.\d+)?mbar\b': "Missing space between value and mbar: ",
        r'\b\d+(?:\.\d+)?\s*mol/l\b': "Use mol/L instead of mol/l: ",
        r'\b\d+(?:\.\d+)?\s*g/l\b': "Use g/L instead of g/l: ",
        r'\b\d+(?:\.\d+)?\s*mol·l–1\b': "Use mol·L⁻¹ instead of mol·l⁻¹: ",
        r'\b\d+(?:\.\d+)?\s*g·l–1\b': "Use g·L⁻¹ instead of g·l⁻¹: ",
        r'\b\d+(?:\.\d+)?\s*mhz\b': "Use MHz (capital H): ",
        r'\b\d+(?:\.\d+)?\s*gr\b': "Use g instead of gr: ",
        r'\b\d+(?:\.\d+)?\s*hrs?\b': "Use h instead of hr/hrs: ",

        # Misspellings.  (Fixed: these patterns were wrapped in literal '/'
        # characters — a Perl-style delimiter that Python's re treats as
        # part of the pattern — so they could never match ordinary text.)
        r'[Ee]natio': "Misspelling of enantio...: ",
        r'[Aa]symetr': "Misspelling of asymmetr...: ",
        r'[Pp]thal': "Misspelling of phthal...: ",
        # Assumed the intent is the common missing-h misspelling "napth...";
        # the original '/[Nn]aphth.../' would have flagged correct spellings
        # as well — TODO confirm with the author.
        r'[Nn]apth': "Misspelling of naphth...: ",
        r'[Ss]terosel': "Misspelling of stereosel...: ",

        r'\s+(-?\d+(\.\d+)?)\s+eq\.(?!\s*\d)': "Use 'equiv' for equivalents and 'eq.' for equation: ",
        r'\s+(-?\d+(\.\d+)?)\s+eq\)(?!\s*\d)': "Use 'equiv' for equivalents: ",
        r'[Cc]alc[\'´](?:d|ed)': "Use Calcd or calcd instead of ",
        r'treated with': "Check if 'reacted/washed/extracted with' etc. is more appropriate than ",

        # Joining names: personal-name pairs take an en dash, not a hyphen.
        r'Diels-Alder': "Use en dash (–) for Diels–Alder: ",
        r'Bednorz-Müller': "Use en dash (–) for Bednorz–Müller: ",
        r'Beer-Lambert': "Use en dash (–) for Beer–Lambert: ",
        r'Bose-Einstein': "Use en dash (–) for Bose–Einstein: ",
        r'Debye-Hückel': "Use en dash (–) for Debye–Hückel: ",
        r'Fermi-Dirac': "Use en dash (–) for Fermi–Dirac: ",
        r'Fischer-Tropsch': "Use en dash (–) for Fischer–Tropsch: ",
        r'Fisher-Johns': "Use en dash (–) for Fisher–Johns: ",
        r'Flory-Huggins': "Use en dash (–) for Flory–Huggins: ",
        r'Franck-Condon': "Use en dash (–) for Franck–Condon: ",
        r'Friedel-Crafts': "Use en dash (–) for Friedel–Crafts: ",
        r'Geiger-Müller': "Use en dash (–) for Geiger–Müller: ",
        r'Henderson-Hasselbalch': "Use en dash (–) for Henderson–Hasselbalch: ",
        r'Jahn-Teller': "Use en dash (–) for Jahn–Teller: ",
        r'Lee-Yang-Parr': "Use en dash (–) for Lee–Yang–Parr: ",
        r'Lineweaver-Burk': "Use en dash (–) for Lineweaver–Burk: ",
        r'Mark-Houwink': "Use en dash (–) for Mark–Houwink: ",
        r'Meerwein-Ponndorf': "Use en dash (–) for Meerwein–Ponndorf: ",
        r'Michaelis-Menten': "Use en dash (–) for Michaelis–Menten: ",
        r'Stern-Volmer': "Use en dash (–) for Stern–Volmer: ",
        r"van't Hoff-Le Bel": "Use en dash (–) for van't Hoff–Le Bel: ",
        r'Wolff-Kishner': "Use en dash (–) for Wolff–Kishner: ",
        r'Young-Laplace': "Use en dash (–) for Young–Laplace: ",
        r'Ziegler-Natta': "Use en dash (–) for Ziegler–Natta: ",
        r'Baeyer-Villiger': "Use en dash (–) for Baeyer–Villiger: ",
        r'Schotten-Baumann': "Use en dash (–) for Schotten–Baumann: ",
        r'Buchwald-Hartwig': "Use en dash (–) for Buchwald–Hartwig: ",
        r'Kumada-Corriu': "Use en dash (–) for Kumada–Corriu: ",
        r'Nozaki-Hiyama': "Use en dash (–) for Nozaki–Hiyama: ",
        r'Suzuki-Miyaura': "Use en dash (–) for Suzuki–Miyaura: ",
        r'Mizoroki-Heck': "Use en dash (–) for Mizoroki–Heck: ",
        r'Wittig-Horner': "Use en dash (–) for Wittig–Horner: ",
        r'Claisen-Schmidt': "Use en dash (–) for Claisen–Schmidt: ",
        r'Stille-Kelly': "Use en dash (–) for Stille–Kelly: ",
        r'Reformatsky-Claisen': "Use en dash (–) for Reformatsky–Claisen: ",
        r'Sonogashira-Hagihara': "Use en dash (–) for Sonogashira–Hagihara: ",
        r'Grubbs-Hoveyda': "Use en dash (–) for Grubbs–Hoveyda: ",
        r'Hoveyda-Grubbs': "Use en dash (–) for Hoveyda–Grubbs: ",
        r'Petasis-Ferrier': "Use en dash (–) for Petasis–Ferrier: ",
        r'Mukaiyama-Michael': "Use en dash (–) for Mukaiyama–Michael: ",
        r'Tsuji-Trost': "Use en dash (–) for Tsuji–Trost: ",
        r'Horner-Wadsworth-Emmons': "Use en dash (–) for Horner–Wadsworth–Emmons: ",
        r'Jorgensen-Hayashi': "Use en dash (–) and ø in Jørgensen–Hayashi: ",
        r'Jørgensen-Hayashi': "Use en dash (–) in Jørgensen–Hayashi: ",
        r'Ullmann-Goldberg': "Use en dash (–) for Ullmann–Goldberg: ",
        r'Chan-Lam': "Use en dash (–) for Chan–Lam: ",
        r'Hiyama-Denmark': "Use en dash (–) for Hiyama–Denmark: ",
        r'Negishi-Brown': "Use en dash (–) for Negishi–Brown: ",
        r'Corey-Fuchs': "Use en dash (–) for Corey–Fuchs: ",
        r'Wacker-Tsuji': "Use en dash (–) for Wacker–Tsuji: ",
        r'Stork-Danheiser': "Use en dash (–) for Stork–Danheiser: ",
        r'Balz-Schiemann': "Use en dash (–) for Balz–Schiemann: ",
        r'Barton-McCombie': "Use en dash (–) for Barton–McCombie: ",
        r'Knoevenagel-Doebner': "Use en dash (–) for Knoevenagel–Doebner: ",
        r'Gattermann-Koch': "Use en dash (–) for Gattermann–Koch: ",
        r'Mukaiyama-Mannich': "Use en dash (–) for Mukaiyama–Mannich: ",
        r'Evans-Tishchenko': "Use en dash (–) for Evans–Tishchenko: ",

        r'\b\d+(?:\.\d+)?\s*degree\b': "Use ° instead of 'degree': ",
        r'\b\d+(?:\.\d+)?\s*percent\b': "Use % instead of 'percent': ",
        r'\bvacuumed\b': "Use 'evacuated' or 'under vacuum' instead of 'vacuumed': ",

        r'\b[Hh]eated\s+up\b': "Omit 'up', use 'heated': ",
        r'\b[Cc]ooled\s+down\b': "Omit 'down', use 'cooled': ",
        r'\b[Ww]armed\s+up\b': "Omit 'up', use 'warmed': ",
        r'\b[Aa]bsorbed\s+on\b': "Check whether 'adsorbed on' is more appropriate: ",

        r'(\d)\s*x\s*(\d)': "Use × operator instead of letter x for multiplication ",
        r'->': "Use → instead of ->",
        r' MIN ': "Use min for minutes",
        r'vaccuum': "Misspelling of vacuum ",
        r'reduced vacuum': "Use 'reduced pressure' ",
        r'(\d)×(\d)': "Leave space before and after × operator ",
        r'mol×L': "Use mol·L ",
        r'g×mol': "Use g·mol ",
        r'J×K': "Use J·K ",
        r'J×mol': "Use J·mol ",
        r'g×L': "Use g·L ",
        # Fixed: this entry was a duplicate 'g×L' key that silently
        # shadowed the one above; the message implies 'mg×mL' was meant.
        r'mg×mL': "Use mg·mL ",
        r'mol\.L-1': "Use mol·L-1 ",
        r'mol\.mL': "Use mol·mL ",
        r'g\.mol': "Use g·mol ",
        r'J\.K': "Use J·K ",
        r'J\.mol': "Use J·mol ",
        r'g\.L-1': "Use g·L-1 ",
        # Fixed: this entry was a duplicate 'g\.L-1' key that silently
        # shadowed the one above; the message implies 'mg\.mL-1' was meant.
        r'mg\.mL-1': "Use mg·mL-1 ",

        # Stereochemistry terminology
        r'relative stereochemistry': "Use 'relative configuration' ",
        r'absolute stereochemistry': "Use 'absolute configuration' ",
        r'assigned stereochemistry': "Use assigned configuration ",
        r'he stereochemistry of': "the configuration of... might be better",
        r'he stereochemistry was': "the configuration was... might be better",
        r'he stereochemistry is': "the configuration is... might be better",
        r'of stereochemistry': "of the configuration... might be better",
    }

    results = []
    # Iterate over each pattern in the dictionary
    for pattern_str, message in patterns.items():
        regex = re.compile(pattern_str)
        for match in regex.finditer(text):
            # Show a little context: up to 5 chars before and 10 after.
            start = max(0, match.start() - 5)
            end = min(len(text), match.end() + 10)
            context = text[start:end]

            # Store the warning message and the context
            results.append(f"{message}...{context}...")

    return results


def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using fitz (PyMuPDF).

    Args:
        pdf_file: The uploaded PDF file object (must support .getvalue()).

    Returns:
        str: The extracted text from the PDF (one trailing space appended
             after every page), or "" if extraction fails.
    """
    try:
        # Create a byte stream from the uploaded file
        pdf_bytes = io.BytesIO(pdf_file.getvalue())

        # Open the PDF with fitz
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # join() avoids the quadratic cost of repeated string +=;
            # a space after each page preserves the original output.
            text = "".join(page.get_text() + " " for page in doc)
        finally:
            # Release the document even if extraction raises mid-way
            # (previously doc.close() was skipped on an exception).
            doc.close()

        return text
    except Exception as e:
        # UI boundary: report the problem and degrade gracefully.
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""

def main():
    """Streamlit entry point: build the page layout and input tabs.

    Statement order defines the rendered UI, so the sequence below is
    significant: page config must come first, then styling, title, and
    the two input tabs (PDF upload / pasted text).
    """
    # set_page_config must be the first Streamlit call on the page.
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="🧪", layout="wide")
    # Inject CSS so long result lists scroll instead of being clipped.
    st.markdown(
    """
    <style>
    div.block-container {
    overflow-y: auto !important;
    }
    iframe {
    overflow: visible !important;
    }
    </style>
    """, unsafe_allow_html=True
    )
    st.title("Chemistry Text Analyzer")
    st.write("""
    This app analyzes chemistry text for common errors, inconsistencies, and formatting issues. 
    Upload a PDF file or paste your text in the box below to analyze it.
    """)
    # Create tabs for different input methods
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])
    with tab1:
        # PDF path: extract text with PyMuPDF, then run the same analysis.
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")
        if analyze_pdf and uploaded_file is not None:
            with st.spinner("Extracting text from PDF..."):
                text_content = extract_text_from_pdf(uploaded_file)
                if text_content:
                    st.success(f"Successfully extracted text from {uploaded_file.name}")
                    st.write("---")
                    analyze_content(text_content)
                else:
                    # extract_text_from_pdf returns "" on failure.
                    st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")
    with tab2:
        # Text input area
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")
        if analyze_text:
            if not text_input:
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Replace newlines with spaces to match the original behavior
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)

def analyze_content(text_content):
    """
    Analyze the text content and display results.

    Runs the three checkers (format/style, US-UK spelling, citations) and
    renders each result group in its own expander, followed by the total
    elapsed time.

    Args:
        text_content (str): The text to analyze.
    """
    start_time = time.time()

    with st.spinner("Analyzing text..."):
        # Use expanders for each analysis type to keep the UI clean
        with st.expander("Text Format and Style Issues", expanded=True):
            text_issues = check_text(text_content)
            if text_issues:
                for issue in text_issues:
                    st.write(issue)
            else:
                st.write("No text format or style issues found.")


     


        # Language Issues - Separate expander, not nested
        with st.expander("Language Issues", expanded=True):
            language_issues = check_US_UK_consistency(text_content)
            if language_issues:
                for issue in language_issues:
                    # markdown (not write) so the bullet formatting renders.
                    st.markdown(issue)
            else:
                st.write("No US/UK spelling inconsistencies found.")
        

        
                

        with st.expander("Citation Analysis", expanded=True):
            # Transform citations (this returns the transformed text)
            # NOTE(review): the transformed text itself is never shown or
            # returned — only this notice appears; confirm that is intended.
            transformed_text = transform_citations(text_content)
            if transformed_text != text_content:
                st.write("Citations were transformed to the proper format.")

            # Validate citations    
            citation_issues = validate_citation(text_content)
            if citation_issues:
                st.write("Citation issues found:")
                for issue in citation_issues:
                    st.write(issue)
            else:
                st.write("No citation issues found.")

    elapsed_time = time.time() - start_time
    st.write(f"Analysis completed in {elapsed_time:.2f} seconds.")


# Script entry point: launch the Streamlit app when run directly.
if __name__ == '__main__':
    main()