| import streamlit as st |
| import re |
| import time |
| import logging |
| import fitz |
| import io |
|
|
| |
# Configure the root logger once at import time: INFO and above, with a
# timestamp and severity prefix on every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
|
|
|
# (US fragment, UK fragment) regex pairs. Each fragment is wrapped in \b...\b
# and matched case-insensitively. A fragment must only match spellings that
# are specific to its variant, otherwise consistent text gets flagged.
_SPELLING_PAIRS = (
    # "analyzes" is unambiguously US; "analyses" is also the plural of
    # "analysis" in both variants, so the UK side deliberately omits it.
    (r'analyz(?:e|ed|es|ing)', r'analys(?:e|ed|ing)'),
    (r'(?:un)?catalyz(?:e|ed|es|ing)', r'(?:un)?catalys(?:e|ed|es|ing)'),
    (r'sulfur', r'sulphur'),
    (r'aluminum', r'aluminium'),
    (r'color(?:ed|ing|s|less)?', r'colour(?:ed|ing|s|less)?'),
    (r'flavor(?:ed|ing|s)?', r'flavour(?:ed|ing|s)?'),
    (r'liter', r'litre'),
    (r'fiber', r'fibre'),
    (r'meter', r'metre'),
    (r'neighbor(?:ed|ing|s)?', r'neighbour(?:ed|ing|s)?'),
    (r'(?:re)?organiz(?:e|ed|ing|es|ation)', r'(?:re)?organis(?:e|ed|ing|es|ation)'),
    (r'vapor', r'vapour'),
    (r'behavior', r'behaviour'),
    (r'realiz(?:e|ed|ing|es|ation)', r'realis(?:e|ed|ing|es|ation)'),
    (r'synthetize(?:d|s)?', r'synthetise(?:d|s)?'),
    (r'characteriz(?:e|ed|ing|es|ation)', r'characteris(?:e|ed|ing|es|ation)'),
    (r'(?:re)?crystalliz(?:e|ed|ing|es|ation)', r'(?:re)?crystallis(?:e|ed|ing|es|ation)'),
    (r'polymeriz(?:e|ed|ing|es|ation)', r'polymeris(?:e|ed|ing|es|ation)'),
    (r'oxidized', r'oxidised'),
    (r'neutraliz(?:e|ed|ing|es|ation)', r'neutralis(?:e|ed|ing|es|ation)'),
    (r'hydrolyzed', r'hydrolysed'),
    (r'standardiz(?:e|ed|ing|es|ation)', r'standardis(?:e|ed|ing|es|ation)'),
    (r'ioniz(?:e|ed|ing|es|ation)', r'ionis(?:e|ed|ing|es|ation)'),
    (r'solubiliz(?:e|ed|ing|es|ation)', r'solubilis(?:e|ed|ing|es|ation)'),
    (r'functionalized', r'functionalised'),
    (r'electrolyzed', r'electrolysed'),
    (r'homogeniz(?:e|ed|ing|es|ation)', r'homogenis(?:e|ed|ing|es|ation)'),
    (r'lyophiliz(?:e|ed|ing|es|ation)', r'lyophilis(?:e|ed|ing|es|ation)'),
    (r'polariz(?:e|ed|ing|es|ation)', r'polaris(?:e|ed|ing|es|ation)'),
    (r'isomeriz(?:e|ed|ing|es|ation)', r'isomeris(?:e|ed|ing|es|ation)'),
    (r'immobiliz(?:e|ed|ing|es|ation)', r'immobilis(?:e|ed|ing|es|ation)'),
    (r'stabiliz(?:e|ed|ing|es|ation)', r'stabilis(?:e|ed|ing|es|ation)'),
    (r'optimiz(?:e|ed|ing|es|ation)', r'optimis(?:e|ed|ing|es|ation)'),
    (r'odor', r'odour'),
    (r'galvaniz(?:e|ed|ing|es|ation)', r'galvanis(?:e|ed|ing|es|ation)'),
    # Only the inflected forms differ ("modeling" vs "modelling"); the bare
    # words "model(s)" / "label(s)" are shared by both variants and must not
    # count as US spellings.
    (r'(?:re)?model(?:ing|ed)', r'(?:re)?modell(?:ing|ed)'),
    (r'(?:re)?label(?:ing|ed)', r'(?:re)?labell(?:ing|ed)'),
    (r'gray', r'grey'),
)

# Compiled once at import time instead of on every call.
_SPELLING_PATTERNS = tuple(
    (
        re.compile(r'\b' + us + r'\b', re.IGNORECASE),
        re.compile(r'\b' + uk + r'\b', re.IGNORECASE),
    )
    for us, uk in _SPELLING_PAIRS
)

# A year shortly after a match suggests the word sits inside a literature
# reference (e.g. a cited title), whose spelling the author cannot change.
_SPELLING_YEAR_PATTERN = re.compile(r'\b(19\d{2}|20\d{2}|2100)\b')
_SPELLING_YEAR_WINDOW = 200    # characters inspected after each match
_SPELLING_CONTEXT_CHARS = 20   # characters of context shown on each side
_SPELLING_MAX_EXAMPLES = 3     # examples listed per variant


def _spelling_matches_outside_references(pattern, text):
    """Return matches of *pattern* not followed by a year within the window."""
    kept = []
    for match in pattern.finditer(text):
        following = text[match.end():match.end() + _SPELLING_YEAR_WINDOW]
        if not _SPELLING_YEAR_PATTERN.search(following):
            kept.append(match)
    return kept


def _format_spelling_examples(text, matches):
    """Render up to the first few matches as bullet lines with context."""
    lines = []
    for match in matches[:_SPELLING_MAX_EXAMPLES]:
        start, end = match.span()
        snippet = text[max(0, start - _SPELLING_CONTEXT_CHARS):end + _SPELLING_CONTEXT_CHARS]
        lines.append(f" • ...{snippet}...\n")
    return "".join(lines)


def check_US_UK_consistency(text):
    """
    Search the input text for inconsistent use of US and UK English spellings.

    For every known spelling pair, an issue is reported when BOTH the US and
    the UK variant occur in the text. Matches followed within 200 characters
    by an integer between 1900 and 2100 are ignored.

    Args:
        text (str): The string to search through.

    Returns:
        list: One formatted message per inconsistent spelling pair (with up
            to three context examples per variant), or an empty list.
    """
    issues = []

    for us_pattern, uk_pattern in _SPELLING_PATTERNS:
        us_matches = _spelling_matches_outside_references(us_pattern, text)
        uk_matches = _spelling_matches_outside_references(uk_pattern, text)

        # Only a mix of both variants is an inconsistency.
        if not (us_matches and uk_matches):
            continue

        issues.append(
            "Inconsistent UK/US spelling detected:\n\n"
            "US spelling examples:\n"
            + _format_spelling_examples(text, us_matches)
            + "\nUK spelling examples:\n"
            + _format_spelling_examples(text, uk_matches)
            + "\n→ Reminder: Maintain consistent spelling throughout the manuscript!"
        )

    return issues
|
|
|
|
|
|
# Regex fragments for the journal abbreviations recognised by default.
# Periods are escaped so they only match a literal "." in the abbreviation.
_DEFAULT_CITATION_JOURNAL_PATTERNS = (
    r'J\. Am\. Chem\. Soc\.',
    r'Chem\. Eur\. J\.',
    r'Angew\. Chem\. Int\. Ed\.',
    r'ACS Catal\.',
    r'Org\. Lett\.',
    r'Tetrahedron Lett\.',
)


def transform_citations(text, journal_patterns=None):
    """
    Transform all citations in a text from format "Journal Vol(Issue), Pages (Year)"
    to "Journal Year, Vol(Issue), Pages".

    Args:
        text (str): The text containing citations.
        journal_patterns (list[str] | None): Regex fragments, one per journal
            abbreviation to recognise. Defaults to a built-in set of
            chemistry journals.

    Returns:
        str: The text with every recognised citation reordered. Text outside
            the matched citation (including a trailing period) is unchanged.
    """
    if journal_patterns is None:
        journal_patterns = _DEFAULT_CITATION_JOURNAL_PATTERNS

    # Named groups keep the replacement correct even if a caller-supplied
    # journal fragment contains capturing groups of its own.
    pattern = re.compile(
        r'(?P<journal>' + '|'.join(journal_patterns) + r')\s+'
        r'(?P<volume>\d+(?:\(\d+\))?),\s*'
        r'(?P<pages>\d+(?:[-–]\d+)?)\s*'
        r'\((?P<year>\d{4})\)'
    )

    # A period following the citation is not part of the match, so re.sub
    # leaves it in place; appending another one would double it.
    return pattern.sub(r'\g<journal> \g<year>, \g<volume>, \g<pages>', text)
|
|
|
|
# For each supported journal: publication year minus volume number, which is
# constant when one volume is published per year.
_CITATION_JOURNAL_OFFSETS = {
    "J. Am. Chem. Soc.": 1878,
    "Org. Lett.": 1998,
    "Chem. Eur. J.": 1994,
    "ACS Catal.": 2010,
    "Angew. Chem. Int. Ed.": 1961,
    "Tetrahedron Lett.": 1959,
}

# Longest names first so a longer abbreviation wins over any shorter one it
# contains. Spaces inside a name match any whitespace run, because text
# extracted from PDFs may break a journal name across lines.
_CITATION_JOURNAL_ALTERNATIVES = '|'.join(
    r'\s+'.join(re.escape(word) for word in name.split())
    for name in sorted(_CITATION_JOURNAL_OFFSETS, key=len, reverse=True)
)

# The year must be exactly four digits. This keeps "Journal 120, 1234 (1998)"
# (volume-first format) from being misread as year 120 / volume 1234.
_CITATION_YEAR_VOLUME_PATTERN = re.compile(
    r'(?P<journal>' + _CITATION_JOURNAL_ALTERNATIVES + r')'
    r'\s+(?P<year>\d{4}),\s*(?P<volume>\d+)'
)


def validate_citation(text):
    """
    Validate citations in the format "<Journal> <Year>, <Volume>".

    For each supported journal, Year - Volume must equal the journal's fixed
    offset. <Year> is a four-digit integer and <Volume> is an integer.

    Args:
        text (str): The text containing citations.

    Returns:
        list: One message per citation whose year and volume do not agree,
            or an empty list if all recognised citations are consistent.
    """
    results = []
    for match in _CITATION_YEAR_VOLUME_PATTERN.finditer(text):
        # Collapse whitespace so the matched name equals its dictionary key.
        journal = ' '.join(match.group('journal').split())
        year = int(match.group('year'))
        volume = int(match.group('volume'))

        offset = _CITATION_JOURNAL_OFFSETS[journal]
        if year - volume != offset:
            results.append(
                f"{journal} {year}, {volume}: wrong year or volume (expected offset {offset})"
            )

    return results
|
|
|
|
| def check_text(text): |
| """ |
| Searches the input text for various patterns using regex and shows context around matches. |
| """ |
| patterns = { |
| r'\b(\S+\s+\d+(?:\.\d+)?\s+oC\b)': "Use the ° symbol in °C, not a superscripted o: ", |
| r'\b\d+(?:\.\d+)?\s+%\s+yield\b': "No space between the numeric value and %: ", |
| r'\b\d+(?:\.\d+)?\s*mg/ml\b': "The volume is specified in mL, not ml: ", |
| r'\b\d+(?:\.\d+)?\s+ml\b': "The volume is specified in mL, not ml: ", |
| r'\b(?:one|two|three)(?!-)\s+neck(?:ed)?\b|\b(?:round|flat)(?!-)\s+bottom\b|\bpear(?!-)\s+shaped\b': ( |
| "Hyphenate 'one-necked' and 'round-bottom, e.g. one-necked round-bottom flask): " |
| ), |
| r'\b\d+(?:\.\d+)?[-]\s*[mL]L\s+round\b': "No hyphen around L and mL", |
| r'\banti-bacterial\b': "Use 'antibacterial' without hyphen: ", |
| r'\bco-operation\b': "Use 'cooperation' without hyphen: ", |
| r'\bmicro-organism\b': "Use 'microorganism' without hyphen: ", |
| r'\bmulti-colored\b': "Use 'multicolored' without hyphen: ", |
| r'\bnon-polar\b': "Use 'nonpolar' without hyphen: ", |
| r'\bphoto-redox\b': "Use 'photoredox' without hyphen: ", |
| r'\bpre-cooled\b': "Use 'precooled' without hyphen: ", |
| r'\bsuper-acid\b': "Use 'superacid' without hyphen: ", |
| r'\bmembered-ring\b': "Use 'membered ring' without hyphen: ", |
| r'\bMembered-Ring\b': "Use 'Membered Ring' without hyphen: ", |
| r'\bBronsted acid\b': "Use ø in Brønsted: ", |
| r'X-Ray': "Always use lowercase r in X-ray (even when capitalized): ", |
| r'x ray': "Use X and hyphen in X-ray: ", |
| r'X ray': "Use hyphen in X-ray: ", |
| r'\(-\)-': "Use (–)- instead of ", |
| r'\b\d+(?:\.\d+)?mL\b': "Missing space between value and mL: ", |
| r'\b\d+(?:\.\d+)?µm\b': "Missing space between value and µm: ", |
| r'\b\d+(?:\.\d+)?mm\b': "Missing space between value and mm: ", |
| r'\b\d+(?:\.\d+)?cm\b': "Missing space between value and cm: ", |
| r'\b\d+(?:\.\d+)?mg\b': "Missing space between value and mg: ", |
| r'\b\d+(?:\.\d+)?min\b': "Missing space between value and min: ", |
| r'(?<!\[)\b\d+(?:\.\d+)?M\b': "Missing space if M means molar (concentration): ", |
| r'\b\d+(?:\.\d+)?mM\b': "Missing space between value and mM: ", |
| r'\b\d+(?:\.\d+)?μM\b': "Missing space between value and μM: ", |
| r'\b(?!1[45]N)(\d+(?:\.\d+)?)N\b': "Missing space if N means normal (concentration): ", |
| r'\b\d+(?:\.\d+)?K\b': "Missing space if K means Kelvin: ", |
| r'\b\d+,\d+(?=\s?(?:g|mg|mol|mmol|M|h|min|°C|mL)\b)': "Incorrect use of a comma instead of a decimal point", |
| r',\s*\d+\.\s+\d+(?=\s?(?:g|mg|mol|mmol|h|min|°C|mL)\b)': "Unintended space? ", |
| r'(?<![a-zA-Z0-9])([‒−–-]\d+(?:\.\d+)?)\s*[‒−–-]{1,2}\s*([‒−–-]\d+(?:\.\d+)?)(?![a-zA-Z0-9])': "Use '–a.b to –c.d' for negative numeric ranges: ", |
| r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* °C\b': "Use en dash (–) for temperature ranges: ", |
| r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* g\b': "Use en dash (–) for mass ranges: ", |
| r'\b(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s* mg\b': "Use en dash (–) for mass ranges: ", |
| r'from\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b': |
| "Do not use en dash in a 'from X—Y' construction. Use 'from X to Y' instead: ", |
| r'between\s+(\d+(?:\.\d+)?)\s*[–—]\s*(\d+(?:\.\d+)?(?:\s*[A-Za-z°]*)?)\b': |
| "Do not use en dash in a 'between X—Y' construction. Use 'between X and Y' instead: ", |
| r'\b\d+\s+fold\b': "Hyphenate numeral and 'fold': ", |
| r'\b\d+(?:\.\d+)?°C\b': "Missing space between value and °C: ", |
| r'\b\d+(?:\.\d+)?° K\b': "Use K without °, e.g. 298 K: ", |
| r'\b\d+(?:\.\d+)?±\d+(?:\.\d+)?\b': "Missing spaces around the ± symbol: ", |
| r'\b\d+(?:\.\d+)?\s*uL\b': "Use μL instead of uL for microliters: ", |
| r'\b\d+(?:\.\d+)?\s*ug\b': "Use μg instead of ug for micrograms: ", |
| r'\b\d+(?:\.\d+)?\s*umol\b': "Use μmol instead of umol for micromol: ", |
| r'\b\d+(?:\.\d+)?\s*uM\b': "Use μM instead of uM for micromolar: ", |
| r'\b\d+(?:\.\d+)?ppm\b': "Missing space between value and ppm: ", |
| r'\b\d+(?:\.\d+)?bar\b': "Missing space between value and bar: ", |
| r'\b\d+(?:\.\d+)?mbar\b': "Missing space between value and mbar: ", |
| r'\b\d+(?:\.\d+)?\s*mol/l\b': "Use mol/L instead of mol/l: ", |
| r'\b\d+(?:\.\d+)?\s*g/l\b': "Use g/L instead of g/l: ", |
| r'\b\d+(?:\.\d+)?\s*mol·l–1\b': "Use mol·L⁻¹ instead of mol·l⁻¹: ", |
| r'\b\d+(?:\.\d+)?\s*g·l–1\b': "Use g·L⁻¹ instead of g·l⁻¹: ", |
| r'\b\d+(?:\.\d+)?\s*mhz\b': "Use MHz (capital H): ", |
| r'\b\d+(?:\.\d+)?\s*gr\b': "Use g instead of gr: ", |
| r'\b\d+(?:\.\d+)?\s*hrs?\b': "Use h instead of hr/hrs: ", |
| r'/[Ee]natio/': "Misspelling of enantio...: ", |
| r'/[Aa]symetr/': "Misspelling of asymmetr...: ", |
| r'/[Pp]thal/': "Misspelling of phthal...: ", |
| r'/[Nn]aphth.../': "Misspelling of naphth...: ", |
| r'/[Ss]terosel.../': "Misspelling of stereosel...: ", |
| r'\s+(-?\d+(\.\d+)?)\s+eq\.(?!\s*\d)': "Use 'equiv' for equivalents and 'eq.' for equation: ", |
| r'\s+(-?\d+(\.\d+)?)\s+eq\)(?!\s*\d)': "Use 'equiv' for equivalents: ", |
| r'[Cc]alc[\'´](?:d|ed)': "Use Calcd or calcd instead of ", |
| r'treated with': "Check if 'reacted/washed/extracted with' etc. is more appropriate than ", |
| |
| |
| r'Diels-Alder': "Use en dash (–) for Diels–Alder: ", |
| r'Bednorz-Müller': "Use en dash (–) for Bednorz–Müller: ", |
| r'Beer-Lambert': "Use en dash (–) for Beer–Lambert: ", |
| r'Bose-Einstein': "Use en dash (–) for Bose–Einstein: ", |
| r'Debye-Hückel': "Use en dash (–) for Debye–Hückel: ", |
| r'Fermi-Dirac': "Use en dash (–) for Fermi–Dirac: ", |
| r'Fischer-Tropsch': "Use en dash (–) for Fischer–Tropsch: ", |
| r'Fisher-Johns': "Use en dash (–) for Fisher–Johns: ", |
| r'Flory-Huggins': "Use en dash (–) for Flory–Huggins: ", |
| r'Franck-Condon': "Use en dash (–) for Franck–Condon: ", |
| r'Friedel-Crafts': "Use en dash (–) for Friedel–Crafts: ", |
| r'Geiger-Müller': "Use en dash (–) for Geiger–Müller: ", |
| r'Henderson-Hasselbalch': "Use en dash (–) for Henderson–Hasselbalch: ", |
| r'Jahn-Teller': "Use en dash (–) for Jahn–Teller: ", |
| r'Lee-Yang-Parr': "Use en dash (–) for Lee–Yang–Parr: ", |
| r'Lineweaver-Burk': "Use en dash (–) for Lineweaver–Burk: ", |
| r'Mark-Houwink': "Use en dash (–) for Mark–Houwink: ", |
| r'Meerwein-Ponndorf': "Use en dash (–) for Meerwein–Ponndorf: ", |
| r'Michaelis-Menten': "Use en dash (–) for Michaelis–Menten: ", |
| r'Stern-Volmer': "Use en dash (–) for Stern–Volmer: ", |
| r"van't Hoff-Le Bel": "Use en dash (–) for van't Hoff–Le Bel: ", |
| r'Wolff-Kishner': "Use en dash (–) for Wolff–Kishner: ", |
| r'Young-Laplace': "Use en dash (–) for Young–Laplace: ", |
| r'Ziegler-Natta': "Use en dash (–) for Ziegler–Natta: ", |
| r'Baeyer-Villiger': "Use en dash (–) for Baeyer–Villiger: ", |
| r'Schotten-Baumann': "Use en dash (–) for Schotten–Baumann: ", |
| r'Buchwald-Hartwig': "Use en dash (–) for Buchwald–Hartwig: ", |
| r'Kumada-Corriu': "Use en dash (–) for Kumada–Corriu: ", |
| r'Nozaki-Hiyama': "Use en dash (–) for Nozaki–Hiyama: ", |
| r'Suzuki-Miyaura': "Use en dash (–) for Suzuki–Miyaura: ", |
| r'Mizoroki-Heck': "Use en dash (–) for Mizoroki–Heck: ", |
| r'Wittig-Horner': "Use en dash (–) for Wittig–Horner: ", |
| r'Claisen-Schmidt': "Use en dash (–) for Claisen–Schmidt: ", |
| r'Stille-Kelly': "Use en dash (–) for Stille–Kelly: ", |
| r'Reformatsky-Claisen': "Use en dash (–) for Reformatsky–Claisen: ", |
| r'Sonogashira-Hagihara': "Use en dash (–) for Sonogashira–Hagihara: ", |
| r'Grubbs-Hoveyda': "Use en dash (–) for Grubbs–Hoveyda: ", |
| r'Hoveyda-Grubbs': "Use en dash (–) for Hoveyda–Grubbs: ", |
| r'Petasis-Ferrier': "Use en dash (–) for Petasis–Ferrier: ", |
| r'Mukaiyama-Michael': "Use en dash (–) for Mukaiyama–Michael: ", |
| r'Tsuji-Trost': "Use en dash (–) for Tsuji–Trost: ", |
| r'Horner-Wadsworth-Emmons': "Use en dash (–) for Horner–Wadsworth–Emmons: ", |
| r'Jorgensen-Hayashi': "Use en dash (–) and ø in Jørgensen–Hayashi: ", |
| r'Jørgensen-Hayashi': "Use en dash (–) in Jørgensen–Hayashi: ", |
| r'Ullmann-Goldberg': "Use en dash (–) for Ullmann–Goldberg: ", |
| r'Chan-Lam': "Use en dash (–) for Chan–Lam: ", |
| r'Hiyama-Denmark': "Use en dash (–) for Hiyama–Denmark: ", |
| r'Negishi-Brown': "Use en dash (–) for Negishi–Brown: ", |
| r'Corey-Fuchs': "Use en dash (–) for Corey–Fuchs: ", |
| r'Wacker-Tsuji': "Use en dash (–) for Wacker–Tsuji: ", |
| r'Stork-Danheiser': "Use en dash (–) for Stork–Danheiser: ", |
| r'Balz-Schiemann': "Use en dash (–) for Balz–Schiemann: ", |
| r'Barton-McCombie': "Use en dash (–) for Barton–McCombie: ", |
| r'Knoevenagel-Doebner': "Use en dash (–) for Knoevenagel–Doebner: ", |
| r'Gattermann-Koch': "Use en dash (–) for Gattermann–Koch: ", |
| r'Mukaiyama-Mannich': "Use en dash (–) for Mukaiyama–Mannich: ", |
| r'Evans-Tishchenko': "Use en dash (–) for Evans–Tishchenko: ", |
|
|
|
|
|
|
| |
| r'\b\d+(?:\.\d+)?\s*degree\b': "Use ° instead of 'degree': ", |
| r'\b\d+(?:\.\d+)?\s*percent\b': "Use % instead of 'percent': ", |
| r'\bvacuumed\b': "Use 'evacuated' or 'under vacuum' instead of 'vacuumed': ", |
|
|
| r'\b[Hh]eated\s+up\b': "Omit 'up', use 'heated': ", |
| r'\b[Cc]ooled\s+down\b': "Omit 'down', use 'cooled': ", |
| r'\b[Ww]armed\s+up\b': "Omit 'up', use 'warmed': ", |
| r'\b[Aa]bsorbed\s+on\b': "Check whether 'adsorbed on' is more appropriate: ", |
|
|
| r'(\d)\s*x\s*(\d)': "Use × operator instead of letter x for multiplication ", |
| r'->': "Use → instead of ->", |
| r' MIN ': "Use min for minutes", |
| r'vaccuum': "Misspelling of vacuum ", |
| r'reduced vacuum': "Use 'reduced pressure' ", |
| r'(\d)×(\d)': "Leave space before and after × operator ", |
| r'mol×L': "Use mol·L ", |
| r'g×mol': "Use g·mol ", |
| r'J×K': "Use J·K ", |
| r'J×mol': "Use J·mol ", |
| r'g×L': "Use g·L ", |
| r'g×L': "Use mg·mL ", |
| r'mol\.L-1': "Use mol·L-1 ", |
| r'mol\.mL': "Use mol·mL ", |
| r'g\.mol': "Use g·mol ", |
| r'J\.K': "Use J·K ", |
| r'J\.mol': "Use J·mol ", |
| r'g\.L-1': "Use g·L-1 ", |
| r'g\.L-1': "Use mg·mL-1 ", |
|
|
| |
| r'relative stereochemistry': "Use 'relative configuration' ", |
| r'absolute stereochemistry': "Use 'absolute configuration' ", |
| r'assigned stereochemistry': "Use assigned configuration ", |
| r'he stereochemistry of': "the configuration of... might be better", |
| r'he stereochemistry was': "the configuration was... might be better", |
| r'he stereochemistry is': "the configuration is... might be better", |
| r'of stereochemistry': "of the configuration... might be better", |
|
|
| |
| } |
|
|
| results = [] |
| |
| for pattern_str, message in patterns.items(): |
| regex = re.compile(pattern_str) |
| for match in regex.finditer(text): |
| |
| start = max(0, match.start() - 5) |
| end = min(len(text), match.end() + 10) |
|
|
| |
| context = text[start:end] |
|
|
| |
| results.append(f"{message}...{context}...") |
|
|
| return results |
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using fitz (PyMuPDF).

    Args:
        pdf_file: The uploaded PDF file object. It must provide getvalue()
            returning the file's bytes.

    Returns:
        str: The text of all pages, each page followed by a single space.
            Returns an empty string if extraction fails; the error is then
            logged and shown in the Streamlit UI.
    """
    try:
        pdf_bytes = io.BytesIO(pdf_file.getvalue())

        # The context manager closes the document even when reading a page
        # raises, so the handle is never leaked.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() + " " for page in doc)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the app, but
        # keep the traceback in the log for diagnosis.
        logging.exception("Failed to extract text from PDF")
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
|
|
def main():
    """
    Build the Streamlit page: a PDF-upload tab and a text-input tab.

    Either tab hands its text to analyze_content() once the corresponding
    button is pressed and usable text is available.
    """
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="🧪", layout="wide")

    # Page-level CSS overrides for container scrolling and iframe overflow.
    page_css = (
        "\n<style>\n"
        "div.block-container {\n"
        "overflow-y: auto !important;\n"
        "}\n"
        "iframe {\n"
        "overflow: visible !important;\n"
        "}\n"
        "</style>\n"
    )
    st.markdown(page_css, unsafe_allow_html=True)

    st.title("Chemistry Text Analyzer")
    st.write(
        "\nThis app analyzes chemistry text for common errors, inconsistencies, and formatting issues.\n"
        "Upload a PDF file or paste your text in the box below to analyze it.\n"
    )

    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")
        if analyze_pdf:
            if uploaded_file is None:
                # Mirror the text tab: tell the user why nothing happened.
                st.warning("Please upload a PDF file to analyze.")
            else:
                with st.spinner("Extracting text from PDF..."):
                    text_content = extract_text_from_pdf(uploaded_file)
                # Image-only PDFs yield whitespace-only text, which must
                # count as a failed extraction rather than a success.
                if text_content.strip():
                    st.success(f"Successfully extracted text from {uploaded_file.name}")
                    st.write("---")
                    # Same newline normalization as the text tab, so phrases
                    # broken across lines are still found.
                    analyze_content(text_content.replace('\n', ' '))
                else:
                    st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")
        if analyze_text:
            if not text_input.strip():
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Join lines so patterns spanning a line break still match.
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)
|
|
def analyze_content(text_content):
    """
    Analyze the text content and display results.

    Runs the format/style checks, the US/UK spelling consistency check, and
    the citation checks, writing each group of findings into its own
    Streamlit expander, followed by the elapsed analysis time.

    Args:
        text_content: The text to analyze
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()

    with st.spinner("Analyzing text..."):
        with st.expander("Text Format and Style Issues", expanded=True):
            text_issues = check_text(text_content)
            if text_issues:
                for issue in text_issues:
                    st.write(issue)
            else:
                st.write("No text format or style issues found.")

        with st.expander("Language Issues", expanded=True):
            language_issues = check_US_UK_consistency(text_content)
            if language_issues:
                for issue in language_issues:
                    st.markdown(issue)
            else:
                st.write("No US/UK spelling inconsistencies found.")

        with st.expander("Citation Analysis", expanded=True):
            transformed_text = transform_citations(text_content)
            if transformed_text != text_content:
                st.write("Citations were transformed to the proper format.")

            # Validate the transformed text: validate_citation expects
            # "Journal Year, Volume", which is the format that
            # transform_citations produces.
            citation_issues = validate_citation(transformed_text)
            if citation_issues:
                st.write("Citation issues found:")
                for issue in citation_issues:
                    st.write(issue)
            else:
                st.write("No citation issues found.")

    elapsed_time = time.perf_counter() - start_time
    st.write(f"Analysis completed in {elapsed_time:.2f} seconds.")
|
|
|
|
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()