import streamlit as st
import re
import time
import logging
import fitz  # PyMuPDF
import io

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


def check_US_UK_consistency(text):
    """
    Search the input text for inconsistent use of US and UK English spellings.

    A spelling pair is reported only when BOTH the US and the UK variant
    occur in the text.  A match is discarded when an integer that looks
    like a year (1900-2100) appears within 200 characters after it, since
    such matches are assumed to sit inside literature citations where mixed
    spellings are legitimate.

    Args:
        text (str): The string to search through.

    Returns:
        list: List of strings describing the inconsistencies found, or an
              empty list if none.
    """
    issues = []
    spelling_pairs = [
        ('analyze(?:d|ing)?', 'analyse(?:d|ing)?'),
        ('(?:un)?catalyze(?:d|s|ing)?', '(?:un)?catalyse(?:d|s|ing)?'),
        ('sulfur', 'sulphur'),
        ('aluminum', 'aluminium'),
        ('color(?:ed|ing|s|less)?', 'colour(?:ed|ing|s|less)?'),
        ('flavor(?:ed|ing|s)?', 'flavour(?:ed|ing|s)?'),
        ('liter', 'litre'),
        ('fiber', 'fibre'),
        ('meter', 'metre'),
        ('neighbor(?:ed|ing|s)?', 'neighbour(?:ed|ing|s)?'),
        ('(?:re)?organiz(?:e|ed|ing|es|ation)', '(?:re)?organis(?:e|ed|ing|es|ation)'),
        ('vapor', 'vapour'),
        ('behavior', 'behaviour'),
        ('realiz(?:e|ed|ing|es|ation)', 'realis(?:e|ed|ing|es|ation)'),
        ('synthetize(?:d|s)?', 'synthetise(?:d|s)?'),
        ('characteriz(?:e|ed|ing|es|ation)', 'characteris(?:e|ed|ing|es|ation)'),
        ('(?:re)?crystalliz(?:e|ed|ing|es|ation)', '(?:re)?crystallis(?:e|ed|ing|es|ation)'),
        ('polymeriz(?:e|ed|ing|es|ation)', 'polymeris(?:e|ed|ing|es|ation)'),
        ('oxidized', 'oxidised'),
        ('neutraliz(?:e|ed|ing|es|ation)', 'neutralis(?:e|ed|ing|es|ation)'),
        ('hydrolyzed', 'hydrolysed'),
        ('standardiz(?:e|ed|ing|es|ation)', 'standardis(?:e|ed|ing|es|ation)'),
        ('ioniz(?:e|ed|ing|es|ation)', 'ionis(?:e|ed|ing|es|ation)'),
        ('solubiliz(?:e|ed|ing|es|ation)', 'solubilis(?:e|ed|ing|es|ation)'),
        ('functionalized', 'functionalised'),
        ('electrolyzed', 'electrolysed'),
        ('homogeniz(?:e|ed|ing|es|ation)', 'homogenis(?:e|ed|ing|es|ation)'),
        ('lyophiliz(?:e|ed|ing|es|ation)', 'lyophilis(?:e|ed|ing|es|ation)'),
        ('polariz(?:e|ed|ing|es|ation)', 'polaris(?:e|ed|ing|es|ation)'),
        ('isomeriz(?:e|ed|ing|es|ation)', 'isomeris(?:e|ed|ing|es|ation)'),
        ('immobiliz(?:e|ed|ing|es|ation)', 'immobilis(?:e|ed|ing|es|ation)'),
        ('stabiliz(?:e|ed|ing|es|ation)', 'stabilis(?:e|ed|ing|es|ation)'),
        ('optimiz(?:e|ed|ing|es|ation)', 'optimis(?:e|ed|ing|es|ation)'),
        ('odor', 'odour'),
        ('galvaniz(?:e|ed|ing|es|ation)', 'galvanis(?:e|ed|ing|es|ation)'),
        ('(?:re)?model(?:ing|ed|s)?', '(?:re)?modell(?:ing|ed|s)?'),
        ('(?:re)?label(?:ing|ed|s)?', '(?:re)?labell(?:ing|ed|s)?'),
        ('gray', 'grey'),
    ]
    # Years 1900-2100; used to discard matches that sit inside citations.
    year_pattern = re.compile(r'\b(19\d{2}|20\d{2}|2100)\b')

    def _without_citation_hits(matches):
        # Keep only matches NOT followed by a year within 200 characters
        # (matches near a year are presumed to be journal citations).
        kept = []
        for match in matches:
            after_text = text[match.end():match.end() + 200]
            if not year_pattern.search(after_text):
                kept.append(match)
        return kept

    for us, uk in spelling_pairs:
        us_matches = list(re.finditer(r'\b' + us + r'\b', text, re.I))
        uk_matches = list(re.finditer(r'\b' + uk + r'\b', text, re.I))

        valid_us_matches = _without_citation_hits(us_matches)
        valid_uk_matches = _without_citation_hits(uk_matches)

        # Only an issue when BOTH variants are genuinely used in the text.
        if valid_us_matches and valid_uk_matches:
            issue = "Inconsistent UK/US spelling detected:\n\n"
            # Add US spelling examples (limit to 3)
            issue += "US spelling examples:\n"
            for match in valid_us_matches[:3]:
                start, end = match.span()
                context = text[max(0, start - 20):end + 20]
                issue += f" • ...{context}...\n"
            # Add UK spelling examples (limit to 3)
            issue += "\nUK spelling examples:\n"
            for match in valid_uk_matches[:3]:
                start, end = match.span()
                context = text[max(0, start - 20):end + 20]
                issue += f" • ...{context}...\n"
            issue += "\n→ Reminder: Maintain consistent spelling throughout the manuscript!"
            issues.append(issue)
    return issues


def transform_citations(text, journal_patterns=None):
    """
    Transform all citations in a text from the format
    "Journal Vol(Issue), Pages (Year)" to "Journal Year, Vol(Issue), Pages".

    Args:
        text (str): Text containing citations.
        journal_patterns (list[str] | None): Optional regex patterns for the
            journal abbreviations to recognize; defaults to a set of common
            chemistry journals.

    Returns:
        str: The text with every matching citation rewritten.
    """
    if journal_patterns is None:
        # Default patterns for common journals.
        # FIX: the previous escapes (r'ACS\. Catal.\.', r'Org\. Lett.\.',
        # r'Tetrahedron\. Lett.\.') could never match the real journal
        # abbreviations used elsewhere in this file (see validate_citation).
        journal_patterns = [
            r'J\. Am\. Chem\. Soc\.',
            r'Chem\. Eur\. J\.',
            r'Angew\. Chem\. Int\. Ed\.',
            r'ACS Catal\.',
            r'Org\. Lett\.',
            r'Tetrahedron Lett\.',
            # Add more journal patterns as needed
        ]

    # Create pattern for the full citation.
    journal_group = f"({'|'.join(journal_patterns)})"
    # Volume with an optional issue number in parentheses, e.g. "142(5)".
    volume_group = r'(\d+(?:\(\d+\))?)'
    pages_group = r'(\d+(?:[-–]\d+)?)'
    year_group = r'\((\d{4})\)'
    # The optional trailing period is consumed by the pattern so it can be
    # re-attached after the reordered citation.  (Previously it was only
    # peeked at, so re.sub left it in place and the replacement produced a
    # doubled period.)
    pattern = (f"{journal_group}\\s+{volume_group},\\s*{pages_group}"
               f"\\s*{year_group}(\\.?)")

    def replace_citation(match):
        journal, volume, pages, year, period = match.groups()
        return f"{journal} {year}, {volume}, {pages}{period}"

    # Replace all matching citations in the text.
    return re.sub(pattern, replace_citation, text)


def validate_citation(text):
    """
    Validate citations in the format "Journal Year, Volume" where Year and
    Volume are integers.

    For each supported journal, Year - Volume must equal a fixed offset
    derived from the journal's founding year.

    Args:
        text (str): Text containing citations to validate.

    Returns:
        list: Messages for each citation that fails validation.
    """
    # Dictionary mapping journals to their founding-year offsets.
    journal_offsets = {
        "J. Am. Chem. Soc.": 1878,
        "Org. Lett.": 1998,
        "Chem. Eur. J.": 1994,
        "ACS Catal.": 2010,
        "Angew. Chem. Int. Ed.": 1961,
        "Tetrahedron Lett.": 1959,
    }

    # Sort by length so longer journal names are matched first.
    sorted_journals = sorted(journal_offsets, key=len, reverse=True)
    journals_regex = '|'.join(re.escape(name) for name in sorted_journals)

    # Complete pattern with year and volume groups.
    pattern = f"({journals_regex})\\s+(\\d+),\\s*(\\d+)"

    results = []
    for match in re.finditer(pattern, text):
        journal = match.group(1)  # Exact journal match
        year = int(match.group(2))
        volume = int(match.group(3))
        offset = journal_offsets.get(journal)
        citation = f"{journal} {year}, {volume}"
        if offset is None:
            results.append(f"{citation}: Journal not supported.")
        elif year - volume != offset:
            results.append(f"{citation}: wrong year or volume (expected offset {offset})")
    return results


def check_text(text):
    """
    Search the input text for formatting and style problems using regexes
    and report each match together with some surrounding context.

    Args:
        text (str): The text to check.

    Returns:
        list: Warning messages, each including the matched context.
    """
    patterns = {
        r'\b(\S+\s+\d+(?:\.\d+)?\s+oC\b)': "Use the ° symbol in °C, not a superscripted o: ",
        r'\b\d+(?:\.\d+)?\s+%\s+yield\b': "No space between the numeric value and %: ",
        r'\b\d+(?:\.\d+)?\s*mg/ml\b': "The volume is specified in mL, not ml: ",
        r'\b\d+(?:\.\d+)?\s+ml\b': "The volume is specified in mL, not ml: ",
        r'\b(?:one|two|three)(?!-)\s+neck(?:ed)?\b|\b(?:round|flat)(?!-)\s+bottom\b|\bpear(?!-)\s+shaped\b': (
            "Hyphenate 'one-necked' and 'round-bottom, e.g. one-necked round-bottom flask): "
        ),
        # FIX: was r'...[-]\s*[mL]L\s+round\b' — the character class [mL]L
        # matched "mL"/"LL" instead of an optional "m" before "L".
        r'\b\d+(?:\.\d+)?-\s*m?L\s+round\b': "No hyphen around L and mL",
        r'\banti-bacterial\b': "Use 'antibacterial' without hyphen: ",
        r'\bco-operation\b': "Use 'cooperation' without hyphen: ",
        r'\bmicro-organism\b': "Use 'microorganism' without hyphen: ",
        r'\bmulti-colored\b': "Use 'multicolored' without hyphen: ",
        r'\bnon-polar\b': "Use 'nonpolar' without hyphen: ",
        r'\bphoto-redox\b': "Use 'photoredox' without hyphen: ",
        r'\bpre-cooled\b': "Use 'precooled' without hyphen: ",
        r'\bsuper-acid\b': "Use 'superacid' without hyphen: ",
        r'\bmembered-ring\b': "Use 'membered ring' without hyphen: ",
        r'\bMembered-Ring\b': "Use 'Membered Ring' without hyphen: ",
        r'\bBronsted acid\b': "Use ø in Brønsted: ",
        r'X-Ray': "Always use lowercase r in X-ray (even when capitalized): ",
        r'x ray': "Use X and hyphen in X-ray: ",
        r'X ray': "Use hyphen in X-ray: ",
        r'\(-\)-': "Use (–)- instead of ",
        r'\b\d+(?:\.\d+)?mL\b': "Missing space between value and mL: ",
        r'\b\d+(?:\.\d+)?µm\b': "Missing space between value and µm: ",
        r'\b\d+(?:\.\d+)?mm\b': "Missing space between value and mm: ",
        r'\b\d+(?:\.\d+)?cm\b': "Missing space between value and cm: ",
        r'\b\d+(?:\.\d+)?mg\b': "Missing space between value and mg: ",
        r'\b\d+(?:\.\d+)?min\b': "Missing space between value and min: ",
        # FIX: was r'(?' — an unterminated group that raised re.error at
        # compile time and crashed every analysis.
        r'->': "Use → instead of ->",
        r' MIN ': "Use min for minutes",
        r'vaccuum': "Misspelling of vacuum ",
        r'reduced vacuum': "Use 'reduced pressure' ",
        r'(\d)×(\d)': "Leave space before and after × operator ",
        r'mol×L': "Use mol·L ",
        r'g×mol': "Use g·mol ",
        r'J×K': "Use J·K ",
        r'J×mol': "Use J·mol ",
        r'g×L': "Use g·L ",
        # FIX: was a duplicate r'g×L' key that silently overwrote the entry
        # above; the message shows mg·mL was intended.
        r'mg×mL': "Use mg·mL ",
        r'mol\.L-1': "Use mol·L-1 ",
        r'mol\.mL': "Use mol·mL ",
        r'g\.mol': "Use g·mol ",
        r'J\.K': "Use J·K ",
        r'J\.mol': "Use J·mol ",
        r'g\.L-1': "Use g·L-1 ",
        # FIX: was a duplicate r'g\.L-1' key that silently overwrote the
        # entry above; the message shows mg·mL-1 was intended.
        r'mg\.mL-1': "Use mg·mL-1 ",
        # Stereochemistry
        r'relative stereochemistry': "Use 'relative configuration' ",
        r'absolute stereochemistry': "Use 'absolute configuration' ",
        r'assigned stereochemistry': "Use assigned configuration ",
        r'he stereochemistry of': "the configuration of... might be better",
        r'he stereochemistry was': "the configuration was... might be better",
        r'he stereochemistry is': "the configuration is... might be better",
        r'of stereochemistry': "of the configuration... might be better",
    }

    results = []
    # Iterate over each pattern in the dictionary.
    for pattern_str, message in patterns.items():
        regex = re.compile(pattern_str)
        for match in regex.finditer(text):
            # Show up to 5 chars before and 10 chars after the match.
            start = max(0, match.start() - 5)
            end = min(len(text), match.end() + 10)
            context = text[start:end]
            results.append(f"{message}...{context}...")
    return results


def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using fitz (PyMuPDF).

    Args:
        pdf_file: The uploaded PDF file object (Streamlit UploadedFile).

    Returns:
        str: The extracted text, or "" if extraction fails (an error is
             shown in the Streamlit UI instead of raising).
    """
    try:
        # Create a byte stream from the uploaded file.
        pdf_bytes = io.BytesIO(pdf_file.getvalue())
        # Open the PDF with fitz.
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        # Extract text from all pages, separated by spaces.
        text = ""
        for page in doc:
            text += page.get_text() + " "
        doc.close()
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""


def main():
    """Streamlit entry point: render the UI and dispatch the analysis."""
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="🧪", layout="wide")
    # NOTE(review): this markdown block is empty — custom CSS was probably
    # lost at some point; confirm against the original source.
    st.markdown(
        """ """,
        unsafe_allow_html=True
    )
    st.title("Chemistry Text Analyzer")
    st.write("""
    This app analyzes chemistry text for common errors, inconsistencies, and formatting issues.
    Upload a PDF file or paste your text in the box below to analyze it.
    """)

    # Create tabs for different input methods.
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")
        if analyze_pdf and uploaded_file is not None:
            with st.spinner("Extracting text from PDF..."):
                text_content = extract_text_from_pdf(uploaded_file)
            if text_content:
                st.success(f"Successfully extracted text from {uploaded_file.name}")
                st.write("---")
                analyze_content(text_content)
            else:
                st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        # Text input area.
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")
        if analyze_text:
            if not text_input:
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Replace newlines with spaces to match the PDF-extraction
                # behavior (patterns assume single-line text).
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)


def analyze_content(text_content):
    """
    Run all checks on the text and display the results in the UI.

    Args:
        text_content (str): The text to analyze.
    """
    start_time = time.time()
    with st.spinner("Analyzing text..."):
        # Use expanders for each analysis type to keep the UI clean.
        with st.expander("Text Format and Style Issues", expanded=True):
            text_issues = check_text(text_content)
            if text_issues:
                for issue in text_issues:
                    st.write(issue)
            else:
                st.write("No text format or style issues found.")

        # Language issues — separate expander, not nested.
        with st.expander("Language Issues", expanded=True):
            language_issues = check_US_UK_consistency(text_content)
            if language_issues:
                for issue in language_issues:
                    st.markdown(issue)
            else:
                st.write("No US/UK spelling inconsistencies found.")

        with st.expander("Citation Analysis", expanded=True):
            # Transform citations (this returns the transformed text).
            transformed_text = transform_citations(text_content)
            if transformed_text != text_content:
                st.write("Citations were transformed to the proper format.")
            # Validate citations on the original text.
            citation_issues = validate_citation(text_content)
            if citation_issues:
                st.write("Citation issues found:")
                for issue in citation_issues:
                    st.write(issue)
            else:
                st.write("No citation issues found.")

    elapsed_time = time.time() - start_time
    st.write(f"Analysis completed in {elapsed_time:.2f} seconds.")


if __name__ == '__main__':
    main()