Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import tempfile | |
| from pathlib import Path | |
| import re | |
| import fitz # install PyMuPDF | |
| from molmass import Formula | |
| from functools import reduce | |
| import os | |
# Define source folder and console output colors
# NOTE(review): no console colors are defined next to this assignment, and the
# path is a machine-specific Windows folder -- confirm it is still used.
folder_path = r"C:\Users\match\Downloads"
| # ------------------------------------------------ | |
| # 1) Amino-acid dictionary (neutral, free AA) | |
| # ------------------------------------------------ | |
# Element counts of each neutral, free (unprotected, non-condensed) amino acid,
# keyed by three-letter code.  Water lost on peptide-bond formation is
# subtracted by the parser, not here.
amino_acids = {
    "Ala": {"C": 3, "H": 7, "N": 1, "O": 2},
    "Arg": {"C": 6, "H": 14, "N": 4, "O": 2},
    "Asn": {"C": 4, "H": 8, "N": 2, "O": 3},
    "Asp": {"C": 4, "H": 7, "N": 1, "O": 4},
    "Cys": {"C": 3, "H": 7, "N": 1, "O": 2, "S": 1},
    "Gln": {"C": 5, "H": 10, "N": 2, "O": 3},
    "Glu": {"C": 5, "H": 9, "N": 1, "O": 4},
    "Gly": {"C": 2, "H": 5, "N": 1, "O": 2},
    "His": {"C": 6, "H": 9, "N": 3, "O": 2},
    "Ile": {"C": 6, "H": 13, "N": 1, "O": 2},
    "Leu": {"C": 6, "H": 13, "N": 1, "O": 2},
    "Lys": {"C": 6, "H": 14, "N": 2, "O": 2},
    "Met": {"C": 5, "H": 11, "N": 1, "O": 2, "S": 1},
    "Phe": {"C": 9, "H": 11, "N": 1, "O": 2},
    "Pro": {"C": 5, "H": 9, "N": 1, "O": 2},
    "Ser": {"C": 3, "H": 7, "N": 1, "O": 3},
    "Thr": {"C": 4, "H": 9, "N": 1, "O": 3},
    "Trp": {"C": 11, "H": 12, "N": 2, "O": 2},
    "Tyr": {"C": 9, "H": 11, "N": 1, "O": 3},
    "Val": {"C": 5, "H": 11, "N": 1, "O": 2},
    # Non-natural amino acids
    "Orn": {"C": 5, "H": 12, "N": 2, "O": 2},  # ornithine
    "Dap": {"C": 7, "H": 14, "N": 2, "O": 4},  # 2,6-diaminopimelic acid
    "Aib": {"C": 4, "H": 9, "N": 1, "O": 2},  # α-aminoisobutyric acid
    "Acf": {"C": 11, "H": 13, "N": 1, "O": 3},  # 4-acetylphenylalanine
    "Azf": {"C": 9, "H": 10, "N": 4, "O": 2},  # azidophenylalanine
    "Cha": {"C": 9, "H": 17, "N": 1, "O": 2},  # cyclohexylalanine
    "Dpg": {"C": 8, "H": 17, "N": 1, "O": 2},  # dipropylglycine
    "Hcy": {"C": 4, "H": 9, "N": 1, "O": 2, "S": 1},  # homocysteine
    "Nle": {"C": 6, "H": 13, "N": 1, "O": 2},  # norleucine
    "Phg": {"C": 8, "H": 9, "N": 1, "O": 2},  # phenylglycine
    "Tpo": {"C": 4, "H": 10, "N": 1, "O": 6, "P": 1},  # phosphothreonine
    "Nva": {"C": 5, "H": 11, "N": 1, "O": 2},  # norvaline
    "Abu": {"C": 4, "H": 9, "N": 1, "O": 2},  # 2-aminobutyric acid
    # NOTE(review): the "Bta" and "Tpa" names are ambiguous; their formulas are
    # kept unchanged and should be checked against the intended compounds.
    "Bta": {"C": 8, "H": 8, "N": 1, "O": 2, "S": 1},  # benzothiazolealanine
    "Tpa": {"C": 9, "H": 11, "N": 1, "O": 2, "S": 1},  # thiophenylalanine
    "Bpa": {"C": 16, "H": 15, "N": 1, "O": 3},  # p-benzoylphenylalanine
    "Hph": {"C": 10, "H": 13, "N": 1, "O": 2},  # homophenylalanine
    "Dpr": {"C": 3, "H": 8, "N": 2, "O": 2},  # 2,3-diaminopropionic acid
    "Pip": {"C": 6, "H": 11, "N": 1, "O": 2},  # pipecolic acid
    "Pra": {"C": 5, "H": 7, "N": 1, "O": 2},  # propargylglycine
    "Hyp": {"C": 5, "H": 9, "N": 1, "O": 3},  # hydroxyproline
    "Sar": {"C": 3, "H": 7, "N": 1, "O": 2},  # sarcosine, N-methylglycine
}
| # ------------------------------------------------ | |
| # 2) N-terminal protections | |
| # ------------------------------------------------ | |
# Element counts of each N-terminal protecting group, keyed by abbreviation.
# "Z" and "Cbz" are two spellings with identical counts.
n_term_protections = dict(
    Fmoc=dict(C=15, H=11, O=2),
    Boc=dict(C=5, H=9, O=2),
    Z=dict(C=8, H=7, O=2),
    Cbz=dict(C=8, H=7, O=2),
    Ac=dict(C=2, H=3, O=1),
    Alloc=dict(C=4, H=5, O=2),
)
| # ------------------------------------------------ | |
| # 3) Side-chain modifications | |
| # ------------------------------------------------ | |
# Element counts added to a residue by each side-chain protecting group,
# keyed by the label written inside the parentheses, e.g. "Ser(tBu)".
# Each tuple lists counts in C, H, O, S order; trailing elements that are
# absent from a group are simply omitted.
side_chain_mods = {
    label: dict(zip(("C", "H", "O", "S"), counts))
    for label, counts in (
        ("OBn", (7, 6)),
        ("Boc", (5, 8, 2)),
        ("OtBu", (4, 8)),
        ("Ot-Bu", (4, 8)),
        ("Trt", (19, 14)),
        ("tBu", (4, 8)),
        ("OMe", (1, 2)),
        ("t-Bu", (4, 8)),
        ("Pbf", (13, 16, 3, 1)),
        ("Alloc", (4, 4, 2)),
        ("Z", (8, 6, 2)),
        ("Cbz", (8, 6, 2)),
        ("Fmoc", (15, 10, 2)),
    )
}
| # ------------------------------------------------ | |
| # 4) C-terminal modifications | |
| # ------------------------------------------------ | |
# C-terminal modifications, expressed as element counts to remove from and add
# to the free acid.  Every ester/amide here replaces the carboxylic -OH only,
# so exactly one O and one H are removed; the carbonyl oxygen stays.
c_term_mods = {
    "OH": {},  # free acid (no change)
    "OMe": {
        "remove": {"O": 1, "H": 1},
        "add": {"O": 1, "C": 1, "H": 3},
    },
    "OtBu": {
        "remove": {"O": 1, "H": 1},
        "add": {"O": 1, "C": 4, "H": 9},
    },
    # Primary amide: -COOH -> -CONH2 (net -O +N +H).
    "NH2": {
        "remove": {"O": 1, "H": 1},
        "add": {"N": 1, "H": 2},
    },
    # N-hydroxysuccinimide ester: acid + C4H5NO3 - H2O (net +C4H3NO2).
    "OSu": {
        "remove": {"O": 1, "H": 1},
        "add": {"C": 4, "H": 4, "N": 1, "O": 3},
    },
}
| # ------------------------------------------------ | |
| # 2) Utility Functions | |
| # ------------------------------------------------ | |
| from collections import Counter | |
def combine_formulas(*dicts):
    """Add up any number of element-count mappings.

    Returns a plain dict of element -> summed count; elements whose sum is
    zero are left out.
    """
    merged = Counter()
    for formula in dicts:
        # Counter.update adds counts (it does not overwrite existing entries).
        merged.update(formula)
    return {element: count for element, count in merged.items() if count}
def formula_to_str(formula_dict):
    """Render an element-count mapping as a sum-formula string.

    Elements appear in the order C, H, N, O, S, P, followed by all remaining
    elements alphabetically.  A count of 1 is written without a number.
    """
    priority = {
        symbol: rank
        for rank, symbol in enumerate(("C", "H", "N", "O", "S", "P"))
    }

    def sort_key(symbol):
        return (priority.get(symbol, 999), symbol)

    return "".join(
        symbol if formula_dict[symbol] == 1 else f"{symbol}{formula_dict[symbol]}"
        for symbol in sorted(formula_dict, key=sort_key)
    )
| # ------------------------------------------------ | |
| # 3) Parsing Function | |
| # ------------------------------------------------ | |
def parse_protected_peptide(peptide_str):
    """Compute the sum formula of a protected peptide written in shorthand.

    The expected form is ``[N-term]-Res1-Res2-...-[C-term]``, for example
    ``"Fmoc-N-Me-L-Val-OH"`` or ``"Boc-Ser(tBu)-Gly-OMe"``.  Residues are
    three-letter codes from ``amino_acids``, optionally prefixed with ``NMe``
    and optionally followed by a side-chain group in parentheses.

    Args:
        peptide_str: The peptide shorthand string.

    Returns:
        A dict mapping element symbol to count (zero entries removed), or
        ``None`` if the string contains no residue or contains a chunk,
        residue or side-chain group that is not recognised.
    """
    # Stereo descriptors do not affect the sum formula: drop "(D)"/"(L)" and
    # "D-"/"L-".
    peptide_str = re.sub(r"\(D\)|\(L\)", "", peptide_str)
    peptide_str = re.sub(r"[DL]-", "", peptide_str)
    # Fuse "N-Me-" into the residue so the split on '-' keeps it attached,
    # e.g. "Fmoc-N-Me-Val-OH" -> "Fmoc-NMeVal-OH".
    peptide_str = peptide_str.replace("N-Me-", "NMe")

    parts = [p for p in peptide_str.split('-') if p]

    # Optional N-terminal protecting group (first chunk).
    n_term_group = None
    if parts and parts[0] in n_term_protections:
        n_term_group = parts[0]
        parts = parts[1:]

    # Optional C-terminal modification (last chunk).
    c_term_group = None
    if parts and parts[-1] in c_term_mods:
        c_term_group = parts[-1]
        parts = parts[:-1]

    # Protecting groups alone are not a peptide.
    if not parts:
        return None

    # Optional NMe, then 3-letter code, then optional "(sidechain)".
    # Examples: "Val", "NMeVal", "Val(tBu)", "NMeVal(tBu)"
    residue_pattern = re.compile(
        r"^(?P<nme>NMe)?(?P<aa>[A-Z][a-z]{2})(?P<side>\([A-Za-z0-9]+\))?$"
    )

    total_formula = Counter()
    for chunk in parts:
        m = residue_pattern.match(chunk)
        if not m:
            return None
        base_aa = m.group("aa")
        side_str = m.group("side")
        if base_aa not in amino_acids:
            return None

        # Start from the free amino acid.
        aa_formula = Counter(amino_acids[base_aa])

        if side_str:
            mod_key = side_str.strip("()")
            if mod_key not in side_chain_mods:
                return None
            aa_formula.update(side_chain_mods[mod_key])

        # N-methylation replaces one N-H by N-CH3: net +C +2H.
        if m.group("nme"):
            aa_formula["C"] += 1
            aa_formula["H"] += 2

        total_formula.update(aa_formula)

    # Every peptide bond releases one H2O.  This also holds when the amine
    # partner is N-methylated: the free N-methyl amino acid still carries one
    # N-H, which is the hydrogen lost in the condensation.
    n_bonds = len(parts) - 1
    total_formula["H"] -= 2 * n_bonds
    total_formula["O"] -= n_bonds

    # The N-terminal protecting group replaces one N-H.
    if n_term_group:
        total_formula.update(n_term_protections[n_term_group])
        total_formula["H"] -= 1

    # Apply the C-terminal modification as remove/add element counts.
    if c_term_group:
        mod_info = c_term_mods[c_term_group]
        for k, v in mod_info.get("remove", {}).items():
            total_formula[k] -= v
        for k, v in mod_info.get("add", {}).items():
            total_formula[k] += v

    # Drop elements whose count ended up at zero.
    return {el: cnt for el, cnt in total_formula.items() if cnt != 0}
def remove_volume(text):
    """Strip every "<number> <µL|uL|mL|ml>, " fragment from *text*."""
    volume_fragment = re.compile(r'\d+(\.\d+)? (µL|uL|mL|ml), ')
    return volume_fragment.sub('', text)
def is_molecular_formula2(s):
    """Return True if *s* consists only of element symbols with optional counts.

    Each token is one of the listed element symbols (or a literal "(" or "]"),
    optionally followed by a "-" and digits.  The whole string must be made of
    such tokens; the empty string is rejected.
    """
    formula_regex = r'^((Ac|Ag|Al|Am|Ar|As|At|Au|B|Ba|Be|Bh|Bi|Bk|Br|C|Ca|Cd|Ce|Cf|Cl|Cm|Co|Cr|Cs|Cu|Ds|D|Db|Dy|Er|Es|Eu|F|Fe|Fm|Fr|Ga|Gd|Ge|H|He|Hf|Hg|Ho|Hs|I|In|Ir|K|Kr|La|Li|Lr|Lu|Md|Mg|Mn|Mo|Mt|N|Na|Nb|Nd|Ne|Ni|No|Np|O|Os|P|Pa|Pb|Pd|Pm|Po|Pr|Pt|Pu|Ra|Rb|Re|Rf|Rg|Rh|Rn|Ru|S|Sb|Sc|Se|Sg|Si|Sm|Sn|Sr|Ta|Tb|Tc|Te|Th|Ti|Tl|Tm|U|V|W|Xe|Y|Yb|Zn|Zr|\(|\])[-]?\d*)+$'
    return re.match(formula_regex, s) is not None
def sort_hill_notation(chemical_formula):
    """Merge repeated elements and order the formula with C and H first.

    Counts of the same element are summed (a missing count means 1, negative
    counts are allowed).  Output is C, then H, then the remaining elements
    alphabetically, each followed by its explicit count.
    """
    totals = {}
    for symbol, raw_count in re.findall(r'([A-Z][a-z]*)([-]*\d*)', chemical_formula):
        amount = int(raw_count) if raw_count else 1
        totals[symbol] = totals.get(symbol, 0) + amount

    leading = [symbol for symbol in ('C', 'H') if symbol in totals]
    trailing = sorted(symbol for symbol in totals if symbol not in ('C', 'H'))
    return ''.join(f'{symbol}{totals[symbol]}' for symbol in leading + trailing)
def multiply_formula_complex(formula):
    """Expand parenthesised groups by their multiplier.

    ``"(C2H2)2"`` becomes ``"C4H4"`` and ``"(H-2)2"`` becomes ``"H-4"``.  A
    group without a trailing number is multiplied by 1.  Text outside
    parentheses is left untouched.
    """
    def expand_group(found):
        inner, factor_text = found.group(1), found.group(2)
        factor = int(factor_text) if factor_text else 1
        expanded = []
        for symbol, count_text in re.findall(r'([A-Z][a-z]*)(-?\d*)', inner):
            # An element without an explicit count stands for one atom.
            scaled = int(count_text) * factor if count_text else factor
            expanded.append(f"{symbol}{scaled}")
        return ''.join(expanded)

    return re.sub(r'\(([A-Za-z0-9-]+)\)(\d*)', expand_group, formula)
def replace_alkanes_with_formulas(input_string):
    """Replace stand-alone alkane stems ("hex", "meth", ...) by their formulas.

    A stem is only replaced when it is delimited by word boundaries on both
    sides; each replacement has the form "(CnHm)1".
    """
    stem_to_formula = {
        "icos": "(C20H42)1",
        "nonadec": "(C19H40)1",
        "octadec": "(C18H38)1",
        "heptadec": "(C17H36)1",
        "hexadec": "(C16H34)1",
        "pentadec": "(C15H32)1",
        "tetradec": "(C14H30)1",
        "tridec": "(C13H28)1",
        "dodec": "(C12H26)1",
        "undec": "(C11H24)1",
        "dec": "(C10H22)1",
        "non": "(C9H20)1",
        "oct": "(C8H18)1",
        "hept": "(C7H16)1",
        "cyclohex": "(C6H12)1",
        "hex": "(C6H14)1",
        "pent": "(C5H12)1",
        "but": "(C4H10)1",
        "prop": "(C3H8)1",
        "meth": "(CH4)1",
        "eth": "(C2H6)1",
    }

    def lookup(found):
        return stem_to_formula[found.group(0)]

    # One alternation of \bstem\b terms, in the table's order.
    alternatives = [r'\b' + stem + r'\b' for stem in stem_to_formula]
    return re.sub('|'.join(alternatives), lookup, input_string)
def clean_chemical_name_simplified(chemical_name):
    """Normalise a chemical name before fragment translation.

    Returns "" when any whitespace-separated word is shorter than four
    characters.  Otherwise the name is stripped of quotes, brackets, dots,
    stereo/position descriptors, one leading prefix such as "n-" or "iso",
    locants and punctuation, and a few suffixes split off by that cleanup
    (" yl", " one", ...) are glued back onto the preceding stem.
    """
    # Reject names containing very short words.
    for word in chemical_name.split():
        if len(word) < 4:
            return ""

    name = chemical_name
    if name.startswith("of "):
        name = name[3:]

    # Plain substring replacements, applied in this exact order.
    for fragment, substitute in (
        ("'", ""), ("[", ""), ("]", ""), (".", ""), ("’", ""),
        ("tris-(", "tris("), ("bis-(", "bis("),
        ("tert-", ""), ("cis-", ""), ("trans-", ""), ("sec-", ""),
        ("Tert-", ""), ("Cis-", ""), ("Trans-", ""), ("Sec-", ""),
        ("Ortho-", ""), ("Meta-", ""), ("para-", ""),
        ("ortho-", ""), ("meta-", ""),
    ):
        name = name.replace(fragment, substitute)

    # Bracketed ring descriptors such as [2.2.1].
    name = re.sub(r'\[\d+\.\d+\.\d+\]', '', name)

    # Remove at most one leading prefix; the first match in this order wins.
    for leading in ("n,n-", "n,o-", "iso", "n-", "s-", "t-", "o-", "m-", "p-"):
        if name.startswith(leading):
            name = name[len(leading):]
            break

    # Brackets, commas, digits and hyphens become spaces; then single letters
    # standing alone between whitespace are dropped.
    name = re.sub(r'[\(\),\d-]', ' ', name)
    name = re.sub(r'\s[A-Za-z]\s', ' ', name)

    # Collapse whitespace runs.
    name = ' '.join(name.split())

    # Re-attach suffixes that the cleanup separated from their stem.
    for fragment, substitute in (
        (" yl", "yl"), (" en ", "en"), (" dien ", "dien"), (" enyl", "enyl"),
        (" ynyl ", "ynyl "), (" one", "one"), ("di ", "di"),
        (" dienyl ", "dienyl "),
    ):
        name = name.replace(fragment, substitute)
    return name
def translate_chemical_name_end_only(chemical_name, translation_dict):
    """Translate a suffix found at the very end of *chemical_name*.

    The first key of *translation_dict* (in its iteration order) that the name
    ends with is replaced by " (<formula>)<n>", where n comes from a numerical
    prefix directly before the suffix ("di", "tri", ...) or defaults to 1.
    If no key matches, the name is returned unchanged.
    """
    multiplier_values = {
        "undeca": "11", "dodeca": "12", "trideca": "13", "tetradeca": "14",
        "pentadeca": "15", "hexadeca": "16", "heptadeca": "17",
        "octadeca": "18", "nonadeca": "19",
        "mono": "1", "di": "2", "tri": "3", "tetra": "4",
        "penta": "5", "hexa": "6", "hepta": "7", "octa": "8",
        "nona": "9", "deca": "10",
    }
    for suffix, formula in translation_dict.items():
        if not chemical_name.endswith(suffix):
            continue
        stem = chemical_name[:len(chemical_name) - len(suffix)]
        # First multiplier word (in table order) that ends the stem, if any.
        hit = next((word for word in multiplier_values if stem.endswith(word)), None)
        if hit is None:
            factor = "1"
        else:
            factor = multiplier_values[hit]
            stem = stem[:len(stem) - len(hit)]
        return f"{stem} ({formula}){factor}"
    return chemical_name
def modify_chemical_name_yne(chemical_name):
    """Replace an alkyne ending ("yn"/"yne") by an explicit "(H-4)n" term.

    The ending is looked for directly before " (" (form "...yn (...)"),
    directly before a space (form "...yne ..."), or at the end of the whole
    name.  A numerical prefix right before the ending ("di", "tri", ...)
    becomes the multiplier n.  Sets the module-level flag ``stamm`` to True
    when an ending was translated.

    Args:
        chemical_name: Pre-cleaned, partially translated chemical name.

    Returns:
        The name with the ending replaced, or the unchanged name if no alkyne
        ending is found.
    """
    global stamm
    num_multiplier = {
        "mono": "1", "di": "2", "tri": "3", "tetra": "4",
        "penta": "5", "hexa": "6", "hepta": "7", "octa": "8",
        "nona": "9", "deca": "10"
    }
    # Split the name into the part ending in "yn"/"yne" and the remainder.
    if "yn (" in chemical_name:
        head, tail = chemical_name.split("yn (", 1)
        prefix = head + "yn"
        suffix = " (" + tail
    elif "yne " in chemical_name:
        head, tail = chemical_name.split("yne ", 1)
        prefix = head + "yne"
        suffix = " " + tail
    else:
        prefix = chemical_name
        suffix = ""

    # Check for "yn" or "yne" at the end of the prefix.
    for ending in ("yn", "yne"):
        if not prefix.endswith(ending):
            continue
        prefix = prefix[:len(prefix) - len(ending)]
        # Exactly one ending is handled here.  Counting every "yn" substring
        # in the name would also count substituents such as "ethynyl", which
        # are translated separately.  This matches the alkene handling.
        count = 1
        for multiplier, numeric_value in num_multiplier.items():
            if prefix.endswith(multiplier):
                count *= int(numeric_value)
                prefix = prefix[:len(prefix) - len(multiplier)]
                break
        # Each triple bond removes four hydrogens relative to the alkane.
        prefix += " (H-4)" + str(count)
        stamm = True
        return (prefix + " " + suffix).strip()
    return chemical_name
def modify_chemical_name_ene (chemical_name):
    """Replace an alkene ending ("en"/"ene") by an explicit "(H-2)n" term.

    The ending is looked for directly before " (", directly before a space
    (form "...ene ..."), or at the end of the whole name.  A numerical prefix
    right before the ending ("di", "tri", ...) becomes the multiplier n,
    otherwise n is 1.  Sets the module-level flag ``stamm`` to True when an
    ending was translated; returns the name unchanged when none is found.
    """
    global stamm
    global output_name
    factors = {
        "mono": "1", "di": "2", "tri": "3", "tetra": "4",
        "penta": "5", "hexa": "6", "hepta": "7", "octa": "8",
        "nona": "9", "deca": "10"
    }
    # Separate the stem that carries the ending from whatever follows it.
    if "en (" in chemical_name:
        before, after = chemical_name.split("en (", 1)
        stem, rest = before + "en", " (" + after
    elif "ene " in chemical_name:
        before, after = chemical_name.split("ene ", 1)
        stem, rest = before + "ene", " " + after
    else:
        stem, rest = chemical_name, ""

    for ending in ("en", "ene"):
        if not stem.endswith(ending):
            continue
        stem = stem[:len(stem) - len(ending)]
        # One ending, scaled by a directly preceding multiplier word if any.
        n_bonds = 1
        for word in factors:
            if stem.endswith(word):
                n_bonds *= int(factors[word])
                stem = stem[:len(stem) - len(word)]
                break
        stem += " (H-2)" + str(n_bonds)
        stamm = True
        return (stem + " " + rest).strip()
    return chemical_name
def translate_chemical_name_with_correct_multipliers(chemical_name, translation_dict):
    """Replace every known fragment in *chemical_name* by " (<formula>)<n> ".

    For each key of *translation_dict* (in its iteration order), occurrences
    preceded by a multiplier word ("di", "tri", ...) are replaced first, using
    that multiplier as n; any remaining bare occurrences get n = 1.  Sets the
    module-level flag ``stamm`` to True whenever a replacement happens.
    """
    global stamm
    global output_name
    multiplier_words = (
        ("mono", "1"), ("di", "2"), ("tri", "3"), ("tetra", "4"),
        ("penta", "5"), ("hexa", "6"), ("hepta", "7"), ("octa", "8"),
        ("nona", "9"), ("undeca", "11"), ("dodeca", "12"), ("deca", "10"),
    )
    for fragment, formula in translation_dict.items():
        # Multiplied occurrences first, so "dimethyl" is not read as
        # "di" + one methyl.
        for word, factor in multiplier_words:
            needle = word + fragment
            if needle not in chemical_name:
                continue
            chemical_name = chemical_name.replace(needle, f" ({formula}){factor} ")
            stamm = True
        # Whatever is left is a single occurrence.
        if fragment in chemical_name:
            stamm = True
            chemical_name = chemical_name.replace(fragment, f" ({formula})1 ")
    return chemical_name
| # Test the function with provided examples | |
def simplify_chemical_formula(formula):
    """Drop a count of exactly 1 after an element, e.g. "C11H12O1" -> "C11H12O".

    A "1" that directly follows a letter is removed unless another digit
    follows it (so "C11" and "C10" stay intact).
    """
    pieces = []
    length = len(formula)
    pos = 0
    while pos < length:
        char = formula[pos]
        pieces.append(char)
        following = pos + 1
        if char.isalpha() and following < length and formula[following] == '1':
            # Keep the "1" only when it starts a multi-digit number.
            if following + 1 < length and formula[following + 1].isdigit():
                pieces.append('1')
            # Either way the "1" has been dealt with; step over it.
            pos = following
        pos += 1
    return "".join(pieces)
| # Example usage with the corrected function | |
def name_to_sum_formula(input_name):
    """Derive a sum formula from a (semi-)systematic chemical name.

    The name is lower-cased and cleaned, then translated step by step into a
    string of "(formula)n" terms: parent compounds and functional classes,
    a terminal suffix, alkyne/alkene/alkane stems, and finally substituents.
    The terms are multiplied out, summed per element and ordered with C and H
    first.

    The module-level names ``stamm`` (True once a parent structure or stem was
    recognised) and ``output_name`` (the working string / result) are updated
    as a side effect.

    Args:
        input_name: Chemical name, any letter case.

    Returns:
        The sum formula as a string (counts of 1 omitted), or ``None`` when
        the name cannot be translated completely or yields a negative count.
    """
    global stamm
    global output_name
    # render all lowercase
    input_name = input_name.lower()
    stamm = False
    # remove hyphens, commas and numbers
    input_name = clean_chemical_name_simplified(input_name)

    # Parent compounds and compound classes, matched anywhere in the name.
    # Order matters: longer names must come before names they contain
    # (e.g. "benzothiazole" before "thiazole", "diamine" before "amine").
    translation_dict3 = {
        "one oxime": "N1O1H-1", "disulfide": "S2H2", "sulfide": "S1H2",
        "sulfonamide": "S1N1O2H1", "aza": "C-1N1H-1",
        "phosphoranylidene": "P1H1", "carbodiimide": "C1H2N2",
        "guanidine": "CH5N3", "indazole": "C7H6N2", "isatin": "C8H5N1O2",
        "tryptamine": "C10H12N2", "cysteamine": "C2H7NS", "sulfone": "H2S1O2",
        "ketone": "C1O1H2", "ketene": "C2O1H2", "methanone": "C1O1H2",
        "ether": "H2O1", "carbonate": "C1H2O3", "phosphate": "H3P1O4",
        "phosphoric acid": "H3P1O4", "phosphonate": "H3P1O3",
        "phosphonic acid": "H3P1O3", "phosphinate": "H3P1O2",
        "phosphinic acid": "H3P1O2", "hydroxylamine": "N1H3O1",
        "hydrochloride": "H1Cl1", "acetylene": "C2H2",
        "adenosine": "C10H13N5O4", "guanosine": "C10H13N5O5",
        "cytidine": "C9H13N3O5", "uridine": "C9H12N2O6",
        "thymidine": "C10H14N2O5", "purine": "C5H4N4", "thymine": "C5H6N2O2",
        "uracil": "C4H4N2O2", "adenine": "C5H5N5",
        # Fixed: was "C5H5M5O1" ("M" is not an element symbol).
        "guanine": "C5H5N5O1",
        "cytosine": "C4H5N3O1", "benzhydrazide": "C7H8N2O1",
        "hydrazide": "N2H4", "hydrazine": "H4N2", "hydrate": "H2O1",
        "benzamidine": "C7H8N2", "benzamide": "C7H7O1N1",
        "acetamidine": "C2H6N2", "cinnamaldehyde": "C9H8O1",
        "oxazolidinone": "C3H5N1O2", "benzoisoxazole": "C7H5N1O1",
        "isoxazole": "C3H3N1O1", "isoxazoline": "C3H5N1O1",
        "isoxazolidine": "C3H7N1O1", "benzooxazole": "C7H5N1O1",
        "oxazole": "C3H3N1O1", "oxazoline": "C3H5N1O1",
        "oxazolidine": "C3H7N1O1", "benzisothiazole": "C7H5N1S1",
        "benzothiazole": "C7H5N1S1", "isothiazole": "C3H3N1S1",
        "thiazole": "C3H3N1S1", "thiazoline": "C3H5N1S1",
        "thiazolidine": "C3H7N1S1", "thiazolone": "C3H3N1O1S1",
        "thiazolinone": "C3H5N1O1S1", "thiazolidinone": "C3H7N1O1S1",
        "thiadiazole": "C2H2N2S1", "adamantane": "C10H16", "cubane": "C8H8",
        "thiophenol": "C6H6S1", "acetonitrile": "C2H3N1",
        "propionitrile": "C3H5N1", "propanenitrile": "C3H5N1",
        "butanenitrile": "C4H7N1", "butyronitrile": "C4H7N1",
        "benzonitrile": "C7H5N1", "indolenine": "C8H7N1",
        "isoindoline": "C8H9N1", "indoline": "C8H9N1",
        "isoindolinone": "C8H7N1O1", "indolinone": "C8H7N1O1",
        "chromenone": "C9H6O2", "benzoindole": "C12H9N1",
        "oxindole": "C8H7N1O1", "indanone": " C9H8O1", "indanol": " C9H10O1",
        "indolone": "C8H7N1O1", "indole": "C8H7N1", "acetamide": "C2H5N1O1",
        "biphenyl": "C12H10", "binaphthalene": "C20H14",
        "binaphthyl": "C20H14", "phosphonium": "P1H5", "ammonium": "N1H4",
        "acridinium": "C13H10N", "sulfoxonium": "S1O1H3",
        "pyridinium": "C5H6N1", "benzophenone": "C13H10O1",
        "acetone": "C3H6O1", "acetophenone": "C8H8O1",
        "propiophenone": "C9H10O1", "anthracene": "C14H10",
        "phenanthrene": "C14H10", "pyrene": "C16H10", "diazene": "N2H2",
        "succinimide": "C4H5O2N1", "maleimide": "C4H3O2N1",
        "phthalimide": "C8H5N1O2", "isobenzofuran": "C8H6O1",
        "benzofuran": "C8H6O1", "furan": "C4H4O1",
        "benzothiophene": "C8H6S1", "thiophene": "C4H4S1",
        "valeric acid": "C5H10O2", "isobutyric acid": "C4H8O2",
        "butyric acid": "C4H8O2", "propionic acid": "C3H6O2",
        "propiolic acid": "C3H2O2", "formamide": "C1H3N1O1",
        "acetic acid": "C2H4O2",
        # Fixed: key was "adipic acid:" (stray colon), so it never matched.
        "adipic acid": "C6H10O4",
        "pimelic acid": "C7H12O4", "malonic acid": "C3H4O4",
        "succinic acid": "C4H6O4", "picolinic acid": "C6H5N1O2",
        "glycolic acid": "C2H4O3", "acetaldehyde": "C2H4O1",
        "propionaldehyde": "C3H6O1", "butyraldehyde": "C4H8O1",
        "valeraldehyde": "C5H10O1", "dioxaborinane": "C3H7B1O2",
        "dioxaborolane": "C2H5B1O2", "boric acid": "B1H3O3",
        "boronic acid": "B1O2H3", "boronate": "B1O2H3",
        "borinic acid": "B1O2H3", "borinate": "B1O2H3",
        "cinnamic acid": "C9H8O2",
        "glycine": "C2H5N1O2", "glycinol": "C2H7N1O1",
        "phenylalanine": "C9H11N1O2", "phenylalaninol": "C9H13N1O1",
        "alanine": "C3H7N1O2", "alaninol": "C3H9N1O1",
        "tryptophan": "C11H12N2O2", "tryptophanol": "C11H14N2O1",
        "valine": "C5H11N1O2", "valinol": "C5H13N1O1",
        "leucine": "C6H13N1O2", "leucinol": "C6H15N1O1",
        "isoleucine": "C6H13N1O2", "isoleucinol": "C6H15N1O1",
        "methionine": "C5H11N1O2S1", "methioninol": "C5H13N1O1S1",
        "proline": "C5H9N1O2", "prolinol": "C5H11N1O1",
        "serine": "C3H7N1O3", "serinol": "C3H9N1O2",
        "threonine": "C4H9N1O3", "threoninol": "C4H11N1O2",
        "cysteine": "C3H7N1O2S1", "cysteinol": "C3H9N1O1S1",
        "tyrosine": "C9H11N1O3", "tyrosinol": "C9H13N1O2",
        "asparagine": "C4H8N2O3", "asparaginol": "C4H10N2O2",
        # A second "glutaminol" entry ("C5H11N1O3") used to follow
        # "glutamic acid" and silently overrode this value; it and the
        # repeated "glycine"/"glycinol" entries were removed.
        "glutamine": "C5H10N2O3", "glutaminol": "C5H12N2O2",
        "lysine": "C6H14N2O2", "lysinol": "C6H16N2O1",
        "arginine": "C6H14N4O2", "argininol": "C6H16N4O1",
        "histidine": "C6H9N3O2", "histidinol": "C6H11N3O1",
        "aspartic acid": "C4H7N1O4", "aspartinol": "C4H9N1O3",
        "glutamic acid": "C5H9N1O4",
        "methanamine": "C1H5N1", "benzoic acid": "C7H6O2",
        "acrylaldehyde": "C3H4O1", "acrylic acid": "C3H4O2",
        "acrylamide": "C3H5N1O1", "benzoquinone": "C6H4O2",
        "hydroquinone": "C6H6O2", "phenol": "C6H6O1",
        "phloroglucinol": "C6H6O3", "anisidine": "C7H9N1O1",
        "resorcinol": "C6H6O2", "anisole": "C7H8O1", "toluene": "C7H8",
        "mesitylene": "C9H12", "xylene": "C8H10", "pyrimidinone": "C4H4N2O",
        "pyrimidine": "C4H4N2", "quinazoline": "C8H6N2",
        "quinazolinone": " C8H6N2O1", "quinoxaline": "C8H6N2",
        # Fixed: mismatched quotes had fused the next two entries into one
        # meaningless key via implicit string concatenation.
        "acridinone": "C13H9N1O1", "isoquinolinol": "C9H7NO",
        "quinolinol": "C9H7NO", "isoquinoline": "C9H7N1",
        "quinoline": "C9H7N1", "isoquinolinone": "C9H7N1O1",
        "quinolinone": "C9H7N1O1", "aniline": "C6H7N1", "triazine": "C3H3N3",
        "pyridinone": "C5H5N1O1", "pyridone": "C5H5N1O1",
        "tetralone": "C10H10O", "quinoxalinone": "C8H6N2O1",
        "bipyridine": "C10H8N2", "piperazine": "C4H10N2",
        "morpholine": "C4H9N1O1", "piperidine": "C5H11N1",
        "pyridine": "C5H5N1", "picoline": "C6H7N1", "oxirane": "C2H4O1",
        "aziridine": "C2H5N1", "azetidine": "C3H7N1",
        "bispyrrolidine": "C8H16N2", "pyrrolidine": "C4H9N1",
        "pyrrolidinone": "C4H7N1O1", "pyrrole": "C4H5N1",
        "furfural": "C5H4O2", "isonicotinaldehyde": "C6H5NO",
        "nicotinaldehyde": "C6H5NO", "isonicotinic acid": "C6H5NO2",
        "nicotinic acid": "C6H5NO2", "furaldehyde": "C5H4O2",
        "benzaldehyde": "C7H6O1", "salicylic acid": "C7H6O3",
        "salicylaldehyde": "C7H6O2", "indene": "C9H8",
        "phenanthroline": "C12H8N2", "benzotriazole": "C6H5N3",
        "triazole": "C2H3N3", "tetrazole": "C1H2N4",
        "benzimidazole": "C7H6N2", "imidazole": "C3H4N2",
        "imidazolidinone": "C3H6N2O", "coumarin": "C9H6O2",
        "chromene": "C9H8O1", "chromane": "C9H10O1", "pyrazole": "C3H4N2",
        "pyrazine": "C4H4N2", "pyridazine": "C4H4N2", "thiourea": "CH4N2S1",
        "urea": "CH4N2O1", "styrene": "C8H8", "benzene": "C6H6",
        "naphthalene": "C10H8", "naphthol": "C10H8O1", "oxide": "O1",
        "naphthoyl": "C11H7O1", "diamine": "N2H2", "amine": "N1H3",
        "phosphite": "P1H3O3", "disulfane": "S2H2", "sulfane": "H2S1",
    }
    output_name = translate_chemical_name_with_correct_multipliers(input_name, translation_dict3)
    output_name = ' '.join(output_name.split()).strip()

    # Suffixes that are only translated at the very end of the name.
    translation_dict1 = {
        "carbinol": "C1H2O1", "pyran": "C5H5O1", "sulfonic acid": "S1O3",
        "boronic acid": "B1O2H1", "propiolate": "C3H1O2",
        "benzoate": "C7H5O2", "naphthoate": "C11H7O2",
        "dicarboxylate": "C2O4", "carboxylate": "C1O2", "acetate": "C2H3O2",
        "pivalate": "C4H9O2", "propionate": "C3H5O2", "acrylate": "C3H3O2",
        "cinnamate": "C9H7O2", "formate": "C1H1O2",
        "trifluoromethanesulfonate": "C1F3S1O3", "triflate": "C1F3S1O3",
        "methanesulfonate": "C1H3S1O3", "mesylate": "C1H3S1O3",
        "sulfonate": "S1O3", "sulfinate": "S1O2H-1",
        "tetrafluoroborate": "B1F4", "trifluoroborate": "B1F3",
        "borate": "B1H3O3", "thiol": "S1", "azide": "N3", "iodide": "I1",
        "fluoride": "F1", "alcohol": "O1H1", "ol": "O1", "one": "O1H-2",
        "oic acid": "O2H-2", "carbazole": "C12H9N1",
        "carboxylic acid": "C1O2", "carboxamide": "C1O1N1H1",
        "carbonitrile": "C1N1H-1", "carbaldehyde": "C1O1",
        "carboxaldehyde": "C1O1", "bromide": "Br1", "tartrate": "C4H6O6",
        "chloride": "Cl1", "al": "O1H-2", "silane": "Si1H3",
        "phosphine": "P1H2", "phosphane": "P1H2", "borane": "B1H2",
        "oate": "O2H-2", "nitrile": "N1H-3", "perchlorate": "Cl1O4",
    }
    output_name = translate_chemical_name_end_only(output_name, translation_dict1)
    # remove multiple spaces
    output_name = ' '.join(output_name.split()).strip()
    # check alkynes
    output_name = modify_chemical_name_yne(output_name)
    output_name = ' '.join(output_name.split()).strip()
    # check alkenes
    output_name = modify_chemical_name_ene(output_name)
    output_name = ' '.join(output_name.split()).strip()
    # check alkanes
    if "ane" in output_name or "an (" in output_name or "a (" in output_name:
        stamm = True
    output_name = output_name.replace("an (", " (")
    output_name = output_name.replace("a (", " (")
    output_name = ' '.join(
        word[:-3] if word.endswith('ane') else word
        for word in output_name.split()
    )
    # If no parent structure or alkane/alkene/alkyne stem was recognised, add
    # one hydrogen to saturate the remaining substituent
    # (e.g. pentyl bromide: C5H10 + Br + H -> C5H11Br).
    if not stamm and output_name != "":
        output_name = output_name + " H1"

    # translate all substituents
    translation_dict = {
        "phosphorous": "P1", "sodium": "Na1", "potassium": "K1",
        "acetamido": "C2H3N1O1", "isobutyryl": "C4H6O1", "butyryl": "C4H6O1",
        "butanoyl": "C4H6O1", "valeroyl": "C5H8O1", "heptanoyl": "C7H12O1",
        "trichloroacetyl": "C2H-1Cl3O1", "nitroso": "N1O1H-1",
        "furyl": "C4H3O1", "isothiocyanate": "N1C1S1H-1",
        "thiocyanate": "N1C1S1H-1", "isocyanide": "N1C1H-1",
        "oxalate": "C2H2O4", "malonate": "C3H4O4",
        "isocyanate": "O1C1N1H-1", "benzoyl": "C7H4O1",
        "phenacyl": "C8H6O1", "methacryloyl": "C4H4O1",
        "acryloyl": "C3H2O1", "naphthoyl": "C11H7O1", "ethynyl": "C2",
        "propynyl": "C3H2", "butynyl": "C4H4", "pentynyl": "C5H6",
        "hexynyl": "C6H8", "trityl": "C19H14", "sulfonyl": "S1O2",
        "allyloxy": "C3H4O1", "allyl": "C3H4", "propargyl": "C3H2",
        "trifluoromethyl": "C1F3H-1", "trifluoromethoxy": "O1C1F3H-1",
        "isothiocyanato": "C1N1S1H-1", "thiocyanato": "C1N1S1H-1",
        "isocyanato": "C1N1O1H-1", "cyanato": "C1N1O1H-1",
        "isocyano": "C1N1H-1", "cyano": "C1N1H-1", "isocyanido": "C1N1H-1",
        "cyanido": "C1N1H-1", "nitro": "N1O2H-1", "azido": "N3H-1",
        "diazo": "N2H-2", "tosyl": "C7H6S1O2",
        "benzyloxycarbonyl": "C8H6O2", "benzyloxy": "C7H6O1",
        "propyloxy": "C3H6O1", "isobutyloxy": "C4H8O1",
        "butyloxy": "C4H8O1", "pentyloxy": "C5H10O1",
        "hexyloxy": "C6H12O1", "heptyloxy": "C7H14O1",
        "octyloxy": "C8H16O1", "nonyloxy": "C9H18O1", "benzyl": "C7H6",
        "glycyl": "C2H3N1O1", "geranylated": "C10H16", "geranyl": "C10H16",
        "neryl": "C10H16", "farnesyl": "C15H24", "prenyl": "C5H8",
        "styryl": "C8H6", "naphthyl": "C10H6", "phthaloyl": "C8H2O2",
        "phthalyl": "C8H2O2", "vinyl": "C2H2", "phenylthio": "C6H4S1",
        "thio": "S1", "phenyl": "C6H4", "mesityl": "C9H10",
        "naphthalenyl": "C10H6", "ptolyl": "C7H6", "otolyl": "C7H6",
        "mtolyl": "C7H6", "ethenyl": "C2H2", "isopropenyl": "C3H4",
        "cyclopropenyl": "C3H2", "propenyl": "C3H4", "cyclobutenyl": "C4H4",
        "isobutenyl": "C4H6", "butenyl": "C4H6", "butenynyl": "C4H2",
        "cyclopentenyl": "C5H6", "isopentenyl": "C5H8", "pentenyl": "C5H8",
        "cyclohexenyl": "C6H8", "isohexenyl": "C6H10",
        "cycloheptenyl": "C7H10", "heptenyl": "C7H12",
        "cyclooctenyl": "C8H12", "octenyl": "C8H14", "hexenyl": "C6H10",
        "pyranyl": "C5H5O1", "thienyl": "C4H3S1", "oxiranyl": "C2H3O1",
        "methylene": "C1", "tolyl": "C7H6", "isoindolinyl": "C8H7N1",
        "indolinyl": "C8H7N1", "adamantyl": "C10H14", "isopropyl": "C3H6",
        "cyclopropyl": "C3H4", "butadienyl": "C4H4", "pentadienyl": "C5H6",
        "methylated": "C1H2", "methyl": "C1H2", "formyl": "C1O1",
        "formamido": "C1H1O1N1", "ethylated": "C2H4", "ethyl": "C2H4",
        "propyl": "C3H6", "isobutyl": "C4H8", "cyclobutyl": "C4H6",
        "butyl": "C4H8", "cyclopentyl": "C5H8", "npentyl": "C5H10",
        "neopentyl": "C5H10", "pentyl": "C5H10", "cyclohexyl": "C6H10",
        "nhexyl": "C6H12", "hexyl": "C6H12", "heptyl": "C7H14",
        "octyl": "C8H16", "nonyl": "C9H18", "undecyl": "C11H22",
        "decyl": "C10H20", "boc": "C5H8O2", "fmoc": "C15H10O2",
        "cbz": "C8H6O2", "hydroxy": "O1", "mercapto": "S1", "hydro": "H1",
        "bromo": "Br1H-1", "hydrazino": "N2H2", "amino": "N1H1",
        "methoxycarbonyl": "C2H2O2", "methoxy": "C1H2O1",
        "phenoxy": "C6H4O1", "ethoxycarbonyl": "C3H4O2",
        "ethoxy": "C2H4O1", "propoxy": "C3H6O1",
        "butoxycarbonyl": "C5H8O2", "butoxy": "C4H8O1",
        "pyridinyl": "C5H3N1", "piperidinyl": "C5H9N1",
        "pyrrolidinyl": "C4H7N1", "iodosyl": "I1O1H-1", "iodo": "I1H-1",
        "fluoro": "F1H-1", "chloro": "Cl1H-1", "silyloxy": "Si1O1H2",
        "silyl": "Si1H2", "oxo": "O1H-2", "spiro": "H-4", "bicyclo": "H-4",
        "cyclo": "H-2", "acetoxy": "C2H2O2", "acetyl": "C2H2O1",
        "propionyl": "C3H4O1", "propanoyl": "C3H4O1",
    }
    output_name = translate_chemical_name_with_correct_multipliers(output_name, translation_dict)
    output_name = ' '.join(output_name.split()).strip()
    output_name = replace_alkanes_with_formulas(output_name)
    output_name = output_name.replace(" ", "")
    # multiply, e.g. (C2H2)2 to give C4H4 etc.
    output_name = multiply_formula_complex(output_name)
    # Anything left that is not a pure formula means part of the name was not
    # understood; discard the result in that case.
    if is_molecular_formula2(output_name):
        output_name = sort_hill_notation(output_name)
    else:
        output_name = ""
    output_name = output_name.replace("H0", "")
    # Remove index 1, e.g. C11H12O1 becomes C11H12O
    output_name = simplify_chemical_formula(output_name)
    # Reject results with a negative index on any element (H-1, C-1, ...);
    # a "-" can only appear in the final formula as such a count.
    if "-" in output_name or output_name == "":
        output_name = None
    return output_name
def custom_replace(input_string):
    """Expand common group/ligand abbreviations into bracketed sum formulas.

    Every abbreviation listed below that occurs in ``input_string`` (for
    example ``Ph``, ``Boc`` or ``dppf``) is replaced by its parenthesised
    elemental composition.  Matching is plain, case-sensitive substring
    replacement, applied one abbreviation after another in the listed order.

    Args:
        input_string: Text that may contain abbreviations.

    Returns:
        The text with all listed abbreviations expanded.
    """
    # Order matters: an abbreviation that contains another one as a substring
    # must be listed first (e.g. "Troc"/"Trt" before "Tr", "pTs" before "Ts",
    # "tBu" before "Bu"), otherwise the shorter one would eat part of it.
    replacements = (
        ("CDP", "(C37H30P2)"), ("Pin", "(C6H12O2)"), ("(dtbbpy)", "(C18H24N2)"),
        ("Ph", "(C6H5)"), ("Bn", "(C7H7)"), ("Bz", "(C7H5O)"), ("Tf", "(CF3SO2)"),
        ("Ac", "(CH3CO)"), ("acac", "(C5H7O2)"), ("TMS", "(C3H9Si)"), ("tBu", "(C4H9)"),
        ("n-Bu", "(C4H9)"), ("t-Bu", "(C4H9)"), ("iPr", "(C3H7)"), ("i-Pr", "(C3H7)"),
        ("n-Pr", "(C3H7)"), ("TBS", "(SiC6H15)"), ("Boc", "(C5H9O2)"), ("Fmoc", "(C15H11O2)"),
        ("Troc", "(C3H2Cl3O2)"),
        ("Trt", "(C19H15)"), ("Cp", "(C5H5)"), ("p-Ts", "(C7H7SO2)"), ("pTs", "(C7H7SO2)"),
        ("Ts", "(C7H7SO2)"), ("Tos", "(C7H7SO2)"), ("Tr", "(C19H15)"), ("dppb", "(C28H28P2)"),
        ("dppe", "(C26H24P2)"), ("nbd", "(C7H8)"), ("Ns", "(C6H4NO4S)"), ("Ad", "(C10H15)"),
        ("nPr", "(C3H7)"), ("MOM", "(C2H5O)"), ("Piv", "(C5H9O)"), ("Et", "(C2H5)"),
        ("Me", "(CH3)"), ("dppp", "(C27H26P2)"), ("(TES)", "(C6H15Si)"), ("nBu", "(C4H9)"),
        ("pin", "(C6H12O2)"), ("bpy", "(C10H8N2)"), ("COD", "(C8H12)"), ("cod", "(C8H12)"),
        ("Nf", "(C4F9O2S)"), ("TIPS", "(C9H21Si)"), ("dppf", "(C34H28FeP2)"),
        ("Bu-t", "(C4H9)"), ("Cy", "(C6H11)"), ("Cbz", "(C8H7O2)"),
        ("(TFA)", "(CF3CO2)"), ("(ppy)", "(C11H9N)"), ("dba", "(C17H14O)"), ("Bu", "(C4H9)"),
        ("pic", "(C6H4N1O2)"),
    )
    expanded = input_string
    for abbreviation, formula in replacements:
        expanded = expanded.replace(abbreviation, formula)
    return expanded
def convert_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space character."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def replace_patterns_compound_numbers(text):
    """Drop a free-standing compound number that sits directly before a '('.

    A whitespace-delimited integer, bare (``12``) or parenthesised (``(12)``),
    that is followed by whitespace and an opening parenthesis is removed, so
    ``"aldehyde 12 (10 mg"`` becomes ``"aldehyde (10 mg"``.
    """
    numbered_before_paren = re.compile(r'\s+(?:\d+|\(\d+\))\s+\(')
    return numbered_before_paren.sub(' (', text)
def is_molecular_formula(s):
    """Return True if ``s`` is a syntactically valid molecular formula.

    A valid formula consists only of known element symbols, optional positive
    integer counts after a symbol or a closing bracket, and properly nested
    ``()`` / ``[]`` brackets, e.g. ``C6H12O6``, ``(CH3)2CO`` or
    ``[Fe(C5H5)2]``.  Symbols are case-sensitive; two-letter symbols are tried
    before one-letter symbols.

    Args:
        s: Candidate formula string.

    Returns:
        True for a well-formed formula containing at least one element,
        otherwise False (including for empty input and explicit zero counts).
    """
    elements = {'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br',
                'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr', 'Cs', 'Cu', 'Ds', 'D', 'Db', 'Dy',
                'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho',
                'Hs', 'I', 'In', 'Ir', 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt',
                'N', 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb', 'Pd', 'Pm',
                'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh', 'Rn', 'Ru', 'S', 'Sb', 'Sc',
                'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U',
                'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'}
    # Only ASCII digits count: str.isdigit() also accepts characters such as
    # '²' that int() cannot convert.
    ascii_digits = '0123456789'

    # Nothing to parse is not a formula.
    if not s:
        return False

    # Pass 1: brackets must be balanced and correctly nested.
    opener_for = {')': '(', ']': '['}
    open_brackets = []
    for ch in s:
        if ch in '([':
            open_brackets.append(ch)
        elif ch in opener_for:
            if not open_brackets or open_brackets.pop() != opener_for[ch]:
                return False
    if open_brackets:
        return False

    def end_of_count(start):
        """Return the index after an optional count at ``start``; None if it is zero."""
        end = start
        while end < len(s) and s[end] in ascii_digits:
            end += 1
        if end > start and int(s[start:end]) == 0:
            return None
        return end

    # Pass 2: every remaining character must belong to an element symbol or
    # to the count that follows a symbol / closing bracket.
    i = 0
    saw_element = False
    while i < len(s):
        if s[i] in '([':
            i += 1
            continue
        if s[i] in ')]':
            i = end_of_count(i + 1)
        else:
            for length in (2, 1):
                if i + length <= len(s) and s[i:i + length] in elements:
                    saw_element = True
                    i = end_of_count(i + length)
                    break
            else:
                # Neither a two- nor a one-letter symbol starts here.
                return False
        if i is None:
            # An explicit count of zero was found.
            return False
    # Strings made of brackets only (e.g. "()") carry no composition.
    return saw_element
def remove_first_word(input_string):
    """Return ``input_string`` without its first whitespace-separated word.

    The remaining words are re-joined with single spaces.  A one-word input
    yields an empty string; an input without any word yields None.
    """
    tokens = input_string.split()
    if not tokens:
        return None
    _, *remaining = tokens
    return ' '.join(remaining)
def extract_info_from_list(string_list):
    """Split reagent snippets into parallel name / mass / amount lists.

    Each text is searched for four space-separated tokens followed by
    ``(<number> mg, <number> mmol``.  For a hit, the four tokens, the mg value
    and the mmol value (both as float) are recorded; for a miss, None is
    recorded in all three lists so that the indices stay aligned with the
    input.

    Args:
        string_list: Iterable of text snippets.

    Returns:
        Tuple ``(names, masses_mg, amounts_mmol)`` of equally long lists.
    """
    reagent_re = re.compile(r'([^ ]+ [^ ]+ [^ ]+ [^ ]+) \((\d+(?:\.\d+)?) mg, (\d+(?:\.\d+)?) mmol')
    names, masses_mg, amounts_mmol = [], [], []
    for snippet in string_list:
        hit = reagent_re.search(snippet)
        if hit is None:
            # Keep a placeholder so the three lists stay index-aligned.
            name = mass = amount = None
        else:
            name = hit.group(1)
            mass = float(hit.group(2))
            amount = float(hit.group(3))
        names.append(name)
        masses_mg.append(mass)
        amounts_mmol.append(amount)
    return names, masses_mg, amounts_mmol
def switch_order(text):
    """Rewrite ``(<amount> mmol|µmol, <mass> mg|g)`` as ``(<mass> ..., <amount> ...)``.

    Only parenthesised pairs that list the molar amount first are touched;
    the numbers and units themselves are carried over unchanged.
    """
    amount_then_mass = r'\((\d+(?:\.\d+)?) (mmol|µmol), (\d+(?:\.\d+)?) (mg|g)\)'

    def mass_first(match):
        amount, amount_unit, mass, mass_unit = match.groups()
        return f"({mass} {mass_unit}, {amount} {amount_unit})"

    return re.sub(amount_then_mass, mass_first, text)
def convert_units(input_string):
    """Convert mass and amount units to the standard forms ``mg`` and ``mmol``.

    The conversions run in a fixed order on the whole text (case-insensitive):
    kg -> g, g -> mg, mol -> mmol, µmol -> mmol, µg -> mg.  Because kg is
    first turned into g, it ends up in mg as well.

    A unit is only recognised when it ends at a word boundary, so ordinary
    words such as "2 groups" or "3 molecules" are left alone.  Converted
    values are written as plain decimals (never exponent notation) so the
    ``\\d+(?:\\.\\d+)?`` patterns used elsewhere can still read them, and
    fractional milligrams are kept instead of being truncated.

    Args:
        input_string: Text containing quantities such as ``1.5 g`` or ``0.2 mol``.

    Returns:
        The text with the quantities rewritten in mg / mmol.
    """
    number = r'(\d+(?:\.\d+)?)'
    # PDFs use either the micro sign (U+00B5) or the Greek letter mu (U+03BC).
    micro = r'[\u00b5\u03bc]'

    def as_decimal(value):
        """Format like a float ("500.0", "0.25") but without exponent or float noise."""
        text = f"{round(value, 9):.9f}".rstrip('0')
        return text + '0' if text.endswith('.') else text

    def as_mass(value):
        """Like as_decimal, but whole numbers are written without ".0" ("1500")."""
        text = as_decimal(value)
        return text[:-2] if text.endswith('.0') else text

    conversions = [
        (number + r'\s*kg\b', lambda m: f"{as_mass(float(m.group(1)) * 1000)} g"),
        (number + r'\s*g\b', lambda m: f"{as_mass(float(m.group(1)) * 1000)} mg"),
        (number + r'\s*mol\b', lambda m: f"{as_decimal(float(m.group(1)) * 1000)} mmol"),
        (number + r'\s*' + micro + r'mol\b', lambda m: f"{as_decimal(float(m.group(1)) / 1000)} mmol"),
        (number + r'\s*' + micro + r'g\b', lambda m: f"{as_decimal(float(m.group(1)) / 1000)} mg"),
    ]
    for pattern, replacement in conversions:
        input_string = re.sub(pattern, replacement, input_string, flags=re.IGNORECASE)
    return input_string
def extract_values_from_text(text):
    """Collect every ``<four tokens> (<x> mg, <y> mmol)`` reagent mention in ``text``.

    Returns:
        A list with one string per non-overlapping match, rebuilt as
        ``"<four tokens> (<x> mg, <y> mmol)"``; empty when nothing matches.
    """
    reagent_re = re.compile(r'([^ ]+ [^ ]+ [^ ]+ [^ ]+) \((\d+(?:\.\d+)?) mg, (\d+(?:\.\d+)?) mmol')
    return [
        f"{name} ({mass} mg, {amount} mmol)"
        for name, mass, amount in reagent_re.findall(text)
    ]
def remove_equiv(text):
    """Remove equivalents annotations such as '1.5 eq, ', '2 equiv.' or '3 equiv '.

    A match is a number, one space, ``eq`` or ``equiv`` as a complete word,
    then an optional period, an optional comma and an optional single space.

    Args:
        text: Text to clean.

    Returns:
        The text with all such annotations deleted.
    """
    # "equiv" must be tried before "eq": regex alternation takes the first
    # branch that lets the match succeed, and with "eq" first the match would
    # stop after "eq" and leave "uiv." behind.  The word boundary keeps words
    # that merely start with "eq" (e.g. "equipment") intact.
    return re.sub(r'\d+(?:\.\d+)? (?:equiv|eq)\b\.?,? ?', '', text)
def remove_molprozent(text):
    """Delete mol-percent annotations, e.g. '25 mol%', '1.5 Mol%', '3 mol-%'.

    The number, the optional single space after it and the unit are removed;
    surrounding text is left untouched.
    """
    mol_percent = re.compile(r'\d+(?:\.\d+)? ?[Mm]ol-?%')
    return mol_percent.sub('', text)
def replace_sx_with_space(input_string):
    """Replace whitespace-delimited labels like ' S12 ' or ' S-12 ' by one space.

    The label must have a whitespace character on both sides, so a label at
    the very start or end of the string is kept.
    """
    return re.sub(r'\sS-\d+\s|\sS\d+\s', ' ', input_string)
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of all pages of a PDF.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of every page in page order, joined without a separator.
        A file named ``desktop.ini`` (any letter case) is skipped and yields
        an empty string.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    # Windows folder metadata can show up when a whole directory is scanned.
    if os.path.basename(file_path).lower() == 'desktop.ini':
        return ""
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(file_path) as pdf_document:
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
def remove_isolated_patterns(s):
    """Replace whitespace-delimited labels like ' (1a) ' or ' (2b) ' by one space.

    A label is digits plus exactly one lowercase letter in parentheses, with
    whitespace on both sides; the label and that whitespace become one space.
    """
    isolated_label = re.compile(r'\s+\(\d+[a-z]\)\s+')
    return isolated_label.sub(' ', s)
def modify_medication_format_v2(input_string):
    """Rewrite '<x> mg (<y> mmol) of <name>' as '<name> (<x> mg, <y> mmol)'.

    The same reordering is applied to the g / mol variant.  The mg/mmol
    pattern is substituted first and the g/mol pattern is then applied to
    that result.
    """
    name_first = r'\5 (\1 \2, \3 \4)'
    quantity_patterns = (
        # mg / mmol form
        r'(\d+(?:\.\d+)?)\s*(mg)\s*\((\d+(?:\.\d+)?)\s*(mmol)\)\s*of\s+([A-Za-z0-9,\-\(\)\s]+)',
        # g / mol form
        r'(\d+(?:\.\d+)?)\s*(g)\s*\((\d+(?:\.\d+)?)\s*(mol)\)\s*of\s+([A-Za-z0-9,\-\(\)\s]+)',
    )
    rewritten = input_string
    for quantity_pattern in quantity_patterns:
        rewritten = re.sub(quantity_pattern, name_first, rewritten)
    return rewritten
def clean_file_contents(file_contents):
    """Normalise raw PDF text so reagent quantities can be found by regex.

    The steps below run in a fixed order and later steps rely on the output
    of earlier ones, so the order must not be changed casually.

    Args:
        file_contents: Extracted text; it is passed through ``str()`` first.

    Returns:
        A single-line string with single spaces between tokens.
    """
    def sub_all(s, patterns):
        # Apply literal (old, new) replacements one after another.
        for old, new in patterns: s = s.replace(old, new)
        return s
    def sub_all_re(s, patterns):
        # Apply (regex, replacement) substitutions one after another.
        for pattern, repl in patterns: s = re.sub(pattern, repl, s)
        return s
    # Flatten line breaks and collapse all whitespace runs to single spaces.
    file_contents = ' '.join(str(file_contents).replace('\n', ' ').replace('\r', ' ').split()).strip()
    # "(5 mL, 10 ..." -> "(10 ...": drop a leading mL volume inside parentheses.
    file_contents = re.sub(r'\((?:\d+(?:\.\d+)?)\s*mL,\s*(\d+(?:\.\d+)?)', r'(\1', file_contents)
    # remove_volume is defined elsewhere in this file (not visible here).
    file_contents = remove_volume(file_contents)
    # Unify the spellings of mol-percent, then delete those annotations.
    file_contents = sub_all(file_contents, [(m, 'mol%') for m in ['Mol%', 'Mol-%', 'mol-%', 'mol %', 'Mol %']])
    file_contents = remove_molprozent(file_contents)
    # Character clean-up plus padding with filler "and" tokens after each
    # quantity.  Presumably the filler keeps the four-token name window used
    # by the extraction regexes from reaching back into the previous reagent
    # -- TODO confirm.
    # NOTE(review): the pair ("'", "'") maps an apostrophe to itself as
    # written; presumably a typographic quote was intended -- confirm.
    # NOTE(review): the first "equiv) " entry has no comma in its replacement
    # and " and " is padded with "annd"; unclear whether either is deliberate.
    file_contents = sub_all(file_contents, [("�", " "), ("‐", "-"), ("'", "'"), ("´", "'"), (";", ","),
                                            (" and ", " and and and and annd "),
                                            ("mmol) ", "mmol) and and and and and and "),
                                            ("mmol), ", "mmol), and and and and and and "),
                                            ("equiv) ", ") and and and and and and "),
                                            ("equiv), ", "), and and and and and and "),
                                            ("equiv.) ", "), and and and and and and "),
                                            ("equiv.), ", "), and and and and and and "),
                                            ("eq.) ", "), and and and and and and "),
                                            ("eq.), ", "), and and and and and and "),
                                            ("eq) ", "), and and and and and and "),
                                            ("eq), ", "), and and and and and and "), ("·", "."), ("⋅", "."),
                                            ("•", "."), ("・", "."), ("(±)-", ""), ("(±)–", ""), ("(–)-", ""),
                                            (" . ", " "), (" of ", " "), (", (", " ("), (" complex", ""),
                                            (" compound ", " "), (" Compound ", " "), ("- ", "-")])
    # " (12, " -> " (": drop a leading integer and comma just inside a parenthesis.
    file_contents = sub_all_re(file_contents, [(r' \(\d+, ', ' (')])
    # Replace the listed filler words (each with a space on both sides) by a single space.
    file_contents = sub_all(file_contents, [(d, ' ') for d in
                                            [' was ', ' added ', ' as ', ' light-yellow ', ' oil ', ' yellow ',
                                             ' crude ', ' vessel ', ' pink ', ' white ', ' from ', ' White ', ' and ',
                                             ' with ', ' solid ', ' liquid ', ' granule ']])
    # Unify the spellings of "equivalents" to "eq" (longest spelling first).
    file_contents = sub_all(file_contents, [(e, 'eq') for e in ['equivalents', 'equivs', 'equiv', 'eq.', 'Eq']])
    # Hydrate notation: ".2 H2O" -> "(H2O)2" for 1-9, and ". H2O" -> "(H2O)".
    file_contents = sub_all(file_contents, [(f'.{i} H2O', f'(H2O){i}') for i in range(1, 10)] + [('. H2O', '(H2O)')])
    # Subscript digits -> ASCII digits.
    file_contents = file_contents.translate(str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789'))
    # Remaining normalisers, applied in this order.
    for func in [remove_equiv, remove_molprozent, convert_units, modify_medication_format_v2,
                 switch_order, convert_multiple_spaces, replace_sx_with_space, remove_isolated_patterns,
                 replace_patterns_compound_numbers]:
        file_contents = func(file_contents)
    # Drop bracketed numbers such as "[12]" and collapse whitespace once more.
    return ' '.join(re.sub(r'\[\d+\]', '', file_contents).split()).strip()
def check_reagents(pdf_file_path):
    """
    Check reagents in a PDF file and return a list of report lines.

    For every "<name> (<x> mg, <y> mmol)" mention found in the cleaned PDF
    text, the molecular weight implied by the quantities (mg / mmol) is
    compared with a molecular weight resolved for the name.  One line is
    appended for every reagent whose molecular weight could be resolved,
    whatever the size of the deviation.

    The name is the four tokens in front of the quantity.  Resolution is tried
    on that full window first and, while nothing resolves, again after
    dropping the leading word (up to three times).

    Args:
        pdf_file_path (str): Path to the PDF file
    Returns:
        list: Report lines as strings.  If any exception is raised, a single
        "an error occurred: ..." entry is appended and the scan stops.
    """
    # NOTE(review): the nesting below, in particular which `if` the two
    # `else: mw = 666.66` branches belong to, should be confirmed against the
    # original file.
    reagent_errors = []
    try:
        file_contents = extract_text_from_pdf(pdf_file_path) # load the text content into a string
        file_contents = clean_file_contents(file_contents)
        # search for the following pattern "x mg Y (z mmol" where Y is a string and x and z are float or integer and transform it into "Y (x mg, z mmol)"
        pattern = r"(\d+\.?\d*) mg ([a-zA-Z0-9-]+) \((\d+\.?\d*) mmol\)"
        def transform_string(s):
            return re.sub(pattern, r"\2 (\1 mg, \3 mmol)", s)
        file_contents = transform_string(file_contents)
        result = extract_values_from_text(file_contents)
        # x: four-token names, y: masses in mg, z: amounts in mmol (index-aligned)
        x, y, z = extract_info_from_list(result)
        if x:
            for i in range(len(x)):
                if x[i]:
                    # Molecular weight implied by the stated quantities (mg / mmol).
                    # NOTE(review): if y[i] is 0, apparent_mw is 0 and the later
                    # mw / apparent_mw raises ZeroDivisionError, which the outer
                    # except turns into one error entry and ends the whole scan.
                    if z[i] != 0:
                        apparent_mw = y[i] / z[i]
                    else:
                        # Placeholder used when the amount is 0 mmol.
                        apparent_mw = 1000.66
                    # --- Attempt 1: the full four-token name ---
                    mw = None
                    words = x[i].split()
                    # Only names whose first word has more than 4 characters are parsed.
                    if words and len(words[0]) > 4:
                        formula_from_name = name_to_sum_formula(x[i])
                    else:
                        formula_from_name = ""
                    if formula_from_name:
                        mw_from_name = round(Formula(formula_from_name).mass, 2)
                        mw = mw_from_name
                    # A dictionary entry overrides the name-derived value.
                    if x[i] in chemical_dict and chemical_dict[x[i]]:
                        formula_from_dictionary = chemical_dict[x[i]]
                        mw_from_dictionary = round(Formula(formula_from_dictionary).mass, 2)
                        if mw_from_dictionary != mw:
                            mw = mw_from_dictionary
                            formula_from_name = False
                    if mw:
                        mass_error = abs(round(((mw / apparent_mw) - 1) * 100, 1))
                        # A trailing " *" marks values derived from the name rather than the dictionary.
                        error_msg = f"{x[i]} ({y[i]} mg, {z[i]} mmol) MW: {mw}, used: {apparent_mw:.2f} (Mass error: {mass_error}{'%)' + (' *' if formula_from_name else '')}"
                        reagent_errors.append(error_msg)
                    if not mw:
                        # --- Attempt 2: drop the first word and retry ---
                        x[i] = remove_first_word(x[i])
                        if x[i]:
                            mw = None
                            words = x[i].split()
                            # Parsed when the first word contains a non-letter or is longer than 4 characters.
                            if not all(c.isalpha() for c in words[0]) or len(words[0]) > 4:
                                formula_from_name = name_to_sum_formula(x[i])
                            else:
                                formula_from_name = None
                            if formula_from_name:
                                mw_from_name = round(Formula(formula_from_name).mass, 2)
                                mw = mw_from_name
                            if x[i] in chemical_dict and chemical_dict[x[i]]:
                                formula_from_dictionary = chemical_dict[x[i]]
                                if formula_from_dictionary:
                                    mw_from_dictionary = round(Formula(formula_from_dictionary).mass, 2)
                                    if mw_from_dictionary != mw:
                                        mw = mw_from_dictionary
                            if mw:
                                mass_error = abs(round(((mw / apparent_mw) - 1) * 100, 1))
                                # Unlike the other attempts, the " *" marker is placed inside the closing parenthesis here.
                                error_msg = f"{x[i]} ({y[i]} mg, {z[i]} mmol) MW: {mw}, used: {apparent_mw:.2f} (Mass error: {mass_error}{'%' + (' *' if formula_from_name else '')})"
                                reagent_errors.append(error_msg)
                            if not mw:
                                # --- Attempt 3: drop another word and retry ---
                                x[i] = remove_first_word(x[i])
                                if x[i]:
                                    mw = None
                                    words = x[i].split()
                                    if not all(c.isalpha() for c in words[0]) or len(words[0]) > 4:
                                        formula_from_name = name_to_sum_formula(x[i])
                                    else:
                                        formula_from_name = ""
                                    if formula_from_name:
                                        mw_from_name = round(Formula(formula_from_name).mass, 2)
                                        mw = mw_from_name
                                    if x[i] in chemical_dict and chemical_dict[x[i]]:
                                        formula_from_dictionary = chemical_dict[x[i]]
                                        if formula_from_dictionary:
                                            mw_from_dictionary = round(Formula(formula_from_dictionary).mass, 2)
                                            if mw_from_dictionary != mw:
                                                mw = mw_from_dictionary
                                                formula_from_name = False
                                        else:
                                            # 666.66 is a truthy sentinel: it suppresses the report line
                                            # and stops further attempts.
                                            # NOTE(review): with this nesting the branch cannot run,
                                            # because the enclosing condition already requires a
                                            # non-empty dictionary value -- confirm intended nesting.
                                            mw = 666.66
                                    if mw and mw != 666.66:
                                        mass_error = abs(round(((mw / apparent_mw) - 1) * 100, 1))
                                        error_msg = f"{x[i]} ({y[i]} mg, {z[i]} mmol) MW: {mw}, used: {apparent_mw:.2f} (Mass error: {mass_error}{'%)' + (' *' if formula_from_name else '')}"
                                        reagent_errors.append(error_msg)
                                    if not mw:
                                        # --- Attempt 4: last remaining word ---
                                        x[i] = remove_first_word(x[i])
                                        if x[i]:
                                            mw_from_name = None
                                            mw_from_dictionary = None
                                            # parse_protected_peptide and formula_to_str are defined
                                            # elsewhere in this file; presumably they resolve peptide
                                            # notation to a formula -- TODO confirm.
                                            formula = parse_protected_peptide(x[i])
                                            if formula is None:
                                                # Fall back to expanding group abbreviations (Ph, Boc, ...).
                                                replacement_formula = custom_replace(x[i])
                                            else:
                                                replacement_formula = formula_to_str(formula)
                                            if is_molecular_formula(replacement_formula):
                                                mw_from_formula = round(Formula(replacement_formula).mass, 2)
                                                mw = mw_from_formula
                                            else:
                                                replacement_formula = None
                                            # Name parsing is only tried when the token is not a formula
                                            # and is longer than 5 characters.
                                            if not replacement_formula and len(x[i]) > 5:
                                                formula_from_name = name_to_sum_formula(x[i])
                                            else:
                                                formula_from_name = None
                                            if formula_from_name:
                                                mw_from_name = round(Formula(formula_from_name).mass, 2)
                                                mw = mw_from_name
                                            if x[i] in chemical_dict and chemical_dict[x[i]]:
                                                formula_from_dictionary = chemical_dict[x[i]]
                                                if formula_from_dictionary:
                                                    mw_from_dictionary = round(Formula(formula_from_dictionary).mass, 2)
                                                    if mw_from_dictionary != mw:
                                                        mw = mw_from_dictionary
                                                        formula_from_name = False
                                                        replacement_formula = False
                                                else:
                                                    # Same sentinel as in attempt 3 (see note there).
                                                    mw = 666.66
                                            if mw and mw != 666.66:
                                                mass_error = abs(round(((mw / apparent_mw) - 1) * 100, 1))
                                                error_msg = f"{x[i]} ({y[i]} mg, {z[i]} mmol) MW: {mw}, used: {apparent_mw:.2f} (Mass error: {mass_error}{'%)' + (' *' if formula_from_name or replacement_formula else '')}"
                                                reagent_errors.append(error_msg)
    except Exception as e:
        # Any failure (PDF reading, formula parsing, arithmetic) ends the scan;
        # lines collected so far are kept and the error is reported as one entry.
        reagent_errors.append(f"an error occurred: {e}")
    return reagent_errors
# Name of the dictionary file; the relative path is resolved against the
# current working directory of the process.
file_name = "chemical_data.txt"
# Maps a chemical name to its formula string; stays empty if loading fails.
chemical_dict = {}
# Load the dictionary at import time. Expected line format: "<name>:<formula>".
try:
    with open(file_name, "r", encoding="utf-8") as file: # Explicit UTF-8 encoding
        for line in file:
            # Split each line into key and value
            parts = line.strip().split(":")
            # Lines without exactly one ':' are skipped.
            if len(parts) == 2:
                # Extract key and value (the formula may be an empty string)
                chemical = parts[0].strip()
                formula = parts[1].strip()
                # Add to the dictionary; a repeated name overwrites the earlier entry
                chemical_dict[chemical] = formula
except FileNotFoundError:
    # A missing file is not fatal: the app continues with an empty dictionary.
    print(f"File '{file_name}' not found. Make sure the file exists.")
except UnicodeDecodeError:
    # Entries read before the undecodable bytes are kept.
    print(f"Encoding issue detected. Try opening '{file_name}' with a different encoding.")
# Print the loaded dictionary
# print("Loaded Dictionary: chemical_dict.txt")
def main():
    """Render the Streamlit page: upload a PDF, check its reagents, show the report.

    The uploaded file is written to a temporary ``.pdf`` file, passed to
    ``check_reagents`` and the temporary file is removed afterwards.  Report
    lines whose stated mass error exceeds 10 % are shown in red; all lines
    can be downloaded as a plain-text file.
    """
    # Local import: only needed to escape PDF-derived text before it is
    # rendered as HTML below.
    import html

    st.set_page_config(
        page_title="PDF Reagent Checker",
        page_icon="🧪",
        layout="wide"
    )
    st.title("🧪 PDF Reagent Checker")
    st.markdown("Upload a PDF file to check for reagent-related issues.")
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type="pdf",
        help="Drag and drop a PDF file or click to browse"
    )
    # Compiled once instead of once per report line.
    mass_error_re = re.compile(r"Mass error:\s*([-+]?\d*\.?\d+)%")

    if uploaded_file is not None:
        tmp_file_path = ""  # Initialize so the finally clause can always test it
        try:
            # check_reagents works on a path, so persist the upload first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name
            with st.spinner("Processing PDF... Please wait."):
                reagent_list = check_reagents(tmp_file_path)
            if reagent_list:
                # Show the name of the uploaded file, then one line per reagent.
                st.text(f"{uploaded_file.name}")
                for reagent_error_line in reagent_list:
                    match = mass_error_re.search(reagent_error_line)
                    if match and abs(float(match.group(1))) > 10.0:
                        # The line contains text taken from the uploaded PDF and is
                        # rendered with unsafe_allow_html=True, so it must be escaped
                        # to keep markup in the PDF from being injected into the page.
                        st.markdown(
                            f"<span style='color:red'>{html.escape(reagent_error_line)}</span>",
                            unsafe_allow_html=True,
                        )
                    else:
                        st.text(reagent_error_line)
                # Plain-text report: file name first, then one line per reagent.
                results_download_text = "\n".join([uploaded_file.name, *reagent_list]) + "\n"
                st.download_button(
                    label="📥 Download Results",
                    data=results_download_text,
                    file_name=f"reagent_check_results_{uploaded_file.name}.txt",
                    mime="text/plain"
                )
            else:
                st.text("✅ No reagent issues found in this PDF!")
        except Exception as e:
            # UI boundary: report the problem instead of crashing the page.
            st.text(f"❌ An error occurred while processing the PDF: {str(e)}")
        finally:
            # Clean up temporary file
            if tmp_file_path and os.path.exists(tmp_file_path):
                os.unlink(tmp_file_path)
    else:
        st.text("👆 Please upload a PDF file to get started.")

    # Usage help.
    # NOTE(review): placed after the if/else so it is always visible; confirm
    # whether it was meant to appear only while no file is uploaded.
    with st.expander("ℹ️ How to use this app"):
        st.markdown("""
        1. **Upload a PDF**: Click the file uploader above or drag and drop a PDF file
        2. **Wait for processing**: The app will analyze your PDF for reagent-related issues
        3. **View results**: Any issues found will be displayed as plain text below
        4. **Download results**: If issues are found, you can download a summary report
        **Supported file format**: PDF only
        """)
# Build the Streamlit page only when this file is run as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()