Matchball committed on
Commit
83d4989
·
verified ·
1 Parent(s): 74babfa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1399 -0
app.py ADDED
@@ -0,0 +1,1399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Match
2
+ from molmass import Formula
3
+
4
+ import streamlit as st
5
+ import time
6
+ import logging
7
+ import fitz # PyMuPDF
8
+ import io
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
+
13
+
14
+
15
def remove_specific_lines_from_string(input_string):
    """Drop lines that consist solely of an integer, optionally prefixed by 'S'.

    Leading and trailing whitespace on such a line is tolerated. All other
    lines are kept unchanged and re-joined with newlines.
    """
    number_only = re.compile(r'^\s*(S)?\d+\s*$')
    kept = []
    for candidate in input_string.split('\n'):
        if number_only.match(candidate) is None:
            kept.append(candidate)
    return '\n'.join(kept)
20
+
21
def check_conditions(cleaned_results):
    """Return True only if every row passes both acceptance checks.

    A row passes when:
      * its 8th column (index 7) is one of "", "-0.0001", "+0.0001" or
        "Electron mass error", and
      * its 7th column (index 6) converts to a float strictly below 10.

    A 7th column that cannot be parsed as a float fails the row.
    """
    accepted_remarks = ("", "-0.0001", "+0.0001", "Electron mass error")
    for entry in cleaned_results:
        if entry[7] not in accepted_remarks:
            return False
        try:
            deviation = float(entry[6])
        except ValueError:
            return False
        if deviation >= 10:
            return False
    return True
34
+
35
def fix_floats(text):
    """Append a trailing zero to every float written with exactly three decimals.

    Args:
        text (str): The text to scan.

    Returns:
        str: The text in which each "xxxx.xxx" has become "xxxx.xxx0".
             Floats with fewer or more than three decimals are left alone.
    """
    three_decimals = re.compile(r'\b\d+\.\d{3}\b')
    # \g<0> is the whole match; the literal 0 after it is the padding digit.
    return three_decimals.sub(r'\g<0>0', text)
52
+
53
+
54
def remove_sublists_with_missing_element1_positions_swapped(cleaned_results):
    """Drop rows whose first element is '' when a complete twin row exists.

    Two rows are twins when their second elements are equal and their third
    and fourth elements are equal as an unordered pair (so they may be
    swapped). Rows with fewer than four elements are never compared and are
    always kept.

    Returns a new list; the input list is not modified.
    """
    def twin_key(entry):
        # frozenset makes the comparison of positions 3 and 4 order-insensitive.
        return (entry[1], frozenset((entry[2], entry[3])))

    complete_keys = {
        twin_key(entry)
        for entry in cleaned_results
        if len(entry) >= 4 and entry[0] != ''
    }

    survivors = []
    for entry in cleaned_results:
        redundant = (
            len(entry) >= 4
            and entry[0] == ''
            and twin_key(entry) in complete_keys
        )
        if not redundant:
            survivors.append(entry)
    return survivors
93
+
94
+
95
+
96
def remove_spaces_in_formula(text):
    """
    Collapse the spaces inside runs that look like chemical formulas.

    A run starts with an element symbol (an uppercase letter, optionally
    followed by one lowercase letter) and continues with further symbols or
    integers, each optionally preceded by whitespace. Only the space
    character is removed from a run; other whitespace is kept.

    Note: every float in the text is first wrapped in '#' characters, and
    those markers are still present in the returned string.

    Args:
        text: The input string containing chemical formulas.

    Returns:
        The processed string.
    """
    # Wrap floats in '#' before formula matching.
    marked = re.sub(r'(\d+\.\d+)', r'#\1#', text)

    symbol = r'[A-Z][a-z]?'
    digits = r'\d+'
    formula_run = re.compile(
        r'(' + symbol + r'(?:\s*(?:' + symbol + r'|' + digits + r'))+)'
    )

    return formula_run.sub(lambda hit: hit.group(0).replace(' ', ''), marked)
127
+
128
+
129
def remove_page_numbers(text):
    """
    Strip lines that look like page numbers from a block of text.

    A line is removed when, ignoring surrounding whitespace, it is:
    - a bare integer ("12"),
    - an integer framed by dashes ("- 12 -", "-13-"),
    - an integer with a p/P/s/S prefix and an optional dash
      ("P12", "s23", "S-12", "p -13").

    Args:
        text (str): Input text, lines separated by '\\n'.

    Returns:
        str: The text without the page-number lines.
    """
    page_number_forms = (
        r'^\s*\d+\s*$',                        # "12"
        r'^\s*-\s*\d+\s*-\s*$',                # "- 12 -"
        r'^\s*-\d+-\s*$',                      # "-13-"
        r'^\s*[psPS]\s*-?\s*\d+\s*(?:\n|$)',   # "P12", "s23", "S-12"
    )
    matcher = re.compile('|'.join(f'({form})' for form in page_number_forms))

    survivors = []
    for candidate in text.split('\n'):
        if matcher.match(candidate):
            continue
        survivors.append(candidate)
    return '\n'.join(survivors)
164
+
165
+
166
def is_float(value):
    """Return True if ``value`` can be converted with ``float()``, else False.

    Values of an unsupported type (for example ``None`` or a list) make
    ``float()`` raise ``TypeError`` rather than ``ValueError``; both are
    treated as "not a float" so the predicate never raises for such input.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True
172
+
173
def protect_floats(text: str) -> str:
    """Separate long floats from adjacent text with single spaces.

    Targets floats with at least three digits before and at least four digits
    after the decimal point. A space is inserted on a side unless the
    neighbouring character is already whitespace; at the very start or end of
    the text there is no neighbour, so a space is added there as well.
    """
    def pad(match: re.Match) -> str:
        """Return the matched float with the required padding."""
        begin, finish = match.span(1)
        neighbour_left = text[begin - 1] if begin > 0 else ''
        neighbour_right = text[finish] if finish < len(text) else ''
        lead = ' ' if not neighbour_left.isspace() else ''
        trail = ' ' if not neighbour_right.isspace() else ''
        return lead + match.group(1) + trail

    return re.sub(r'(\d{3,}\.\d{4,})', pad, text)
194
+
195
def replace_comma_with_decimal(text: str) -> str:
    """Turn decimal commas into decimal points.

    A candidate is a run of digits, one comma, and another run of digits,
    delimited by word boundaries on both sides. Only the comma inside each
    candidate is replaced; everything else is returned untouched.
    """
    comma_number = re.compile(r'\b(\d+,\d+)\b')
    return comma_number.sub(lambda hit: hit.group(0).replace(',', '.'), text)
209
+
210
def adjust_space_around_decimal(text):
    """Normalise whitespace around decimal points.

    Three substitutions are applied in order:
      1. whitespace on either side of a decimal point between digits is
         removed ("23. 4562" -> "23.4562");
      2. a space is inserted between a decimal number and a letter that
         directly follows it ("2.4beta" -> "2.4 beta");
      3. whitespace between a space-preceded integer and a dot-plus-letters
         token is removed ("file 12 .txt" -> "file 12.txt").

    Raises:
        TypeError: if ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")

    substitutions = (
        (r'(\d+)\s*\.\s*(\d+)', r'\1.\2'),
        (r'(\d+\.\d+)([A-Za-z])', r'\1 \2'),
        (r'(\s\d+)\s+(\.[A-Za-z]+\b)', r'\1\2'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
228
+
229
def decrease_element_count(molecular_formula: str, element_to_decrease: str) -> str:
    """
    Decreases the count of a specific element in a molecular formula by 1.

    Every occurrence of the element symbol in the formula is decremented.
    A symbol is only matched when it is not followed by a lowercase letter,
    so asking for 'C' does not touch 'Cl'. A missing count is treated as 1.

    Args:
        molecular_formula: The input molecular formula (e.g., 'C6H12O2')
        element_to_decrease: The element whose count should be decreased (e.g., 'C')

    Returns:
        Modified molecular formula with decreased element count. When the
        count drops to 1 the number is omitted; when it drops to 0 the
        element is removed from the formula.

    Example:
        >>> decrease_element_count('C6H12O2', 'C')
        'C5H12O2'
        >>> decrease_element_count('CH4', 'C')
        'H4'
    """
    # Escape the symbol so it is always matched literally.
    pattern = re.compile(fr'{re.escape(element_to_decrease)}(?![a-z])(\d*)')

    def replace_element(match: re.Match) -> str:
        digits = match.group(1)
        remaining = (int(digits) if digits else 1) - 1
        if remaining <= 0:
            # Previously a count of 1 left the element in place unchanged.
            return ''
        if remaining == 1:
            return element_to_decrease
        return f"{element_to_decrease}{remaining}"

    return pattern.sub(replace_element, molecular_formula)
257
+
258
+
259
def have_swapped_adjacent_digits(float1: float, float2: float) -> bool:
    """Tell whether two numbers differ by one transposition of neighbouring digits.

    Both values are rendered with ``str()``, their last two characters are
    discarded and the decimal point is removed. The result is True when the
    remaining digit strings have equal length (at least 2), differ in exactly
    two adjacent positions, and those two digits are exchanged.
    """
    digits_a = str(float1)[:-2].replace('.', '')
    digits_b = str(float2)[:-2].replace('.', '')

    if len(digits_a) != len(digits_b) or len(digits_a) < 2:
        return False

    mismatches = [
        position
        for position, (left, right) in enumerate(zip(digits_a, digits_b))
        if left != right
    ]
    if len(mismatches) != 2:
        return False

    first, second = mismatches
    if second - first != 1:
        return False

    return digits_a[first] == digits_b[second] and digits_a[second] == digits_b[first]
291
+
292
+
293
+
294
def differ_in_single_digit_except_last_two(float1: float, float2: float) -> bool:
    """
    Report whether two numbers differ in exactly one character before their last two.

    Each number is converted with ``str()`` and stripped of trailing zeros
    and a trailing decimal point. The comparison then requires equal length
    (at least 3 characters), identical final two characters, and exactly one
    differing character in the part before them.

    Args:
        float1: First floating-point number
        float2: Second floating-point number

    Returns:
        True if the conditions above hold, False otherwise

    Examples:
        differ_in_single_digit_except_last_two(123.45, 153.45)  -> True
        differ_in_single_digit_except_last_two(123.45, 153.46)  -> False
        differ_in_single_digit_except_last_two(123.450, 153.45) -> True
    """
    def canonical(value):
        return str(value).rstrip('0').rstrip('.')

    text_a = canonical(float1)
    text_b = canonical(float2)

    if len(text_a) != len(text_b) or len(text_a) < 3:
        return False
    if text_a[-2:] != text_b[-2:]:
        return False

    mismatches = 0
    for left, right in zip(text_a[:-2], text_b[:-2]):
        if left != right:
            mismatches += 1
    return mismatches == 1
332
+
333
+
334
def calculate_molecular_weight(formula):
    """Sum average atomic weights over a flat molecular formula.

    The formula is scanned for element symbols (an uppercase letter plus an
    optional lowercase letter), each followed by an optional integer count; a
    missing count means 1. Symbols absent from the table contribute 0.0.
    Brackets and any other characters are ignored by the scan.

    Returns:
        float: The total weight in g/mol (0.0 when nothing is recognised).
    """
    # Average atomic weights, hydrogen through plutonium, plus deuterium ("D").
    weight_table = {
        "H": 1.008, "D": 2.0141, "He": 4.002602, "Li": 6.94, "Be": 9.0121831, "B": 10.81, "C": 12.011,
        "N": 14.007, "O": 15.999, "F": 18.9984, "Ne": 20.1797, "Na": 22.98977, "Mg": 24.305, "Al": 26.98154,
        "Si": 28.085, "P": 30.97376, "S": 32.06, "Cl": 35.45, "Ar": 39.948, "K": 39.0983, "Ca": 40.078,
        "Sc": 44.955908, "Ti": 47.867, "V": 50.9415, "Cr": 51.9961, "Mn": 54.938044, "Fe": 55.845,
        "Co": 58.933194, "Ni": 58.6934, "Cu": 63.546, "Zn": 65.38, "Ga": 69.723, "Ge": 72.630,
        "As": 74.921595, "Se": 78.971, "Br": 79.904, "Kr": 83.798, "Rb": 85.4678, "Sr": 87.62,
        "Y": 88.90584, "Zr": 91.224, "Nb": 92.90637, "Mo": 95.95, "Tc": 98, "Ru": 101.07,
        "Rh": 102.90550, "Pd": 106.42, "Ag": 107.8682, "Cd": 112.414, "In": 114.818, "Sn": 118.710,
        "Sb": 121.760, "Te": 127.60, "I": 126.90447, "Xe": 131.293, "Cs": 132.90545196, "Ba": 137.327,
        "La": 138.90547, "Ce": 140.116, "Pr": 140.90766, "Nd": 144.242, "Pm": 145, "Sm": 150.36,
        "Eu": 151.964, "Gd": 157.25, "Tb": 158.92535, "Dy": 162.500, "Ho": 164.93033,
        "Er": 167.259, "Tm": 168.93422, "Yb": 173.04, "Lu": 174.9668, "Hf": 178.49,
        "Ta": 180.94788, "W": 183.84, "Re": 186.207, "Os": 190.23, "Ir": 192.217,
        "Pt": 195.084, "Au": 196.96657, "Hg": 200.592, "Tl": 204.38, "Pb": 207.2,
        "Bi": 208.9804, "Po": 209, "At": 210, "Rn": 222, "Fr": 223, "Ra": 226,
        "Ac": 227, "Th": 232.0377, "Pa": 231.03588, "U": 238.02891, "Np": 237, "Pu": 244
    }

    total = 0.0
    for token in re.finditer(r"([A-Z][a-z]?)(\d*)", formula):
        symbol, digits = token.group(1), token.group(2)
        multiplier = int(digits) if digits else 1
        total += weight_table.get(symbol, 0.0) * multiplier
    return total
368
+
369
def remove_spaces_within_brackets(s, max_chars=20):
    """
    Removes all spaces within brackets () or [] if the number of non-space characters inside
    is within max_chars. Nested brackets are handled, and spaces outside the qualifying
    brackets are never touched.

    Closing brackets that do not match the most recent opening bracket are ignored.

    The positions to delete are computed against the unmodified input and applied in a
    single pass. The earlier in-place slicing shifted indices after an inner bracket was
    cleaned, so an enclosing bracket's range could reach past its closing bracket and
    swallow spaces that followed it.

    Args:
    - s (str): The input string.
    - max_chars (int): Maximum number of non-space characters between opening and closing brackets.

    Returns:
    - str: The modified string with spaces removed within qualifying brackets.
    """
    closing_to_opening = {')': '(', ']': '['}
    opening_brackets = set(closing_to_opening.values())

    open_stack = []          # (bracket character, index) of currently open brackets
    qualifying_ranges = []   # (start, end) index pairs, end exclusive, brackets excluded

    for i, char in enumerate(s):
        if char in opening_brackets:
            open_stack.append((char, i))
        elif char in closing_to_opening:
            if open_stack and open_stack[-1][0] == closing_to_opening[char]:
                _, open_pos = open_stack.pop()
                content = s[open_pos + 1:i]
                if len(content.replace(' ', '')) <= max_chars:
                    qualifying_ranges.append((open_pos + 1, i))
            # An unmatched closing bracket is deliberately ignored.

    # Collect the original indices of every space inside a qualifying range.
    doomed = set()
    for start, end in qualifying_ranges:
        doomed.update(k for k in range(start, end) if s[k] == ' ')

    return ''.join(char for k, char in enumerate(s) if k not in doomed)
424
+
425
+
426
def isotope_correct(text):
    """
    Applies a series of substitutions to a text to correct for isotope labeling and other specific replacements.

    The replacements are plain substring substitutions applied one after
    another in dictionary order, so earlier entries influence what later
    entries see. Several entries work as pairs: a sequence such as "H37Cl"
    is first wrapped in '*' markers so the isotope rule for "37Cl" cannot
    match inside it, and a later entry removes the markers again.

    Corrections relative to the previous table:
      * keys that were mis-decoded UTF-8 (mathematical italic M, Greek
        capital delta, mathematical italic alpha, the right single quote in
        "calc'd") are restored so they can match real text;
      * the marker-removal keys for H81Br / H79Br now match the markers that
        are actually inserted ("*H81*Br", "*H79*Br");
      * "NH4+)" is rewritten to "NH4]+" (was the typo "MH4]+").

    Parameters:
        text (str): The input text to be processed.

    Returns:
        str: The processed text with all substitutions applied.
    """
    # Dictionary of replacements for isotope corrections and other text cleanup.
    # Order matters: do not sort or regroup these entries.
    replacements = {
        "For":" ","[MALDI]":"","[MALDI-TOF]":"","detected":" ","page": " ", "of": " ", "\U0001D440": " ", "EI": " ", " . ": " ", ":": " ", "\u0394": " ",
        "\U0001D6FC": " ", " a ": " ", "M ": " ", " H ": " ", "ESI": " ", " Na ": " ", " K ": " ",
        " NH4 ": " ", "Obs.": " ", "obs": " ", "78.9183": "", "48Ti": "[48Ti]","54Fe":"[54Fe]",
        "46Ti": "[46Ti]", "47Ti": "[47Ti]", " 2H": "D", " [3H]": "[3H]",
        " 10B": "[10B]", "127I": "[127I]", "120Sn":"[120Sn]", "119Sn":"[119Sn]", "118Sn":"[118Sn]",
        "N23Na": "*N23*Na","O23Na": "*O23*Na", "F23Na": "*F23*Na", "H23Na": "*H23*Na", "23Na":"[23Na]","H28Si": "*H28*Si", "H11B": "*H11*B",
        "H13Co": "*H13*Co", "H13Cl": "*H13*Cl", "H18O": "*H18*O", "H218O": "*H218*O", "N18O": "*N18*O",
        "H35Cl": "*H35*Cl", "H37Cl": "*H37*Cl", "H10B":"*H10*B", "H19F": "*H19*F", "H81Br":"*H81*Br","H79Br":"*H79*Br","Br79": "[79Br]",
        " 79Br": "[79Br]", " 81Br": "[81Br]", "18O": "[18O]", "74Ge": "[74Ge]", "65Cu":"[65Cu]",
        "63Cu":"[63Cu]", "Br81": "[81Br]", " 35Cl": "[35Cl]", " 37Cl": "[37Cl]", " 11B": "[11B]",
        " 32S": "S", " 31P": "P", "35Cl":"[35Cl]", "80Se":"[80Se]", "37Cl":"[37Cl]", "28Si":"[28Si]",
        "13C":"[13C]", "[13C]l":"13Cl", "96Ru":"[96Ru]","79Br":"[79Br]", "81Br":"[81Br]", "11B":"[11B]", "10B":"[10B]",
        "[10B]r":"10Br", "[[":"[", "]]":"]", "*H13*Cl": "H13Cl", "*H18*O": "H18O", "*H218*O": "H218O",
        "*N18*O": "N18O", "*H13*Co": "H13Co", "*H37*Cl": "H37Cl", "*H35*Cl": "H35Cl","*H81*Br":"H81Br","*H79*Br":"H79Br",
        "*H28*Si": "H28Si", "*H10*B":"H10B", "*H23*Na": "H23Na", "*F23*Na": "F23Na", "*N23*Na": "N23Na","*O23*Na": "O23Na",
        "*H11*B":"H11B", "*H19*F": "H19F", "cacld": "", "calcd.": "calcd ", "calc\u2019d": "calcd ",
        "calcd gcm": " ", " is ": " ", "calcd": "calcd ", "calcd  ": "calcd ","++": "+","(M":"[M", ")+":"]+ ",
        "MALDI":"","Maldi":""," [13C]":"[13C]"," [127I]":"[127I]"," [12C":"C"," [37Cl]":"37Cl"," [35Cl]":"35Cl",
        "C ":"C","H":"H", " N":"N"," O":"O"," Na":"Na", " Br":"Br", "N ":"N"," Cl":"Cl", " F":"F"," S":"S"," P":"P"," B":"B","M]+H+]":"M+H]+","M]-H+]":"M-H]-",
        "MH+":"M+H]+ ","]-(":"]- ","]+)":"]+ ","]-)":"]- ","]2-)":"]2- ","]+C":"]+ C","[MM":"","=":"","[MeOH":" ","[MeCN":" ","m/z":" ","]+2 ":"]2+ ","]+1":"]+","M+ C":"M+C","+]":"]+","+calc":" calc",
        "Na)]":"Na]","+Na)":"+Na]",";":" ","+H)]":"+H]","+K)]":"+K]","+NH4)]":"+NH4]","+H)":"+H]","H+)":"H]+","Na+)":"Na]+","-calcd":"- calcd","[M-H] ":"[M-H]-","--":"-",
        "NH4+)":"NH4]+","M+)":"M]+","M]+)":"M]+","+)":"+","M- ":"M-","+.":"+","[MNa]+":"[M+Na]+","[MH]+":"[M+H]+",
        " M2+ ":" [M]2+ "," M3+ ":" [M]3+ "," M4+ ":" [M]4+ "," M5+ ":" [M]5+ "," M6+ ":" [M]6+ ",
        " M2- ": " [M]2- ", " M3- ": " [M]3- ", " M4- ": " [M]4- ", " M5- ": " [M]5- ", " M6- ": " [M]6- ","[M+H] ":"[M+H]+ ","[M+Na] ":"[M+Na]+ ","[M] ":"[M]+ ","]calcd":"] calcd","-.":"- ","M+1)":"M+1]+ ","+ꞏ":"+","]-calcd":"]- calcd",
        "[Methyl":" ","[MA":" ","[ME":" ","[MI":" ","[MO":" ","[MU":" ","[Ma":" ","[Mi":" ","[Mo":" ","[Mu":" ","[Mg":" ","[M+H].":"[M+H]+ ","[M+Na].":"[M+Na]+ ","].":"] ","[M+Na]+":"[M+Na]+ ","[M+H]+":"[M+H]+ ","[M]+Na]+":"[M+Na]+","[M]+H]+":"[M+H]+"
    }
    # NOTE(review): the visible source lists "calcd" and "calcd " back to back
    # with identical replacement text; the second is written here with two
    # spaces so it collapses the doubled space the first one creates —
    # confirm against the original file.

    # Apply each replacement in the dictionary to the text
    for original, replacement in replacements.items():
        text = text.replace(original, replacement)

    return text
470
+
471
+
472
def transform_expressions_in_text(text):
    """
    Normalises charge symbols and rewrites bracketed ion expressions as ``[expr]charge``.

    Step 1 replaces typographic plus/minus/dot symbols with ASCII '+', '-'
    or nothing (see ``symbol_replacements``).

    Step 2 rewrites every bracketed token whose content consists only of
    digits, ASCII letters and '-' (for example "(M-H)-" or "[M]+"):
    - the token is re-emitted inside square brackets;
    - a charge directly after the closing bracket ("+", "2+", "-") is kept;
    - if there is none, a trailing '-' inside the brackets is moved outside;
    - any ',' or ':' directly after the token is dropped.
    Brackets whose content contains '+', spaces or other characters are not
    matched by this step and are left as they are.

    Args:
    - text (str): The input text containing chemical expressions.

    Returns:
    - str: The text with all matching expressions transformed accordingly.
    """
    # Step 1: Replace specific symbols with corresponding charges.
    # The non-ASCII keys are written as escapes; in the scraped source they
    # appeared as mis-decoded UTF-8 and would never have matched real text.
    symbol_replacements = {
        '\u2295': '+',      # circled plus
        '\u2022+': '+',     # bullet followed by plus
        # NOTE(review): reconstructed as the private-use code point U+F02D
        # from its mis-decoded bytes -- confirm against the original file.
        '\uf02d': '+',
        # TODO(review): the source had one more key mapped to "+" that
        # survives only as an empty string (presumably a non-printing glyph).
        # It is omitted here; restore it from the original file.
        '+.': '+ ',
        '\u2022': '',       # bullet
        '\u00b7': '',       # middle dot
        '\u2219': '',       # bullet operator
        '\ua78f': '',       # Latin letter sinological dot
        '\u2013': '-',      # en dash
        '-': '-',
        '\u2212.': '- ',    # minus sign followed by '.'
        '\u2212': '-',      # minus sign
        '\u2014': '-',      # em dash
        '\u2015': '-',      # horizontal bar
        '\u02d7': '-',      # modifier letter minus sign
        # NOTE: shadowed by the plain '-' entry above (alternation is tried
        # in order), kept for parity with the original table.
        '-.': '- ',
    }

    # An empty key would add an empty alternative that matches at every
    # position and inserts its replacement between all characters.
    symbol_keys = [key for key in symbol_replacements if key]
    symbols_pattern = re.compile('|'.join(map(re.escape, symbol_keys)))
    text = symbols_pattern.sub(lambda match: symbol_replacements[match.group()], text)

    # Step 2: Find bracketed expressions with an optional charge outside.
    expression_pattern = re.compile(
        r'[\[(]'                      # Opening bracket [ or (
        r'(\d*M?\d*[a-zA-Z\d-]*)'     # Content: digits, letters and '-' only
        r'[])]'                       # Closing bracket ] or )
        r'(\d*\+|-)?'                 # Optional charge outside the brackets
        r'[,:]*'                      # Optional trailing characters
    )

    def replace_expression(match):
        expression_part = match.group(1)  # The main part of the expression
        charge_outside = match.group(2)   # The charge outside the brackets, if any

        # The content group cannot contain brackets or whitespace, so no
        # further cleaning of expression_part is required.
        if charge_outside:
            charge = charge_outside
        else:
            # Move a trailing sign from inside the brackets to the outside.
            charge_match = re.search(r'([+-])$', expression_part)
            if charge_match:
                charge = charge_match.group(1)
                expression_part = expression_part[:charge_match.start()]
            else:
                charge = ''

        return f'[{expression_part}]{charge}'

    # Step 3: Substitute all matching expressions in the text
    return expression_pattern.sub(replace_expression, text)
560
+
561
def transform_molecular_formula(formula):
    """
    Transforms a molecular formula / ion string to a standardized format.

    The steps run in a fixed order and later steps rely on the output of
    earlier ones:
      1. round brackets and colons are removed, "]+-" becomes "]+";
      2. spaces inside square brackets are removed and a '+' and/or '-' is
         appended after the bracket when that sign occurs inside it;
      3. "M" followed by an en dash becomes "M-", and "M +" becomes "M+";
      4. a bracketed word that is glued to surrounding text is set off with
         spaces;
      5. a string containing "M" but no '[' is wrapped in square brackets;
      6. "calcd for" and "found" are surrounded by spaces;
      7. doubled signs, commas and runs of whitespace are collapsed.

    The en-dash pattern in step 3 was mis-decoded in the scraped source and
    could not match; it is written here as the escape U+2013.

    Args:
        formula: The molecular formula string to transform.

    Returns:
        The transformed molecular formula string.
    """

    # Remove all round brackets and colons
    formula = formula.replace("(", "").replace(")", "").replace(":", "").replace("]+-", "]+")

    # Remove ALL spaces within brackets and move the + or - sign after the bracket (if any)
    formula = re.sub(r'\[(.*?)]', lambda m: '[' + m.group(1).replace(' ', '') + ']' + ('+' if '+' in m.group(1) else '') + ('-' if '-' in m.group(1) else ''), formula)

    # Replace "M" + en dash (U+2013), with optional whitespace between, by "M-"
    formula = re.sub('M\\s*\u2013', 'M-', formula)

    # Replace "M +" or "M+" with "M+"
    formula = re.sub(r'M\s*\+', 'M+', formula)

    # Ensure standardized ion is surrounded by one space, BUT NOT IF IT IS THE LAST THING
    formula = re.sub(r'([^ ])(\[\w+][+-]?)(?=\S)', r'\1 \2 ', formula)  # Include optional + or - in the ion group

    # Add brackets if "M" is present without brackets
    if "M" in formula and "[" not in formula:
        formula = "[" + formula + "]"

    # Add spaces around "calcd for", "found"
    formula = re.sub(r'(calcd\s*for|found)', r' \1 ', formula)

    # Collapse doubled plus signs, turn commas into spaces, squeeze whitespace
    formula = formula.replace("++", "+").replace("++", "+").replace(",", " ")
    formula = re.sub(r'\s+', ' ', formula)
    # NOTE: no comma can remain at this point, so the final replace(",", "+")
    # has no effect; it is kept for parity with the original sequence.
    formula = formula.replace("-+", "+").replace("]+-", "]+").replace("+]+", "]+ ").replace("++", "+").replace("--", "").replace(",", "+")

    return formula
600
+
601
+
602
# Configure logging
# NOTE(review): logging.basicConfig() is already called near the top of this
# file with a timestamped format. Per the logging documentation, basicConfig
# does nothing when the root logger already has handlers (unless force=True),
# so this second call presumably does not switch the format -- confirm which
# format is intended and drop the other call.
logging.basicConfig(level=logging.INFO, format='%(message)s')
604
+
605
+
606
def generate_error_dictionary(element_list, counts_range, special_cases=None):
    """
    Build a lookup from mass differences (rounded to 4 decimals) to descriptions.

    For every entry of ``element_list`` the monoisotopic mass is obtained via
    ``Formula(entry).monoisotopic_mass``; entries for which that fails are
    reported with ``print`` and skipped. An entry with more than one uppercase
    letter is treated as a group and registered for count 1 only
    ("1 OH-group"); any other entry is treated as an atom and registered for
    every count in ``counts_range`` ("1 H-atom", "2 H-atoms", ...).

    Each (entry, count) pair is registered under three keys: the plain mass,
    the mass plus ``count`` electron masses, and the mass minus ``count``
    electron masses. When a key already exists, the new description is
    appended after ", " unless it already occurs in the stored text.

    Parameters:
    - element_list (list): Element symbols or groups (e.g., ['H', 'O', 'N', 'OH']).
    - counts_range (range): Atom counts to cover for atoms (e.g., range(1, 11)).
    - special_cases (dict): Optional mapping of mass (anything ``float()``
      accepts) to description, merged in last with the same append rule.

    Returns:
    - dict: Rounded mass differences (float) mapped to description strings.
    """
    electron_mass = 0.0005486  # Atomic mass units (amu)
    error_dict = {}

    def register(mass_value, label):
        """Store ``label`` under the rounded mass, appending to existing text."""
        key = round(mass_value, 4)
        if key not in error_dict:
            error_dict[key] = label
        elif label not in error_dict[key]:
            error_dict[key] += f", {label}"

    for element in element_list:
        try:
            atomic_mass = Formula(element).monoisotopic_mass
        except Exception as e:
            print(f"Error processing element {element}: {e}")
            continue  # Skip this element if there's an error

        # More than one capital letter marks a group rather than a single atom.
        is_group = sum(1 for c in element if c.isupper()) > 1
        counts = (1,) if is_group else counts_range

        for count in counts:
            if is_group:
                noun = "group"
            elif count == 1:
                noun = "atom"
            else:
                noun = "atoms"
            description = f"{count} {element}-{noun}"

            neutral_mass = atomic_mass * count
            electron_shift = electron_mass * count

            register(neutral_mass, description)                   # neutral
            register(neutral_mass + electron_shift, description)  # E+
            register(neutral_mass - electron_shift, description)  # E-

    if special_cases:
        for mass, desc in special_cases.items():
            register(float(mass), desc)

    return error_dict
710
+
711
+
712
# Hand-curated mass differences that point to one specific, recognisable mistake.
# Keys are the mass difference written as a decimal string (an explicit sign is
# allowed); generate_error_dictionary converts each key with float() and rounds it
# to four decimals before merging it into the generated table.
# Entries are grouped by description; insertion order matches the flat table this
# replaces, which matters because lookups return the first matching entry.
special_errors = {
    mass: hint
    for hint, masses in (
        ("Electron mass error", ('0.0005', '0.0006')),
        ("Nominal mass error (H=1.0000)?",
         ('0.0073', '0.0072', '0.0071', '0.0070', '1.0005', '1.0006')),
        ("Nominal mass error (Na=23.0000)?",
         ('0.0102', '0.0103', '0.0107', '0.0108')),
        ('1 H-atom', ('1.0077', '1.0076', '1.0075', '1.0083')),
        ('1 Na-atom', ('+22.9897', '+22.9902')),
        ("Nominal mass error [M]+1.0000 (not [M+Na]+)", ('21.9892', '21.9893')),
        ('Specify measured B-isotope(s)', ('0.9964', '0.9963')),
        ('Specify measured B-isotopes', ('1.9927', '1.9928')),
        ('Specify measured Br-isotope(s)', ('1.9979', '1.9980')),
        ("Exchange 1 H- with 1 F-atom", ('+17.9906',)),
        ("Exchange 1 F- with 1 H-atom", ('-17.9906',)),
        ("Exchange 1 H- with 1 O-atom", ('+14.9871',)),
        ("Exchange 1 O- with 1 H-atom", ('-14.9871',)),
        ("Exchange 1 H- with 1 Br-atom", ('+77.9105',)),
        ("Exchange 1 Br- with 1 H-atom", ('-77.9105',)),
        ('Mass calcd for [M+1] (1x 13C)',
         ('1.0039', '1.0038', '1.0034', '1.0033', '1.0032')),
        ('Mass calcd for [M+2] (2x 13C)', ('2.0064',)),
    )
    for mass in masses
}
754
+
755
# Build the module-level lookup table of known mass differences.
# `elements` holds the element symbols followed by common fragments/small
# molecules; every entry is combined with each multiplicity in `atom_counts`.
elements = (
    "H He Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar K Ca "
    "Sc Ti V Cr Mn Fe Co Ni Cu Zn "
    "Ga Ge As Se Br Kr Rb Sr Y Zr "
    "Nb Mo Tc Ru Rh Pd Ag Cd In Sn "
    "Sb Te I Xe Cs Ba La Ce Pr Nd "
    "Pm Sm Eu Gd Tb Dy Ho Er Tm Yb "
    "Lu Hf Ta W Re Os Ir Pt Au Hg "
    "Tl Pb Bi"
).split() + (
    "D CH CH2 CH3 CH4 NH NH2 NH3 NH4 "
    "OH H2O H3O NO NO2 OCH3 CF3 C2H5 C2H6 HF HCl "
    "HBr HS HI C3H8 C4H10"
).split()

atom_counts = range(1, 11)  # multiplicities 1 through 10
error_dictionary = generate_error_dictionary(elements, atom_counts, special_errors)
773
+
774
def categorize_error(error_value, known_errors, tolerance=0.0001):
    """
    Translate a mass difference into a human-readable hint.

    Parameters:
    error_value (float): Signed difference between the reported and the recalculated mass.
    known_errors (dict): Maps a mass difference (float) to its description. Keys may be
        negative; a negative key only applies to a negative ``error_value``.
    tolerance (float): Maximum distance between ``error_value`` and a key for a match.

    Returns:
    str: An empty string when the difference is within ``tolerance`` of zero; otherwise the
        message for the first matching entry, or the signed difference formatted with four
        decimals (e.g. "+0.5000") when nothing matches.
    """

    def _describe(description):
        # Entries longer than 13 characters (special cases or several merged
        # descriptions) are shown verbatim.
        if len(description) > 13:
            return description

        # Short entries are expected to look like "<count> <element>-atom(s)".
        parts = description.split()
        if len(parts) != 2:
            return description

        count_str, element = parts
        try:
            count = int(count_str)
        except ValueError:
            return description

        # The sign of the difference decides whether atoms are missing or surplus.
        if error_value > 0:
            return f"Add {count} {element} to formula"
        return f"Remove {count} {element} from formula"

    # A difference that is effectively zero needs no explanation.
    if abs(error_value) <= tolerance:
        return ""

    magnitude = abs(error_value)

    def _is_close(reference):
        # Accept the reference itself or the reference plus 0.0001.
        return (abs(magnitude - reference) <= tolerance or
                abs(magnitude - (reference + 0.0001)) <= tolerance)

    # Sign-specific entries first: a negative key describes a negative difference.
    # Comparing it against the magnitude alone (as the general loop below does)
    # can never match, so the positive twin would win for both signs.
    if error_value < 0:
        for known_mass, description in known_errors.items():
            if known_mass < 0 and _is_close(abs(known_mass)):
                return _describe(description)

    # General case: compare the magnitude of the difference with every key.
    for known_mass, description in known_errors.items():
        if _is_close(known_mass):
            return _describe(description)

    # No match: report the signed difference itself.
    return f"{error_value:+.4f}"
821
+
822
+
823
def hrms_cleanup(result, error_dictionary):
    """
    Parse extracted HRMS snippets into table rows and diagnose mass discrepancies.

    For every snippet the function drops tokens that are shorter than 5 characters
    and do not contain a capital 'M', then extracts the ion notation (and removes it
    from the line), the molecular formula and the first two floats with four decimals
    (taken as the "calcd" and "found" masses). The monoisotopic mass is recalculated
    with the molmass library and the difference between the reported calcd mass and
    the recalculated mass is turned into an error description.

    Parameters:
    - result (list of str): The list containing HRMS data strings.
    - error_dictionary (dict): The autogenerated error dictionary with mass differences
      and descriptions; passed on to categorize_error.

    Returns:
    - list of list: One 8-element row per snippet:
      [formula, ion, calcd mass, found mass, recalcd mass, '', '', error].
      Positions 5 and 6 are not filled in here. Snippets with neither a formula nor
      an ion are skipped.
    """

    # NOTE(review): the block nesting in this function was reconstructed from a
    # rendering that had lost its indentation - confirm against the repository copy,
    # especially the extent of the `if is_float(...)` block further down.

    # Rows collected for the result table
    parsed_results = []

    # Ion notation such as [M+H]+ or [2M+Na]+ (optional digits before 'M'),
    # including whatever non-space characters directly follow the closing bracket
    ion_pattern = re.compile(r'\[\d*M[^]]*]\S*')

    # Formula: 'C<digits>' followed by H, F or D with digits, then further element
    # symbols or bracketed isotope groups, and an optional trailing charge sign
    formula_pattern = re.compile(r'C\d+(?:H\d+|F\d+|D\d+)(?:[A-Z][a-z]?\d*|\[\d+[A-Z][a-z]*\d*])*[+-]?')

    # Pattern for floats with exactly 4 digits after decimal point
    float_pattern = re.compile(r'\d+\.\d{4}')

    # Process each line in the result list
    for line in result:
        # Remove words shorter than 5 characters that do not contain a capital 'M'
        words = line.split()
        words_filtered = [word for word in words if len(word) >= 5 or ('M' in word)]
        line = ' '.join(words_filtered)

        # Initialize a row with 8 empty elements (the last column holds the error)
        row = [''] * 8

        # Extract the ion notation and its charge
        ion_match = ion_pattern.search(line)
        ion_charge = ''
        if ion_match:
            ion = ion_match.group(0)
            row[1] = ion.strip()
            # Extract the charge from the ion notation if present (e.g., ]+, ]-, ]2+).
            # The group is optional, so group(1) is None when the ion has no charge suffix.
            ion_charge_match = re.search(r'(\d*[+-])?$', ion)
            if ion_charge_match:
                ion_charge = ion_charge_match.group(1)
            # Remove the ion notation from the line
            line = line.replace(ion, '')
        else:
            row[1] = ''

        # Now proceed to extract the formula, calcd mass, and found mass from the modified line

        # Extract the formula
        formula_match = formula_pattern.search(line)

        if formula_match:
            formula = formula_match.group(0).strip()
            # If the formula ends with ion_charge, remove ion_charge from formula
            if ion_charge and formula.endswith(ion_charge):
                formula = formula[:-len(ion_charge)].strip()
            # Check if there's a charge present in the formula
            charge_match = re.search(r'([+-]\d*)$', formula)
            if charge_match:
                charge = charge_match.group(1)
                formula_no_charge = formula.replace(charge, "")
            else:
                # Fall back to the ion's charge, or to a single positive charge
                charge = ion_charge if ion_charge else '+'
                formula_no_charge = formula

            # Enclose the formula in square brackets before recalculating the mass

            formula_in_brackets = f'[{formula_no_charge}]{charge}'
            # Undo the placeholder substitutions that analyze_content applies before
            # searching ("[13C]" -> "H1HeXe", "CF" -> "C1F", "HN" -> "H1N")
            formula_in_brackets = formula_in_brackets.replace("H1HeXe", "[13C]")
            formula_in_brackets = formula_in_brackets.replace("C1F", "CF")
            formula_in_brackets = formula_in_brackets.replace("H1N", "HN")
            row[0] = formula_in_brackets

            # Recalculate the monoisotopic mass using molmass while keeping isotopic notation intact
            try:
                recalculated_mass = Formula(formula_in_brackets).monoisotopic_mass
                if ion_charge:
                    if ion_charge in ("+", "-"):
                        charge_number = 1
                    else:
                        charge_number = int(ion_charge[:-1])  # Extract the numeric part of the charge
                    # Divide by the charge number of the ion (e.g. 2 for "2+")
                    recalculated_mass /= abs(charge_number)

                row[4] = f'{recalculated_mass:.4f}'  # Store the monoisotopic mass with 4 decimal precision
            except Exception as e:
                row[4] = 'Error'  # Handle the case where the formula is invalid for molmass
        else:
            row[0] = ''
            row[4] = ''

        # Extract all floats with exactly 4 decimal places
        floats_with_4_decimals = float_pattern.findall(line)

        # Extract the calcd mass - first occurring float with 4 decimal places
        if floats_with_4_decimals:
            calcd_mass = floats_with_4_decimals[0]
            row[2] = calcd_mass.strip()
        else:
            row[2] = ''

        # Extract the found mass - second float with 4 decimal places, if it exists
        if len(floats_with_4_decimals) >= 2:
            found_mass = floats_with_4_decimals[1]
            row[3] = found_mass.strip()
        else:
            row[3] = ''

        # Calculate the error between the calculated mass and the recalculated mass
        if row[2] and row[4] and row[2] != 'Error' and row[4] != 'Error':
            try:
                # Positive when the reported calcd mass is larger than the recalculated one
                error = float(row[2]) - float(row[4])
                # Categorize the error based on the error value
                error_description = categorize_error(error, error_dictionary)
                # Look for other explanations only if categorize_error found none
                # (it returned the bare difference) or the difference is exactly zero.
                # NOTE(review): everything down to "Molecular weight + 1.0000" is assumed
                # to belong to this block - confirm.
                if is_float(error_description) or error==0:

                    # NOTE(review): float(row[3]) raises ValueError when no found mass was
                    # extracted; the except below then reports 'Error' - confirm intended.
                    if differ_in_single_digit_except_last_two(float(row[2]), float(row[3])):
                        error_description = "Typo (Calcd,Found)"

                    if differ_in_single_digit_except_last_two(float(row[2]), float(row[4])):
                        error_description = "Typo (Calcd,Recalcd)"

                    if have_swapped_adjacent_digits(float(row[2]), float(row[3])):
                        error_description = "Transposed digits (Calcd,Found)"

                    if have_swapped_adjacent_digits(float(row[2]), float(row[4])):
                        error_description = "Transposed digits (Calcd,Recalcd)"

                    # A difference of about -0.0011 on an anion is reported as a mass
                    # that was calculated for the cation
                    if error_description in ("-0.0010", "-0.0011", "-0.0012") and ion_charge == "-":
                        error_description = "Mass was calculated for cation"

                    if error_description in ("-0.0010", "-0.0011", "-0.0012") and "M-" in row[1]:
                        error_description = "Mass was calculated for cation"

                    # Compare the reported mass with the value returned by
                    # calculate_molecular_weight (defined elsewhere) for the formula
                    # as written, and with that value plus 1 or plus 23
                    mw_plus = round(calculate_molecular_weight(row[0]), 4)
                    if float(row[2]) == mw_plus:
                        error_description = "Molecular weight error"

                    mw_plus_plus1 = round(mw_plus + 1, 4)
                    if float(row[2]) == mw_plus_plus1:
                        error_description = "Molecular weight error"

                    mw_plus_plus23 = round(mw_plus + 23, 4)
                    if float(row[2]) == mw_plus_plus23:
                        error_description = "Molecular weight error"

                    # Same comparisons with every '+' removed from the formula string
                    formula_neutral = row[0].replace("+", "")
                    mw_neutral = round(calculate_molecular_weight(formula_neutral), 4)
                    if mw_neutral == float(row[2]):
                        error_description = "Molecular weight error (neutral)"

                    mw_neutral_plus1 = round(mw_neutral + 1, 4)
                    if mw_neutral_plus1 == float(row[2]):
                        error_description = "Molecular weight error (neutral+1)"

                    mw_neutral_plus23 = round(mw_neutral + 23, 4)
                    if mw_neutral_plus23 == float(row[2]):
                        error_description = "Molecular weight error (neutral+23)"

                    if "Na" in row[0]:
                        # Formula without its "Na", plus a nominal 23
                        formula_minus_sodium = row[0].replace("Na", "")
                        mw1 = round(calculate_molecular_weight(formula_minus_sodium), 4) + 23
                        if mw1 == float(row[2]):
                            error_description = "Molecular weight + 23.0000"
                    else:
                        # Formula stripped of brackets and charge signs, with "Na" appended
                        formula_plus_sodium = row[0].replace("[", "").replace("]", "").replace("+", "").replace("-", "")
                        formula_plus_sodium = formula_plus_sodium+"Na"
                        mw_plus_sodium = round(calculate_molecular_weight(formula_plus_sodium), 4)
                        if mw_plus_sodium == float(row[2]):
                            error_description = "Molecular weight error (Formula+Na)"

                    # Formula passed through decrease_element_count(..., 'H') (defined
                    # elsewhere), plus a nominal 1
                    # NOTE(review): assumed to run for both branches above - confirm.
                    formula_minus_h = row[0].replace("[", "").replace("]", "").replace("+", "").replace("-", "")
                    formula_minus_h = decrease_element_count(formula_minus_h, 'H')
                    mw2 = round(calculate_molecular_weight(formula_minus_h), 4) + 1
                    if mw2 == float(row[2]):
                        error_description = "Molecular weight + 1.0000"

                row[7] = error_description  # Replace the error value with the error description or keep the difference

            except ValueError:
                row[7] = 'Error'
        else:
            row[7] = 'Error'

        # Ion and both masses were found, but no formula
        if row[1] and row[2] and row[3] and not row[0]:
            row[7] = 'No formula found'

        # Skip the row if both row[0] and row[1] are empty
        if not row[0] and not row[1]:
            continue  # Do not append this row to parsed_results

        # Append the row to the parsed_results list
        parsed_results.append(row)

    return parsed_results
1030
+
1031
+
1032
def _ppm_deviation(reference_mass, found_mass_float):
    """Return |found - reference| / reference in ppm as a one-decimal string, or '' if not computable."""
    if not reference_mass or found_mass_float is None:
        return ''
    try:
        reference_float = float(reference_mass)
        deviation = abs((found_mass_float - reference_float) / reference_float) * 1e6  # ppm
    except (ValueError, ZeroDivisionError):
        # Non-numeric placeholder such as 'Error', or a reference mass of 0.0000
        return ''
    return f"{deviation:.1f}"


def calc_dev_calcd_and_recalcd(cleaned_results):
    """
    Fill in the ppm deviations of the found mass from the calculated and recalculated masses.

    For every row the absolute relative deviation, in ppm and formatted with one decimal,
    is written to row[5] ('Dev (Calcd)', based on row[2]) and row[6] ('Dev (Recalcd)',
    based on row[4]). A field is left empty when the found mass (row[3]) or the reference
    mass is missing, is not a number, or when the reference mass is zero.

    Parameters:
    cleaned_results (list of list): The list containing extracted data; modified in place.

    Returns:
    list of list: The same cleaned_results list with 'Dev (Calcd)' and 'Dev (Recalcd)' filled.
    """
    for row in cleaned_results:
        found_mass = row[3]

        # The found mass is shared by both deviations, so convert it only once
        found_mass_float = None
        if found_mass:
            try:
                found_mass_float = float(found_mass)
            except ValueError:
                found_mass_float = None

        row[5] = _ppm_deviation(row[2], found_mass_float)
        row[6] = _ppm_deviation(row[4], found_mass_float)
    return cleaned_results
1078
+
1079
+
1080
def print_aligned_table(cleaned_results):
    """
    Displays the cleaned_results as an HTML table in Streamlit.

    Deviations (columns 5 and 6) greater than 10 ppm are shown in red; entries of the
    error column (7) that are not plain numbers are shown in purple. Cell text is
    HTML-escaped because it is derived from user-supplied text and the table is
    rendered with unsafe_allow_html=True.

    Parameters:
    cleaned_results (list of list): Rows as produced by hrms_cleanup /
        calc_dev_calcd_and_recalcd, one value per header column.
    """
    from html import escape  # local import: only this function needs it

    headers = ['Formula', 'Ion', 'Calcd Mass', 'Found Mass', 'Recalcd Mass',
               'Dev (Calcd)', 'Dev (Recalcd)', 'Error']
    right_aligned = (2, 3, 4, 5, 6, 7)  # masses, deviations and the error column
    deviation_columns = (5, 6)
    error_column = 7

    # Collect the fragments and join once instead of growing a string piecewise.
    # Inline CSS provides borders, padding and a monospaced font.
    fragments = ['<table style="border-collapse: collapse; width: 100%; font-family: monospace;">']

    # Header row
    fragments.append('<tr>')
    fragments.extend(
        f'<th style="border: 1px solid black; padding: 4px; text-align: left;">{header}</th>'
        for header in headers
    )
    fragments.append('</tr>')

    # Data rows
    for row in cleaned_results:
        fragments.append('<tr>')
        for i, cell in enumerate(row):
            align = 'right' if i in right_aligned else 'left'
            style = f"text-align: {align}; border: 1px solid black; padding: 4px;"
            cell_str = str(cell)

            if i in deviation_columns:
                # Red for deviations above 10 ppm; empty or non-numeric cells stay plain
                try:
                    if float(cell) > 10:
                        style += " color: red;"
                except (ValueError, TypeError):
                    pass
            elif i == error_column:
                # Purple for anything that is not a bare (signed) number
                if isinstance(cell, str) and not re.match(r'^[+-]?\d*\.?\d+$', cell_str):
                    style += " color: purple;"

            fragments.append(f'<td style="{style}">{escape(cell_str)}</td>')
        fragments.append('</tr>')

    fragments.append('</table>')

    # Display the table in Streamlit
    st.markdown(''.join(fragments), unsafe_allow_html=True)
1125
+
1126
+
1127
def search_calcd_with_floats(text: str) -> List[str]:
    """
    Collect snippets that start around 'calcd' and end at the second following float.

    For each case-insensitive occurrence of 'calcd', the 100 characters starting at
    that occurrence must contain at least two floats with four decimal places. The
    snippet begins up to 25 characters before 'calcd' when that leading window holds
    no such float, otherwise at 'calcd' itself, and ends right after the second float.
    Snippets of 100 characters or more are discarded.

    Args:
        text (str): Input text to search

    Returns:
        List[str]: List of matching strings
    """
    four_decimal_float = re.compile(r'\d+\.\d{4}')
    snippets = []

    for hit in re.finditer('calcd', text, re.IGNORECASE):
        anchor = hit.start()

        # End offsets (relative to 'calcd') of the floats in the look-ahead window
        float_ends = [m.end() for m in four_decimal_float.finditer(text[anchor:anchor + 100])]
        if len(float_ends) < 2:
            continue

        # Include the leading context only when it does not already contain a float
        window_start = max(0, anchor - 25)
        if four_decimal_float.search(text[window_start:anchor]) is None:
            begin = window_start
        else:
            begin = anchor

        stop = anchor + float_ends[1]
        if stop - begin < 100:
            snippets.append(text[begin:stop])

    return snippets
1173
+
1174
+
1175
def search_hrms_with_floats(text: str) -> List[str]:
    """
    Collect snippets that start at 'HRMS' and contain two four-decimal floats.

    For each (case-sensitive) occurrence of 'HRMS', the following 100 characters must
    contain at least two floats with four decimal places. The snippet ends right after
    the second float when 'calcd' (any case) appears within the next 25 characters;
    otherwise up to 25 further characters are included. A snippet never extends past
    the end of the text or beyond 100 characters from 'HRMS', and is stripped of
    surrounding whitespace.

    Args:
        text (str): Input text to search

    Returns:
        List[str]: List of matching strings
    """
    four_decimal_float = re.compile(r'\d+\.\d{4}')
    snippets = []

    for hit in re.finditer('HRMS', text):
        start = hit.start()
        window = text[start:start + 100]

        float_ends = [m.end() for m in four_decimal_float.finditer(window)]
        if len(float_ends) < 2:
            continue
        second_end = float_ends[1]

        # A nearby 'calcd' most likely opens the next entry, so stop before it
        trailing = window[second_end:second_end + 25]
        extra = 0 if 'calcd' in trailing.lower() else 25

        stop = min(len(text), start + second_end + extra, start + 100)
        snippets.append(text[start:stop].strip())

    return snippets
1215
+
1216
def process_replacements(text: str) -> str:
    """
    Perform all necessary string replacements on the text.

    Every key of the table below is used as a regular-expression pattern (so '.'
    matches any character) and is substituted case-insensitively, in the order the
    entries are listed. Afterwards runs of whitespace are collapsed to single spaces
    and the result is stripped.

    Args:
        text (str): Text to normalize

    Returns:
        str: The text with all replacements applied
    """
    # NOTE(review): several non-ASCII keys look mis-encoded in this copy (two keys
    # appear identical, one appears empty) - confirm against the repository file.
    replacements = {
        r' is ':' ',
        # Map the various mass-spectrometry labels to 'HRMS'
        r'LCMS':'HRMS',
        r'HRESIMS':"HRMS",
        r'HRESI': 'HRMS',
        r'HR-MS': 'HRMS',
        r'ESI-MS': ' HRMS',
        r'‐': '-',
        r'β€’':r'-',
        r'MHz':'',
        r'MeOD':'',
        # With IGNORECASE this lowercases every 'cal', so the patterns below also
        # match capitalized spellings
        r'Cal':"cal",
        # Map the spellings of "calculated" to 'calcd '
        r'calculated': 'calcd ',
        r'calcd.': 'calcd ',
        r'calc. ': 'calcd ',
        r'calc ': 'calcd ',
        r'chemical':'',
        r'formula':'',
        r' βŠ•': "+",
        r'β€’': "",
        r'':'',
        r'βˆ™':'',
        r'●':'',
        r'οΌ‹':'+',
        r'Observed':' ',
        r'observed':' ',

    }

    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    # NOTE(review): assumed to run once after the loop; the indentation of this line
    # could not be recovered - confirm.
    text = ' '.join(text.split()).strip()
    return text
1253
+
1254
+
1255
+
1256
def main():
    """
    Build the Streamlit page and run the analysis on the user's input.

    The page offers two tabs: one for uploading a PDF, whose text is extracted with
    extract_text_from_pdf, and one for pasting text. In both cases the resulting
    text is handed to analyze_content once the corresponding button is pressed.
    """
    # "\U0001F9EA" is the test-tube emoji, written as an escape so the icon does not
    # depend on how this file is encoded or decoded.
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="\U0001F9EA", layout="wide")

    st.title("Chemistry Text Analyzer")
    st.write("""
    This app analyzes chemistry text for common errors, inconsistencies, and formatting issues.
    Upload a PDF file or paste your text in the box below to analyze it.
    """)

    # Create tabs for different input methods
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")

        if analyze_pdf:
            if uploaded_file is None:
                # Mirror the text tab: tell the user why nothing happened
                st.warning("Please upload a PDF file to analyze.")
            else:
                with st.spinner("Extracting text from PDF..."):
                    text_content = extract_text_from_pdf(uploaded_file)

                if text_content:
                    st.success(f"Successfully extracted text from {uploaded_file.name}")
                    st.write("---")
                    analyze_content(text_content)
                else:
                    st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        # Text input area
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")

        if analyze_text:
            if not text_input:
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Replace newlines with spaces to match the original behavior
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)
1296
+
1297
+
1298
+ import streamlit as st
1299
+ import re
1300
+
1301
+
1302
def analyze_content(text_content):
    """
    Normalize raw text, extract the HRMS entries and display the result table.

    The text goes through a fixed sequence of clean-up steps (the order matters,
    later steps rely on earlier ones), is searched for 'HRMS ...' and 'calcd ...'
    snippets, and the snippets are parsed by hrms_cleanup. The resulting rows get
    their ppm deviations, are de-duplicated and rendered with print_aligned_table.

    Parameters:
    text_content (str): Text taken from the PDF or from the text box.
    """
    # NOTE(review): statement nesting (loop bodies, the final `if`) was reconstructed
    # from a rendering that had lost its indentation, and the non-ASCII literals
    # below look mis-encoded in this copy - confirm against the repository file.
    text_content = remove_specific_lines_from_string(text_content)
    text_content = re.sub(r'\s+', ' ', text_content).strip()  # Replace multiple spaces with a single space
    text_content = process_replacements(text_content)
    text_content = replace_comma_with_decimal(text_content)
    text_content = adjust_space_around_decimal(text_content)
    text_content = fix_floats(text_content)
    text_content = remove_page_numbers(text_content)
    # Rewrite "[<formula>, M..." so that the ion gets its own opening bracket.
    # NOTE(review): \1 is the whole captured span (formula, comma and ion), not only
    # the formula (that is \2) - confirm this is intended.
    text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
    # Force upper-case 'C' and 'H' in C<n>H<n> fragments
    text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
                          flags=re.IGNORECASE)
    text_content = re.sub(r'(c)(\d+)(H)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
                          flags=re.IGNORECASE)
    # 'C<n>HD' -> 'C<n>H1D'
    text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
    # Remove spaces inside "C n H n N n" ...
    text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b',
                          lambda
                              m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
                          text_content)

    # ... and inside "C n H n O n"
    text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b',
                          lambda
                              m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
                          text_content)
    # Letter 'o' in place of the digit zero after 'C2' / 'C1'
    text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
    # Move boron behind hydrogen: B<n>H<m> -> H<m>B<n>
    text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
    text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ").replace('–', '-').replace(',',
                                                                                                                    " ")
    text_content = remove_spaces_within_brackets(text_content)
    # Remove nested brackets from [(M+H]]+ etc.
    text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
    text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
    text_content = text_content.replace(' [[', '[').replace(']]', ']')

    # Literal replacements: two rows of digit look-alikes mapped to plain digits, two
    # letter look-alikes mapped to 'C' and 'H', then element symbols with their
    # surrounding spaces removed.
    replacements = {
        "₁": "1", "β‚‚": "2", "₃": "3", "β‚„": "4", "β‚…": "5",
        "₆": "6", "₇": "7", "β‚ˆ": "8", "₉": "9", "β‚€": "0",
        "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁡": "5",
        "⁢": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁰": "0",
        "Б": "C", "Н": "H",
        "C ": "C", " H ": "H", " F ": "F", " N ": "N", " Cl ": "Cl",
        " Br ": "Br", " O ": "O", " I ": "I", " P ": "P", " B ": "B",
        " S ": "S", " NO ": "NO", " Na ": "Na", " SNa ": "SNa", " NNa ": "NNa",
        " + ": "+"
    }

    # Apply replacements and additional processing steps.
    for original, replacement in replacements.items():
        text_content = text_content.replace(original, replacement)
    text_content = remove_spaces_in_formula(text_content)
    text_content = text_content.replace('#', '')
    # Put a space in front of every 'C<digits>' so formulas start a new word
    text_content = re.sub(r'(C\d+)', r' \1', text_content)
    text_content = transform_expressions_in_text(text_content)
    text_content = isotope_correct(text_content)
    text_content = protect_floats(text_content)
    # Placeholders that hrms_cleanup converts back before recalculating the mass
    text_content = text_content.replace("[13C]", "H1HeXe")
    text_content = text_content.replace("CF", "C1F")
    text_content = text_content.replace("HN", "H1N")
    # First pass: snippets that start with 'HRMS'
    results1 = search_hrms_with_floats(text_content)
    # Remove those snippets so the 'calcd' pass does not find them again
    modified_text = text_content
    for match in results1:
        modified_text = modified_text.replace(match, '')
    # Clean up any extra spaces
    modified_text = re.sub(r'\s+', ' ', modified_text).strip()
    text_content = modified_text
    # Second pass: remaining snippets around 'calcd'
    results2 = search_calcd_with_floats(text_content)

    results = results1 + results2
    cleaned_results = hrms_cleanup(results, error_dictionary)

    cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
    cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)

    # Remove duplicate sublists
    cleaned_results_new = []
    for sublist in cleaned_results:
        if sublist not in cleaned_results_new:
            cleaned_results_new.append(sublist)
    cleaned_results = cleaned_results_new

    # Count the total number of measurements (not used further in this function)
    num_row = len(cleaned_results)

    if cleaned_results:
        st.write(" ")
        # Use the Streamlit version of print_aligned_table to display the table
        print_aligned_table(cleaned_results)
        if check_conditions(cleaned_results):
            st.success("Awesome! No mistakes!")
1396
+
1397
+
1398
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()