Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1333,91 +1333,89 @@ def main():
|
|
| 1333 |
analyze_content(text_content)
|
| 1334 |
|
| 1335 |
|
| 1336 |
-
import streamlit as st
|
| 1337 |
-
import re
|
| 1338 |
-
|
| 1339 |
def analyze_content(text_content):
|
| 1340 |
-
st.set_page_config(layout="wide") # Ensures better layout control
|
| 1341 |
-
|
| 1342 |
-
# Add explicit CSS to ensure scrollbar is visible
|
| 1343 |
-
st.markdown(
|
| 1344 |
-
"""
|
| 1345 |
-
<style>
|
| 1346 |
-
div.block-container {
|
| 1347 |
-
overflow-y: auto;
|
| 1348 |
-
}
|
| 1349 |
-
iframe {
|
| 1350 |
-
overflow: visible !important;
|
| 1351 |
-
}
|
| 1352 |
-
</style>
|
| 1353 |
-
""", unsafe_allow_html=True
|
| 1354 |
-
)
|
| 1355 |
-
|
| 1356 |
text_content = remove_specific_lines_from_string(text_content)
|
| 1357 |
-
|
|
|
|
| 1358 |
text_content = process_replacements(text_content)
|
| 1359 |
text_content = replace_comma_with_decimal(text_content)
|
| 1360 |
text_content = adjust_space_around_decimal(text_content)
|
| 1361 |
text_content = fix_floats(text_content)
|
|
|
|
| 1362 |
text_content = remove_page_numbers(text_content)
|
| 1363 |
-
|
| 1364 |
text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
|
| 1365 |
-
text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
|
| 1366 |
-
|
|
|
|
|
|
|
| 1367 |
text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
|
| 1368 |
-
|
| 1369 |
text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b',
|
| 1370 |
-
lambda
|
|
|
|
| 1371 |
text_content)
|
| 1372 |
|
| 1373 |
text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b',
|
| 1374 |
-
lambda
|
|
|
|
| 1375 |
text_content)
|
| 1376 |
-
|
| 1377 |
text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
|
| 1378 |
text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
|
| 1379 |
-
text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ").replace('–', '-').replace(',',
|
|
|
|
| 1380 |
text_content = remove_spaces_within_brackets(text_content)
|
| 1381 |
-
|
| 1382 |
text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
|
| 1383 |
text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
|
| 1384 |
text_content = text_content.replace(' [[', '[').replace(']]', ']')
|
| 1385 |
|
| 1386 |
-
replacements = {
|
| 1387 |
-
|
| 1388 |
-
|
| 1389 |
-
|
| 1390 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1391 |
|
|
|
|
| 1392 |
for original, replacement in replacements.items():
|
| 1393 |
text_content = text_content.replace(original, replacement)
|
| 1394 |
-
|
| 1395 |
-
text_content =
|
| 1396 |
text_content = re.sub(r'(C\d+)', r' \1', text_content)
|
| 1397 |
text_content = transform_expressions_in_text(text_content)
|
| 1398 |
text_content = isotope_correct(text_content)
|
| 1399 |
text_content = protect_floats(text_content)
|
| 1400 |
-
text_content = text_content.replace("[13C]", "H1HeXe")
|
| 1401 |
-
|
|
|
|
|
|
|
| 1402 |
results1 = search_hrms_with_floats(text_content)
|
| 1403 |
modified_text = text_content
|
| 1404 |
for match in results1:
|
| 1405 |
modified_text = modified_text.replace(match, '')
|
|
|
|
| 1406 |
modified_text = re.sub(r'\s+', ' ', modified_text).strip()
|
| 1407 |
text_content = modified_text
|
| 1408 |
-
|
| 1409 |
results2 = search_calcd_with_floats(text_content)
|
|
|
|
| 1410 |
results = results1 + results2
|
| 1411 |
cleaned_results = hrms_cleanup(results, error_dictionary)
|
|
|
|
| 1412 |
cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
|
| 1413 |
cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)
|
| 1414 |
|
|
|
|
| 1415 |
cleaned_results_new = []
|
| 1416 |
for sublist in cleaned_results:
|
| 1417 |
if sublist not in cleaned_results_new:
|
| 1418 |
cleaned_results_new.append(sublist)
|
| 1419 |
cleaned_results = cleaned_results_new
|
| 1420 |
|
|
|
|
| 1421 |
num_row = len(cleaned_results)
|
| 1422 |
|
| 1423 |
if cleaned_results:
|
|
@@ -1429,6 +1427,5 @@ def analyze_content(text_content):
|
|
| 1429 |
st.write(" ")
|
| 1430 |
st.write(f"No HRMS matches found in the uploaded file")
|
| 1431 |
|
| 1432 |
-
|
| 1433 |
if __name__ == '__main__':
|
| 1434 |
main()
|
|
|
|
| 1333 |
analyze_content(text_content)
|
| 1334 |
|
| 1335 |
|
|
|
|
|
|
|
|
|
|
| 1336 |
def analyze_content(text_content):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1337 |
text_content = remove_specific_lines_from_string(text_content)
|
| 1338 |
+
# st.write(text_content)
|
| 1339 |
+
text_content = re.sub(r'\s+', ' ', text_content).strip() # Replace multiple spaces with a single space
|
| 1340 |
text_content = process_replacements(text_content)
|
| 1341 |
text_content = replace_comma_with_decimal(text_content)
|
| 1342 |
text_content = adjust_space_around_decimal(text_content)
|
| 1343 |
text_content = fix_floats(text_content)
|
| 1344 |
+
# st.write(text_content)
|
| 1345 |
text_content = remove_page_numbers(text_content)
|
|
|
|
| 1346 |
text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
|
| 1347 |
+
text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
|
| 1348 |
+
flags=re.IGNORECASE)
|
| 1349 |
+
text_content = re.sub(r'(c)(\d+)(H)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
|
| 1350 |
+
flags=re.IGNORECASE)
|
| 1351 |
text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
|
|
|
|
| 1352 |
text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b',
|
| 1353 |
+
lambda
|
| 1354 |
+
m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
|
| 1355 |
text_content)
|
| 1356 |
|
| 1357 |
text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b',
|
| 1358 |
+
lambda
|
| 1359 |
+
m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
|
| 1360 |
text_content)
|
|
|
|
| 1361 |
text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
|
| 1362 |
text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
|
| 1363 |
+
text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ").replace('–', '-').replace(',',
|
| 1364 |
+
" ")
|
| 1365 |
text_content = remove_spaces_within_brackets(text_content)
|
| 1366 |
+
# Remove nested brackets from [(M+H]]+ etc.
|
| 1367 |
text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
|
| 1368 |
text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
|
| 1369 |
text_content = text_content.replace(' [[', '[').replace(']]', ']')
|
| 1370 |
|
| 1371 |
+
replacements = {
|
| 1372 |
+
"₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5",
|
| 1373 |
+
"₆": "6", "₇": "7", "₈": "8", "₉": "9", "₀": "0",
|
| 1374 |
+
"¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5",
|
| 1375 |
+
"⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁰": "0",
|
| 1376 |
+
"С": "C", "Н": "H",
|
| 1377 |
+
"C ": "C", " H ": "H", " F ": "F", " N ": "N", " Cl ": "Cl",
|
| 1378 |
+
" Br ": "Br", " O ": "O", " I ": "I", " P ": "P", " B ": "B",
|
| 1379 |
+
" S ": "S", " NO ": "NO", " Na ": "Na", " SNa ": "SNa", " NNa ": "NNa",
|
| 1380 |
+
" + ": "+"
|
| 1381 |
+
}
|
| 1382 |
|
| 1383 |
+
# Apply replacements and additional processing steps.
|
| 1384 |
for original, replacement in replacements.items():
|
| 1385 |
text_content = text_content.replace(original, replacement)
|
| 1386 |
+
text_content = remove_spaces_in_formula(text_content)
|
| 1387 |
+
text_content = text_content.replace('#', '')
|
| 1388 |
text_content = re.sub(r'(C\d+)', r' \1', text_content)
|
| 1389 |
text_content = transform_expressions_in_text(text_content)
|
| 1390 |
text_content = isotope_correct(text_content)
|
| 1391 |
text_content = protect_floats(text_content)
|
| 1392 |
+
text_content = text_content.replace("[13C]", "H1HeXe")
|
| 1393 |
+
text_content = text_content.replace("CF", "C1F")
|
| 1394 |
+
text_content = text_content.replace("HN", "H1N")
|
| 1395 |
+
# st.write(text_content) # Optionally display intermediate output
|
| 1396 |
results1 = search_hrms_with_floats(text_content)
|
| 1397 |
modified_text = text_content
|
| 1398 |
for match in results1:
|
| 1399 |
modified_text = modified_text.replace(match, '')
|
| 1400 |
+
# Clean up any extra spaces
|
| 1401 |
modified_text = re.sub(r'\s+', ' ', modified_text).strip()
|
| 1402 |
text_content = modified_text
|
|
|
|
| 1403 |
results2 = search_calcd_with_floats(text_content)
|
| 1404 |
+
|
| 1405 |
results = results1 + results2
|
| 1406 |
cleaned_results = hrms_cleanup(results, error_dictionary)
|
| 1407 |
+
|
| 1408 |
cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
|
| 1409 |
cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)
|
| 1410 |
|
| 1411 |
+
# Remove duplicate sublists
|
| 1412 |
cleaned_results_new = []
|
| 1413 |
for sublist in cleaned_results:
|
| 1414 |
if sublist not in cleaned_results_new:
|
| 1415 |
cleaned_results_new.append(sublist)
|
| 1416 |
cleaned_results = cleaned_results_new
|
| 1417 |
|
| 1418 |
+
# Count the total number of measurements
|
| 1419 |
num_row = len(cleaned_results)
|
| 1420 |
|
| 1421 |
if cleaned_results:
|
|
|
|
| 1427 |
st.write(" ")
|
| 1428 |
st.write(f"No HRMS matches found in the uploaded file")
|
| 1429 |
|
|
|
|
| 1430 |
if __name__ == '__main__':
|
| 1431 |
main()
|