Matchball committed on
Commit
4c8f4de
·
verified ·
1 Parent(s): 81d6890

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -40
app.py CHANGED
@@ -1333,91 +1333,89 @@ def main():
1333
  analyze_content(text_content)
1334
 
1335
 
1336
- import streamlit as st
1337
- import re
1338
-
1339
  def analyze_content(text_content):
1340
- st.set_page_config(layout="wide") # Ensures better layout control
1341
-
1342
- # Add explicit CSS to ensure scrollbar is visible
1343
- st.markdown(
1344
- """
1345
- <style>
1346
- div.block-container {
1347
- overflow-y: auto;
1348
- }
1349
- iframe {
1350
- overflow: visible !important;
1351
- }
1352
- </style>
1353
- """, unsafe_allow_html=True
1354
- )
1355
-
1356
  text_content = remove_specific_lines_from_string(text_content)
1357
- text_content = re.sub(r'\s+', ' ', text_content).strip()
 
1358
  text_content = process_replacements(text_content)
1359
  text_content = replace_comma_with_decimal(text_content)
1360
  text_content = adjust_space_around_decimal(text_content)
1361
  text_content = fix_floats(text_content)
 
1362
  text_content = remove_page_numbers(text_content)
1363
-
1364
  text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
1365
- text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content, flags=re.IGNORECASE)
1366
- text_content = re.sub(r'(c)(\d+)(H)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content, flags=re.IGNORECASE)
 
 
1367
  text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
1368
-
1369
  text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b',
1370
- lambda m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
 
1371
  text_content)
1372
 
1373
  text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b',
1374
- lambda m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
 
1375
  text_content)
1376
-
1377
  text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
1378
  text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
1379
- text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ").replace('–', '-').replace(',', " ")
 
1380
  text_content = remove_spaces_within_brackets(text_content)
1381
-
1382
  text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
1383
  text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
1384
  text_content = text_content.replace(' [[', '[').replace(']]', ']')
1385
 
1386
- replacements = {"₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8",
1387
- "": "9", "₀": "0", "¹": "1", "²": "2", "³": "3", "": "4", "": "5", "⁶": "6",
1388
- "": "7", "": "8", "": "9", "": "0", "С": "C", "Н": "H", "C ": "C", " H ": "H",
1389
- " F ": "F", " N ": "N", " Cl ": "Cl", " Br ": "Br", " O ": "O", " I ": "I", " P ": "P",
1390
- " B ": "B", " S ": "S", " NO ": "NO", " Na ": "Na", " SNa ": "SNa", " NNa ": "NNa", " + ": "+"}
 
 
 
 
 
 
1391
 
 
1392
  for original, replacement in replacements.items():
1393
  text_content = text_content.replace(original, replacement)
1394
-
1395
- text_content = remove_spaces_in_formula(text_content).replace('#', '')
1396
  text_content = re.sub(r'(C\d+)', r' \1', text_content)
1397
  text_content = transform_expressions_in_text(text_content)
1398
  text_content = isotope_correct(text_content)
1399
  text_content = protect_floats(text_content)
1400
- text_content = text_content.replace("[13C]", "H1HeXe").replace("CF", "C1F").replace("HN", "H1N")
1401
-
 
 
1402
  results1 = search_hrms_with_floats(text_content)
1403
  modified_text = text_content
1404
  for match in results1:
1405
  modified_text = modified_text.replace(match, '')
 
1406
  modified_text = re.sub(r'\s+', ' ', modified_text).strip()
1407
  text_content = modified_text
1408
-
1409
  results2 = search_calcd_with_floats(text_content)
 
1410
  results = results1 + results2
1411
  cleaned_results = hrms_cleanup(results, error_dictionary)
 
1412
  cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
1413
  cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)
1414
 
 
1415
  cleaned_results_new = []
1416
  for sublist in cleaned_results:
1417
  if sublist not in cleaned_results_new:
1418
  cleaned_results_new.append(sublist)
1419
  cleaned_results = cleaned_results_new
1420
 
 
1421
  num_row = len(cleaned_results)
1422
 
1423
  if cleaned_results:
@@ -1429,6 +1427,5 @@ def analyze_content(text_content):
1429
  st.write(" ")
1430
  st.write(f"No HRMS matches found in the uploaded file")
1431
 
1432
-
1433
  if __name__ == '__main__':
1434
  main()
 
1333
  analyze_content(text_content)
1334
 
1335
 
 
 
 
1336
  def analyze_content(text_content):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1337
  text_content = remove_specific_lines_from_string(text_content)
1338
+ # st.write(text_content)
1339
+ text_content = re.sub(r'\s+', ' ', text_content).strip() # Replace multiple spaces with a single space
1340
  text_content = process_replacements(text_content)
1341
  text_content = replace_comma_with_decimal(text_content)
1342
  text_content = adjust_space_around_decimal(text_content)
1343
  text_content = fix_floats(text_content)
1344
+ # st.write(text_content)
1345
  text_content = remove_page_numbers(text_content)
 
1346
  text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
1347
+ text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
1348
+ flags=re.IGNORECASE)
1349
+ text_content = re.sub(r'(c)(\d+)(H)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
1350
+ flags=re.IGNORECASE)
1351
  text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
 
1352
  text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b',
1353
+ lambda
1354
+ m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
1355
  text_content)
1356
 
1357
  text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b',
1358
+ lambda
1359
+ m: f"{m.group(1)}{m.group(2) or ''}{m.group(3)}{m.group(4) or ''}{m.group(5)}{m.group(6) or ''}",
1360
  text_content)
 
1361
  text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
1362
  text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
1363
+ text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ").replace('–', '-').replace(',',
1364
+ " ")
1365
  text_content = remove_spaces_within_brackets(text_content)
1366
+ # Remove nested brackets from [(M+H]]+ etc.
1367
  text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
1368
  text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
1369
  text_content = text_content.replace(' [[', '[').replace(']]', ']')
1370
 
1371
+ replacements = {
1372
+ "": "1", "": "2", "": "3", "": "4", "": "5",
1373
+ "": "6", "": "7", "": "8", "": "9", "": "0",
1374
+ "¹": "1", "²": "2", "³": "3", "": "4", "": "5",
1375
+ "": "6", "": "7", "": "8", "": "9", "": "0",
1376
+ "С": "C", "Н": "H",
1377
+ "C ": "C", " H ": "H", " F ": "F", " N ": "N", " Cl ": "Cl",
1378
+ " Br ": "Br", " O ": "O", " I ": "I", " P ": "P", " B ": "B",
1379
+ " S ": "S", " NO ": "NO", " Na ": "Na", " SNa ": "SNa", " NNa ": "NNa",
1380
+ " + ": "+"
1381
+ }
1382
 
1383
+ # Apply replacements and additional processing steps.
1384
  for original, replacement in replacements.items():
1385
  text_content = text_content.replace(original, replacement)
1386
+ text_content = remove_spaces_in_formula(text_content)
1387
+ text_content = text_content.replace('#', '')
1388
  text_content = re.sub(r'(C\d+)', r' \1', text_content)
1389
  text_content = transform_expressions_in_text(text_content)
1390
  text_content = isotope_correct(text_content)
1391
  text_content = protect_floats(text_content)
1392
+ text_content = text_content.replace("[13C]", "H1HeXe")
1393
+ text_content = text_content.replace("CF", "C1F")
1394
+ text_content = text_content.replace("HN", "H1N")
1395
+ # st.write(text_content) # Optionally display intermediate output
1396
  results1 = search_hrms_with_floats(text_content)
1397
  modified_text = text_content
1398
  for match in results1:
1399
  modified_text = modified_text.replace(match, '')
1400
+ # Clean up any extra spaces
1401
  modified_text = re.sub(r'\s+', ' ', modified_text).strip()
1402
  text_content = modified_text
 
1403
  results2 = search_calcd_with_floats(text_content)
1404
+
1405
  results = results1 + results2
1406
  cleaned_results = hrms_cleanup(results, error_dictionary)
1407
+
1408
  cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
1409
  cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)
1410
 
1411
+ # Remove duplicate sublists
1412
  cleaned_results_new = []
1413
  for sublist in cleaned_results:
1414
  if sublist not in cleaned_results_new:
1415
  cleaned_results_new.append(sublist)
1416
  cleaned_results = cleaned_results_new
1417
 
1418
+ # Count the total number of measurements
1419
  num_row = len(cleaned_results)
1420
 
1421
  if cleaned_results:
 
1427
  st.write(" ")
1428
  st.write(f"No HRMS matches found in the uploaded file")
1429
 
 
1430
  if __name__ == '__main__':
1431
  main()