Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,1399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
import logging
import re
import time
from typing import List, Match

import fitz  # PyMuPDF
import streamlit as st
from molmass import Formula
|
| 9 |
+
|
| 10 |
+
# Configure the root logger when this module is executed: records at INFO
# level and above, formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def remove_specific_lines_from_string(input_string):
    """Drop lines that consist solely of an integer, optionally prefixed by 'S'.

    Lines such as ``"12"``, ``"  7 "`` or ``"S3"`` are discarded (surrounding
    whitespace is tolerated); every other line is kept in its original order.

    Args:
        input_string (str): Newline-separated text.

    Returns:
        str: The text with the matching lines removed.
    """
    number_only_line = re.compile(r'^\s*(S)?\d+\s*$')
    kept = []
    for candidate in input_string.split('\n'):
        if number_only_line.match(candidate) is None:
            kept.append(candidate)
    return '\n'.join(kept)
|
| 20 |
+
|
| 21 |
+
def check_conditions(cleaned_results):
    """Return True only if every row passes both acceptance checks.

    A row passes when:
      * its 8th column (index 7) is one of ``""``, ``"-0.0001"``,
        ``"+0.0001"`` or ``"Electron mass error"``; and
      * its 7th column (index 6) converts to a float strictly below 10.

    A 7th-column value that cannot be converted to a float fails the row.
    This includes values of an unsupported type such as None, which
    previously escaped as an uncaught TypeError.

    Args:
        cleaned_results: Iterable of indexable rows with at least 8 columns.

    Returns:
        bool: True if all rows pass (also for empty input), otherwise False.
    """
    allowed_notes = ("", "-0.0001", "+0.0001", "Electron mass error")
    for row in cleaned_results:
        if row[7] not in allowed_notes:
            return False
        try:
            value = float(row[6])
        except (TypeError, ValueError):
            # Conversion failed: treat the row as not meeting the conditions.
            return False
        if value >= 10:
            return False
    return True
|
| 34 |
+
|
| 35 |
+
def fix_floats(text):
    """Append a trailing zero to every float that has exactly three decimals.

    A number such as "123.456" (digits, a point, exactly three digits, with
    word boundaries on both sides) becomes "123.4560". Numbers with fewer or
    more decimals are left alone.

    Args:
        text (str): The text to scan.

    Returns:
        str: The text with each three-decimal float padded to four decimals.
    """
    three_decimal_float = r'\b\d+\.\d{3}\b'
    # \g<0> is the whole match; the literal 0 after it is the padding digit.
    return re.sub(three_decimal_float, r'\g<0>0', text)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def remove_sublists_with_missing_element1_positions_swapped(cleaned_results):
    """Drop rows whose first element is '' when a fuller duplicate exists.

    Two rows count as duplicates when their second elements are equal and
    their third and fourth elements are equal as an unordered pair (so the
    third and fourth may be swapped). A row with an empty first element is
    removed only if some duplicate row has a non-empty first element.
    Rows with fewer than four elements are never compared and always kept.

    Args:
        cleaned_results: List of rows (sequences with hashable items).

    Returns:
        list: A new list with the redundant rows left out; order is preserved.
    """
    def _match_key(entry):
        # Rows that are too short take no part in the comparison.
        if len(entry) < 4:
            return None
        # frozenset makes the third/fourth elements order-insensitive.
        return (entry[1], frozenset([entry[2], entry[3]]))

    keys = [_match_key(entry) for entry in cleaned_results]

    # Keys seen on at least one row that does carry a first element.
    keys_with_first_element = {
        key
        for key, entry in zip(keys, cleaned_results)
        if key is not None and entry[0] != ''
    }

    survivors = []
    for key, entry in zip(keys, cleaned_results):
        redundant = (
            key is not None
            and entry[0] == ''
            and key in keys_with_first_element
        )
        if not redundant:
            survivors.append(entry)
    return survivors
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def remove_spaces_in_formula(text):
    """Close up the spaces inside chemical formulas found in *text*.

    A formula is recognised as an element symbol (an uppercase letter with an
    optional lowercase letter) followed by one or more further symbols or
    integers, each optionally preceded by whitespace. Space characters inside
    such a run are deleted ("C 6 H 12" -> "C6H12").

    Before the search, every decimal number is wrapped in '#' markers so
    that the formula pattern cannot absorb it. These markers are NOT removed
    again here, so they are present in the returned string.

    Args:
        text: The input string containing chemical formulas.

    Returns:
        The string with '#'-wrapped decimals and space-free formulas.
    """
    marked = re.sub(r'(\d+\.\d+)', r'#\1#', text)

    symbol = r'[A-Z][a-z]?'
    count = r'\d+'
    formula_pattern = r'(%s(?:\s*(?:%s|%s))+)' % (symbol, symbol, count)

    # Only ' ' is stripped; other whitespace matched by \s is left in place.
    return re.sub(formula_pattern, lambda hit: hit.group(0).replace(' ', ''), marked)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def remove_page_numbers(text):
    """Strip lines that look like page numbers from *text*.

    A line is dropped when, ignoring surrounding whitespace, it is:
      - a bare integer ("12");
      - an integer between dashes ("- 12 -", "-13-");
      - an integer with a p/P/s/S prefix and an optional dash
        ("P12", "s23", "S-12", "p -13").

    Args:
        text (str): Input text, lines separated by newline characters.

    Returns:
        str: The text without the page-number lines.
    """
    page_number_patterns = [
        r'^\s*\d+\s*$',                        # Single integers: "12"
        r'^\s*-\s*\d+\s*-\s*$',                # Dashed integers: "- 12 -"
        r'^\s*-\d+-\s*$',                      # Compact dashed integers: "-13-"
        r'^\s*[psPS]\s*-?\s*\d+\s*(?:\n|$)',   # p/P/s/S prefixed: "P12", "s23", "S-12"
    ]
    # One alternation of all patterns, compiled once for the whole text.
    page_number = re.compile('|'.join(f'({p})' for p in page_number_patterns))

    kept = []
    for line in text.split('\n'):
        if page_number.match(line):
            continue
        kept.append(line)
    return '\n'.join(kept)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def is_float(value):
    """Return True if *value* can be converted with ``float()``, else False.

    Values of a type that ``float()`` rejects outright (for example None or
    a list) are reported as False rather than letting TypeError propagate.

    Args:
        value: Any object, typically a string taken from parsed text.

    Returns:
        bool: Whether ``float(value)`` succeeds.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True
|
| 172 |
+
|
| 173 |
+
def protect_floats(text: str) -> str:
    """Make sure high-precision floats are set off from neighbouring text.

    A float here means three or more digits, a decimal point, then four or
    more digits. A single space is inserted on each side of such a number
    unless the adjacent character is already whitespace. A float touching
    the very start or end of *text* also receives a space on that side.

    Args:
        text: The text to process.

    Returns:
        The text with the padding spaces inserted.
    """
    long_float = re.compile(r'(\d{3,}\.\d{4,})')

    def _pad(hit):
        begin, stop = hit.span(1)
        # Neighbours are looked up in the original text. At the string edges
        # the neighbour is '', and ''.isspace() is False, so padding is added.
        lead = text[begin - 1] if begin > 0 else ''
        trail = text[stop:stop + 1]

        pieces = [hit.group(1)]
        if not lead.isspace():
            pieces.insert(0, ' ')
        if not trail.isspace():
            pieces.append(' ')
        return ''.join(pieces)

    return long_float.sub(_pad, text)
|
| 194 |
+
|
| 195 |
+
def replace_comma_with_decimal(text: str) -> str:
    """Rewrite decimal commas as decimal points ("12,5" -> "12.5").

    A candidate is a run of digits, one comma, and another run of digits,
    delimited by word boundaries on both sides. Matches do not overlap, so
    "1,234,567" becomes "1.234,567".

    Args:
        text: The text to process.

    Returns:
        The text with the matched commas replaced by points.
    """
    comma_number = re.compile(r'\b(\d+,\d+)\b')

    def _with_point(hit):
        # The match holds exactly one comma, so partition splits it cleanly.
        whole_part, _, fraction = hit.group(0).partition(',')
        return f'{whole_part}.{fraction}'

    return comma_number.sub(_with_point, text)
|
| 209 |
+
|
| 210 |
+
def adjust_space_around_decimal(text):
    """Normalise whitespace around decimal points in *text*.

    Three regex rewrites are applied in order, each on the output of the
    previous one:
      1. whitespace next to a point between two digit runs is removed
         ("23. 4562" -> "23.4562");
      2. a space is inserted between a decimal number and a letter that
         directly follows it ("2.4beta" -> "2.4 beta");
      3. whitespace between a whitespace-preceded integer and a following
         ".letters" token is removed ("file 12 .txt" -> "file 12.txt").

    Args:
        text (str): The text to process.

    Returns:
        str: The rewritten text.

    Raises:
        TypeError: If *text* is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")

    rewrite_steps = (
        (r'(\d+)\s*\.\s*(\d+)', r'\1.\2'),
        (r'(\d+\.\d+)([A-Za-z])', r'\1 \2'),
        (r'(\s\d+)\s+(\.[A-Za-z]+\b)', r'\1\2'),
    )
    result = text
    for pattern, replacement in rewrite_steps:
        result = re.sub(pattern, replacement, result)
    return result
|
| 228 |
+
|
| 229 |
+
def decrease_element_count(molecular_formula: str, element_to_decrease: str) -> str:
    """Lower the count written after an element symbol in a formula by one.

    The symbol is matched only where it is not followed by a lowercase
    letter, so asking for 'C' does not touch 'Cl'. A count of 3 or more is
    written back reduced by one; a count of 2 leaves the bare symbol.

    NOTE(review): a symbol with count 1 or with no count is also returned as
    the bare symbol (it is not removed), and every occurrence matched in the
    formula is rewritten, not just the first. *element_to_decrease* is placed
    in the regular expression unescaped. Confirm that callers expect this.

    Args:
        molecular_formula: The input molecular formula (e.g., 'C6H12O2').
        element_to_decrease: The element symbol to act on (e.g., 'C').

    Returns:
        The rewritten molecular formula.

    Example:
        >>> decrease_element_count('C6H12O2', 'C')
        'C5H12O2'
    """
    symbol_with_count = re.compile(fr'({element_to_decrease})(?![a-z])\d*')

    def _rewrite(hit):
        token = hit.group()
        symbol = re.match(r'([A-Z][a-z]*)', token).group()
        digits = re.search(r'\d+', token)
        if digits is None:
            return symbol
        remaining = int(digits.group()) - 1
        if remaining > 1:
            return f"{symbol}{remaining}"
        # A remaining count of 1 (or less) is written as the bare symbol.
        return symbol

    return symbol_with_count.sub(_rewrite, molecular_formula)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def have_swapped_adjacent_digits(float1: float, float2: float) -> bool:
    """Tell whether two numbers differ only by one swap of neighbouring digits.

    Each value is converted with ``str()``, its last two characters are
    discarded, and the decimal point is removed. The remaining digit strings
    must have equal length (at least 2) and differ in exactly two adjacent
    positions whose characters are exchanged.

    Args:
        float1: First number.
        float2: Second number.

    Returns:
        True if the compared parts are one adjacent transposition apart.
    """
    def _digits(number):
        # str() form, minus its last two characters, minus the decimal point.
        return str(number)[:-2].replace('.', '')

    first, second = _digits(float1), _digits(float2)
    if len(first) != len(second) or len(first) < 2:
        return False

    mismatches = [pos for pos, (a, b) in enumerate(zip(first, second)) if a != b]
    if len(mismatches) != 2:
        return False

    left, right = mismatches
    if right - left != 1:
        return False
    return first[left] == second[right] and first[right] == second[left]
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def differ_in_single_digit_except_last_two(float1: float, float2: float) -> bool:
    """Tell whether two numbers differ in exactly one leading character.

    Both values are converted with ``str()`` and stripped of trailing zeros
    and a trailing decimal point. The resulting strings must have the same
    length (at least 3) and identical last two characters; the parts before
    those two characters must then differ in exactly one position.

    Args:
        float1: First floating-point number.
        float2: Second floating-point number.

    Returns:
        True if the conditions above hold, False otherwise.

    Examples:
        >>> differ_in_single_digit_except_last_two(123.45, 153.45)
        True
        >>> differ_in_single_digit_except_last_two(123.45, 153.46)
        False
    """
    def _normalise(number):
        return str(number).rstrip('0').rstrip('.')

    text_a, text_b = _normalise(float1), _normalise(float2)

    if len(text_a) != len(text_b):
        return False
    if len(text_a) < 3:
        return False
    # The final two characters have to agree exactly.
    if text_a[-2:] != text_b[-2:]:
        return False

    mismatches = 0
    for char_a, char_b in zip(text_a[:-2], text_b[:-2]):
        if char_a != char_b:
            mismatches += 1
    return mismatches == 1
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def calculate_molecular_weight(formula):
    """Sum the atomic weights of the elements named in a formula string.

    The formula is read as a sequence of element symbols (an uppercase letter
    plus an optional lowercase letter), each optionally followed by an
    integer count; a missing count means 1. Symbols that are not in the
    table contribute 0.0. Only symbol/count pairs are read; brackets or
    other characters in the string are not interpreted.

    Args:
        formula (str): Molecular formula such as "C6H12O6".

    Returns:
        float: The summed weight, using the table values (amu or g/mol).
    """
    # Atomic weights for elements up to plutonium (94), plus "D".
    atomic_weights = {
        "H": 1.008, "D": 2.0141, "He": 4.002602,
        "Li": 6.94, "Be": 9.0121831, "B": 10.81, "C": 12.011, "N": 14.007,
        "O": 15.999, "F": 18.9984, "Ne": 20.1797,
        "Na": 22.98977, "Mg": 24.305, "Al": 26.98154, "Si": 28.085,
        "P": 30.97376, "S": 32.06, "Cl": 35.45, "Ar": 39.948,
        "K": 39.0983, "Ca": 40.078, "Sc": 44.955908, "Ti": 47.867,
        "V": 50.9415, "Cr": 51.9961, "Mn": 54.938044, "Fe": 55.845,
        "Co": 58.933194, "Ni": 58.6934, "Cu": 63.546, "Zn": 65.38,
        "Ga": 69.723, "Ge": 72.630, "As": 74.921595, "Se": 78.971,
        "Br": 79.904, "Kr": 83.798,
        "Rb": 85.4678, "Sr": 87.62, "Y": 88.90584, "Zr": 91.224,
        "Nb": 92.90637, "Mo": 95.95, "Tc": 98, "Ru": 101.07,
        "Rh": 102.90550, "Pd": 106.42, "Ag": 107.8682, "Cd": 112.414,
        "In": 114.818, "Sn": 118.710, "Sb": 121.760, "Te": 127.60,
        "I": 126.90447, "Xe": 131.293,
        "Cs": 132.90545196, "Ba": 137.327, "La": 138.90547, "Ce": 140.116,
        "Pr": 140.90766, "Nd": 144.242, "Pm": 145, "Sm": 150.36,
        "Eu": 151.964, "Gd": 157.25, "Tb": 158.92535, "Dy": 162.500,
        "Ho": 164.93033, "Er": 167.259, "Tm": 168.93422, "Yb": 173.04,
        "Lu": 174.9668, "Hf": 178.49, "Ta": 180.94788, "W": 183.84,
        "Re": 186.207, "Os": 190.23, "Ir": 192.217, "Pt": 195.084,
        "Au": 196.96657, "Hg": 200.592, "Tl": 204.38, "Pb": 207.2,
        "Bi": 208.9804, "Po": 209, "At": 210, "Rn": 222,
        "Fr": 223, "Ra": 226, "Ac": 227, "Th": 232.0377,
        "Pa": 231.03588, "U": 238.02891, "Np": 237, "Pu": 244,
    }

    total = 0.0
    for hit in re.finditer(r"([A-Z][a-z]?)(\d*)", formula):
        symbol, digits = hit.groups()
        # An absent count is captured as '' and stands for a single atom.
        total += atomic_weights.get(symbol, 0.0) * int(digits or 1)
    return total
|
| 368 |
+
|
| 369 |
+
def remove_spaces_within_brackets(s, max_chars=20):
    """Delete the spaces inside short (...) and [...] groups.

    Brackets are paired with a stack. For each properly matched pair whose
    content has at most *max_chars* non-space characters, every space
    character between the two brackets is removed. Text outside qualifying
    brackets is never modified. A closing bracket that does not match the
    most recent opening bracket is ignored and leaves the stack untouched.

    All positions are computed on the original string and applied in one
    pass. Editing ranges one after another would shift the indices of an
    enclosing bracket pair after its inner pair shrank, which removed spaces
    located after the outer closing bracket.

    Args:
        s (str): The input string.
        max_chars (int): Maximum number of non-space characters allowed
            between an opening and its closing bracket for the pair to qualify.

    Returns:
        str: The string with spaces removed inside qualifying brackets.
    """
    closing_to_opening = {')': '(', ']': '['}
    open_stack = []   # (bracket character, index) of brackets still open
    drop = set()      # indices (into the original string) of spaces to delete

    for i, char in enumerate(s):
        if char in ('(', '['):
            open_stack.append((char, i))
        elif char in closing_to_opening:
            if not open_stack or open_stack[-1][0] != closing_to_opening[char]:
                continue  # unmatched or mismatched closing bracket
            _, open_pos = open_stack.pop()
            inner = s[open_pos + 1:i]
            if len(inner) - inner.count(' ') <= max_chars:
                drop.update(
                    j for j in range(open_pos + 1, i) if s[j] == ' '
                )

    return ''.join(ch for j, ch in enumerate(s) if j not in drop)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def isotope_correct(text):
    """
    Applies a fixed, ordered series of literal substitutions to a text to
    correct isotope labeling and clean up ion/adduct notation.

    The substitutions are applied one after another in dictionary order, each
    on the result of the previous one, so the order of the entries matters.
    Some entries work in pairs: an early entry wraps an ambiguous sequence in
    '*' markers (e.g. "H35Cl" -> "*H35*Cl") so that a later isotope rule
    ("35Cl" -> "[35Cl]") cannot match it, and a later entry removes the
    markers again.

    Parameters:
        text (str): The input text to be processed.

    Returns:
        str: The processed text with all substitutions applied.
    """
    # NOTE(review): a few keys contain non-ASCII characters that look
    # mis-encoded; confirm them against the original source file.
    # NOTE(review): "NH4+)" maps to "MH4]+", which looks like a typo for
    # "NH4]+"; left unchanged pending confirmation.
    replacements = {
        "For":" ","[MALDI]":"","[MALDI-TOF]":"","detected":" ","page": " ", "of": " ", "π": " ", "EI": " ", " . ": " ", ":": " ", "Ξ": " ",
        "πΌ": " ", " a ": " ", "M ": " ", " H ": " ", "ESI": " ", " Na ": " ", " K ": " ",
        " NH4 ": " ", "Obs.": " ", "obs": " ", "78.9183": "", "48Ti": "[48Ti]","54Fe":"[54Fe]",
        "46Ti": "[46Ti]", "47Ti": "[47Ti]", " 2H": "D", " [3H]": "[3H]",
        " 10B": "[10B]", "127I": "[127I]", "120Sn":"[120Sn]", "119Sn":"[119Sn]", "118Sn":"[118Sn]",
        "N23Na": "*N23*Na","O23Na": "*O23*Na", "F23Na": "*F23*Na", "H23Na": "*H23*Na", "23Na":"[23Na]","H28Si": "*H28*Si", "H11B": "*H11*B",
        "H13Co": "*H13*Co", "H13Cl": "*H13*Cl", "H18O": "*H18*O", "H218O": "*H218*O", "N18O": "*N18*O",
        "H35Cl": "*H35*Cl", "H37Cl": "*H37*Cl", "H10B":"*H10*B", "H19F": "*H19*F", "H81Br":"*H81*Br","H79Br":"*H79*Br","Br79": "[79Br]",
        " 79Br": "[79Br]", " 81Br": "[81Br]", "18O": "[18O]", "74Ge": "[74Ge]", "65Cu":"[65Cu]",
        "63Cu":"[63Cu]", "Br81": "[81Br]", " 35Cl": "[35Cl]", " 37Cl": "[37Cl]", " 11B": "[11B]",
        " 32S": "S", " 31P": "P", "35Cl":"[35Cl]", "80Se":"[80Se]", "37Cl":"[37Cl]", "28Si":"[28Si]",
        "13C":"[13C]", "[13C]l":"13Cl", "96Ru":"[96Ru]","79Br":"[79Br]", "81Br":"[81Br]", "11B":"[11B]", "10B":"[10B]",
        "[10B]r":"10Br", "[[":"[", "]]":"]", "*H13*Cl": "H13Cl", "*H18*O": "H18O", "*H218*O": "H218O",
        # The two bromine keys must mirror the markers created above
        # ("*H81*Br" / "*H79*Br"); otherwise the asterisks are never removed.
        "*N18*O": "N18O", "*H13*Co": "H13Co", "*H37*Cl": "H37Cl", "*H35*Cl": "H35Cl","*H81*Br":"H81Br","*H79*Br":"H79Br",
        "*H28*Si": "H28Si", "*H10*B":"H10B", "*H23*Na": "H23Na", "*F23*Na": "F23Na", "*N23*Na": "N23Na","*O23*Na": "O23Na",
        "*H11*B":"H11B", "*H19*F": "H19F", "cacld": "", "calcd.": "calcd ", "calcβd": "calcd ",
        "calcd gcm": " ", " is ": " ", "calcd": "calcd ", "calcd ": "calcd ","++": "+","(M":"[M", ")+":"]+ ",
        "MALDI":"","Maldi":""," [13C]":"[13C]"," [127I]":"[127I]"," [12C":"C"," [37Cl]":"37Cl"," [35Cl]":"35Cl",
        "C ":"C","H":"H", " N":"N"," O":"O"," Na":"Na", " Br":"Br", "N ":"N"," Cl":"Cl", " F":"F"," S":"S"," P":"P"," B":"B","M]+H+]":"M+H]+","M]-H+]":"M-H]-",
        "MH+":"M+H]+ ","]-(":"]- ","]+)":"]+ ","]-)":"]- ","]2-)":"]2- ","]+C":"]+ C","[MM":"","=":"","[MeOH":" ","[MeCN":" ","m/z":" ","]+2 ":"]2+ ","]+1":"]+","M+ C":"M+C","+]":"]+","+calc":" calc",
        "Na)]":"Na]","+Na)":"+Na]",";":" ","+H)]":"+H]","+K)]":"+K]","+NH4)]":"+NH4]","+H)":"+H]","H+)":"H]+","Na+)":"Na]+","-calcd":"- calcd","[M-H] ":"[M-H]-","--":"-",
        "NH4+)":"MH4]+","M+)":"M]+","M]+)":"M]+","+)":"+","M- ":"M-","+.":"+","[MNa]+":"[M+Na]+","[MH]+":"[M+H]+",
        " M2+ ":" [M]2+ "," M3+ ":" [M]3+ "," M4+ ":" [M]4+ "," M5+ ":" [M]5+ "," M6+ ":" [M]6+ ",
        " M2- ": " [M]2- ", " M3- ": " [M]3- ", " M4- ": " [M]4- ", " M5- ": " [M]5- ", " M6- ": " [M]6- ","[M+H] ":"[M+H]+ ","[M+Na] ":"[M+Na]+ ","[M] ":"[M]+ ","]calcd":"] calcd","-.":"- ","M+1)":"M+1]+ ","+κ":"+","]-calcd":"]- calcd",
        "[Methyl":" ","[MA":" ","[ME":" ","[MI":" ","[MO":" ","[MU":" ","[Ma":" ","[Mi":" ","[Mo":" ","[Mu":" ","[Mg":" ","[M+H].":"[M+H]+ ","[M+Na].":"[M+Na]+ ","].":"] ","[M+Na]+":"[M+Na]+ ","[M+H]+":"[M+H]+ ","[M]+Na]+":"[M+Na]+","[M]+H]+":"[M+H]+"
    }

    # Apply each replacement in the dictionary to the text, in insertion order
    for original, replacement in replacements.items():
        text = text.replace(original, replacement)

    return text
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
def transform_expressions_in_text(text):
    """
    Normalizes charge symbols and bracketed ion expressions within a text.

    Step 1 replaces every key of the symbol table below by its value (symbol
    variants become an ASCII '+' or '-', or are removed). Keys are tried
    longest-first, so a multi-character key such as '-.' is not shadowed by a
    shorter key that is a prefix of it.

    Step 2 rewrites every bracketed token of the form

        '(' or '['  +  letters / digits / '-'  +  ')' or ']'  +  optional charge

    as "[token]charge":
    - The optional charge behind the bracket is "<digits>+" or "-".
    - If there is no charge behind the bracket, a trailing '-' inside the
      bracket is moved behind it ("[M-]" becomes "[M]-").
    - Any ',' or ':' directly following the expression is dropped.
    - Round brackets are converted to square brackets, also for ordinary
      parenthesised words such as "(ESI)".
    - Tokens containing '+' or spaces inside the brackets (e.g. "[M+H]+") do
      not match the pattern and are left unchanged.

    Args:
    - text (str): The input text containing chemical expressions.

    Returns:
    - str: The text with symbols and matching expressions normalized.
    """

    # Step 1: Replace specific symbols with corresponding charges
    # NOTE(review): several non-ASCII keys below look mis-encoded and render
    # identically; duplicate keys in a dict literal keep only the last value.
    # Confirm the intended characters against the file's actual bytes.
    symbol_replacements = {

        'β': '+',
        'β’+': '+',
        'ο': '+',
        'ο«': "+",
        '+.': '+ ',
        'β’': '',
        'Β·': '',
        'β': '',
        'κ': '',
        'β': '-',
        '-': '-',
        'β.': '- ',
        'β': '-',  # Minus sign
        'β': '-',  # Em dash
        'β': '-',
        'Λ': '-',
        '-.': '- ',
    }

    # Regex alternation picks the first alternative that matches, so longer
    # keys must come before their own prefixes (sorted() is stable, so keys of
    # equal length keep their dict order).
    ordered_symbols = sorted(symbol_replacements, key=len, reverse=True)
    symbols_pattern = re.compile('|'.join(map(re.escape, ordered_symbols)))
    text = symbols_pattern.sub(lambda match: symbol_replacements[match.group()], text)

    # Step 2: Define regex to find expressions
    # This pattern matches expressions enclosed in [] or () with optional charges outside
    expression_pattern = re.compile(
        r'[\[(]'                     # Opening bracket [ or (
        r'(\d*M?\d*[a-zA-Z\d-]*)'    # Token: only letters, digits and '-'
        r'[])]'                      # Closing bracket ] or )
        r'(\d*\+|-)?'                # Optional charge outside the brackets
        r'[,:]*'                     # Trailing ',' / ':' (dropped)
    )

    def replace_expression(match):
        expression_part = match.group(1)  # The main part of the expression
        charge_outside = match.group(2)   # The charge outside the brackets, if any

        # The token cannot contain brackets or whitespace (see the character
        # class above), so no further cleaning of the token is required.
        if charge_outside:
            charge = charge_outside
        else:
            # Step 3: Take a trailing sign from inside the brackets as charge
            charge_match = re.search(r'([+-])$', expression_part)
            if charge_match:
                charge = charge_match.group(1)
                expression_part = expression_part[:charge_match.start()]
            else:
                charge = ''

        # Step 4: Format the transformed expression
        return f'[{expression_part}]{charge}'

    # Step 5: Substitute all matching expressions in the text
    return expression_pattern.sub(replace_expression, text)
|
| 560 |
+
|
| 561 |
+
def transform_molecular_formula(formula):
    """
    Transforms a molecular formula / ion string to a standardized format.

    The steps, in order:
    1. Remove round brackets and colons.
    2. Inside every [...] group remove spaces; if the group contains '+'
       and/or '-', append that sign behind the closing bracket.
    3. Normalize "M <sign>" to "M<sign>".
    4. Put spaces around a purely alphanumeric [ion] that is glued to
       neighbouring text on both sides.
    5. If the string contains 'M' but no '[', wrap the whole string in [].
    6. Put spaces around "calcd for" and "found", replace commas by spaces and
       collapse runs of whitespace.
    7. Collapse the doubled signs the earlier steps can produce
       ("++" -> "+", "--" -> "-", "-+" -> "+", "]+-" -> "]+").

    Args:
        formula: The molecular formula string to transform.

    Returns:
        The transformed molecular formula string.
    """

    # Remove all round brackets and colons
    formula = formula.replace("(", "").replace(")", "").replace(":", "").replace("]+-", "]+")

    def _tidy_bracket(match):
        # Strip spaces inside the brackets and repeat any inner sign behind them
        inner = match.group(1)
        plus = '+' if '+' in inner else ''
        minus = '-' if '-' in inner else ''
        return '[' + inner.replace(' ', '') + ']' + plus + minus

    formula = re.sub(r'\[(.*?)]', _tidy_bracket, formula)

    # Replace "M <dash variant>" with "M-"
    # NOTE(review): the non-ASCII character in this pattern looks mis-encoded
    # (a minus/dash variant is presumably intended) - confirm against the file.
    formula = re.sub(r'M\s*β', 'M-', formula)

    # Replace "M +" or "M+" with "M+"
    formula = re.sub(r'M\s*\+', 'M+', formula)

    # Surround a standardized ion with single spaces, but only when it is
    # directly preceded and followed by non-space characters
    formula = re.sub(r'([^ ])(\[\w+][+-]?)(?=\S)', r'\1 \2 ', formula)

    # Add brackets if "M" is present without brackets
    if "M" in formula and "[" not in formula:
        formula = "[" + formula + "]"

    # Add spaces around "calcd for", "found"
    formula = re.sub(r'(calcd\s*for|found)', r' \1 ', formula)

    # Collapse repeated '+' (twice, so that "+++" also ends up as "+") and
    # turn commas into spaces
    formula = formula.replace("++", "+").replace("++", "+").replace(",", " ")

    # Remove double spaces
    formula = re.sub(r'\s+', ' ', formula)

    # Clean up sign combinations created by the bracket step. "--" collapses
    # to a single '-' (it used to be deleted, which stripped the charge from
    # anions such as "[M-H]-").
    formula = (
        formula.replace("-+", "+")
        .replace("]+-", "]+")
        .replace("+]+", "]+ ")
        .replace("++", "+")
        .replace("--", "-")
    )

    return formula
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
# Configure the root logger at import time: INFO level and above, printing
# only the bare message (no timestamp, level or logger name prefix)
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def generate_error_dictionary(element_list, counts_range, special_cases=None):
    """
    Generates an error dictionary mapping mass differences to element or group descriptions.

    For every entry of element_list the monoisotopic mass is taken from
    `Formula(entry).monoisotopic_mass`; entries that raise are reported with
    print() and skipped. An entry with more than one capital letter is treated
    as a group and registered for count=1 only ("1 OH-group"); any other entry
    is treated as an atom and registered for every count in counts_range
    ("1 H-atom", "2 H-atoms", ...).

    Each count is registered under three keys: count * mass, and that value
    plus / minus count * electron mass. Keys are rounded to 4 decimals. When
    several descriptions share a key they are joined with ", "; a description
    is not added twice to the same key.

    Parameters:
    - element_list (list): List of element symbols or groups (e.g., ['H', 'O', 'N', 'OH']).
    - counts_range (range): Range of atom counts for atoms (e.g., range(1, 11) for counts 1-10).
    - special_cases (dict): Optional dictionary for special error cases, keyed
                            by the mass as string (e.g., {'0.0005': 'Electron mass error'}).

    Returns:
    - dict: Error dictionary with mass differences (float) as keys and descriptions as values.
    """

    error_dict = {}
    electron_mass = 0.0005486  # Atomic mass units (amu)

    def _register(mass, description):
        # Insert the description under the rounded mass, or append it to the
        # descriptions already stored there. Whole entries are compared, so a
        # description is not lost just because it is part of a longer one.
        key = round(mass, 4)
        existing = error_dict.get(key)
        if existing is None:
            error_dict[key] = description
        elif description not in existing.split(", "):
            error_dict[key] = f"{existing}, {description}"

    for element in element_list:
        try:
            atomic_mass = Formula(element).monoisotopic_mass
        except Exception as e:
            print(f"Error processing element {element}: {e}")
            continue  # Skip this element if there's an error

        # Determine if the element is a group (more than one capital letter)
        is_group = sum(1 for c in element if c.isupper()) > 1

        if is_group:
            # For groups, create entries only for count=1
            labelled_counts = [(1, f"1 {element}-group")]
        else:
            # For atoms, create entries for counts in counts_range
            labelled_counts = [
                (count, f"{count} {element}-atom" if count == 1 else f"{count} {element}-atoms")
                for count in counts_range
            ]

        for count, description in labelled_counts:
            mass_diff = atomic_mass * count
            electron_shift = electron_mass * count
            # Plain value, then shifted up and down by the electron mass
            for mass in (mass_diff, mass_diff + electron_shift, mass_diff - electron_shift):
                _register(mass, description)

    # Add Special Cases if Provided
    if special_cases:
        for mass, desc in special_cases.items():
            _register(float(mass), desc)

    return error_dict
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
# Hand-maintained special cases that are merged into the generated error
# dictionary. Keys are mass differences written as decimal strings; they are
# converted with float() and rounded to 4 decimals by
# generate_error_dictionary, so a leading '+' is accepted and a leading '-'
# yields a negative key. Values are the messages shown for that difference.
special_errors = {
    # Difference of one electron mass
    '0.0005': "Electron mass error",
    '0.0006': "Electron mass error",
    # Differences attributed to using a nominal mass of 1.0000 for H
    '0.0073': "Nominal mass error (H=1.0000)?",
    '0.0072': "Nominal mass error (H=1.0000)?",
    '0.0071': "Nominal mass error (H=1.0000)?",
    '0.0070': "Nominal mass error (H=1.0000)?",
    '1.0005': "Nominal mass error (H=1.0000)?",
    '1.0006': "Nominal mass error (H=1.0000)?",
    # Differences attributed to using a nominal mass of 23.0000 for Na
    '0.0102': "Nominal mass error (Na=23.0000)?",
    '0.0103': "Nominal mass error (Na=23.0000)?",
    '0.0107': "Nominal mass error (Na=23.0000)?",
    '0.0108': "Nominal mass error (Na=23.0000)?",
    # Additional values reported as one H / one Na atom
    '1.0077': '1 H-atom',
    '1.0076': '1 H-atom',
    '1.0075': '1 H-atom',
    '1.0083': '1 H-atom',
    '+22.9897': '1 Na-atom',
    '+22.9902': '1 Na-atom',
    '21.9892':"Nominal mass error [M]+1.0000 (not [M+Na]+)",
    '21.9893':"Nominal mass error [M]+1.0000 (not [M+Na]+)",
    # Isotope-related differences
    '0.9964': 'Specify measured B-isotope(s)',
    '0.9963': 'Specify measured B-isotope(s)',
    '1.9927': 'Specify measured B-isotopes',
    '1.9928': 'Specify measured B-isotopes',
    '1.9979': 'Specify measured Br-isotope(s)',
    '1.9980': 'Specify measured Br-isotope(s)',
    # Element exchanges; the signed pairs describe the two directions
    '+17.9906':"Exchange 1 H- with 1 F-atom",
    '-17.9906':"Exchange 1 F- with 1 H-atom",
    '+14.9871':"Exchange 1 H- with 1 O-atom",
    '-14.9871':"Exchange 1 O- with 1 H-atom",
    '+77.9105':"Exchange 1 H- with 1 Br-atom",
    '-77.9105':"Exchange 1 Br- with 1 H-atom",
    # Mass given for an isotopologue peak instead of the monoisotopic peak
    '1.0039': 'Mass calcd for [M+1] (1x 13C)',
    '1.0038': 'Mass calcd for [M+1] (1x 13C)',
    '1.0034': 'Mass calcd for [M+1] (1x 13C)',
    '1.0033': 'Mass calcd for [M+1] (1x 13C)',
    '1.0032': 'Mass calcd for [M+1] (1x 13C)',
    '2.0064': 'Mass calcd for [M+2] (2x 13C)',

}
|
| 754 |
+
|
| 755 |
+
# Generate the error dictionary
# Symbols passed to generate_error_dictionary. Entries with a single capital
# letter (elements, and 'D') are registered for every count in atom_counts;
# entries with more than one capital letter (e.g. 'CH3', 'H2O') are treated
# as groups and registered for a count of 1 only.
elements = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
    'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr',
    'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
    'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
    'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
    'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
    'Tl', 'Pb', 'Bi','D','CH','CH2','CH3','CH4','NH','NH2','NH3','NH4',
    'OH','H2O','H3O','NO','NO2','OCH3','CF3','C2H5','C2H6','HF','HCl',
    'HBr','HS','HI','C3H8','C4H10'

]

atom_counts = range(1, 11)  # 1 to 10
# Built once at import time and shared as a module-level lookup table
error_dictionary = generate_error_dictionary(elements, atom_counts, special_errors)
|
| 773 |
+
|
| 774 |
+
def categorize_error(error_value, known_errors, tolerance=0.0001):
    """
    Categorizes a mass error by looking it up in a dictionary of known mass differences.

    A key matches when the magnitude of the error lies within `tolerance` of
    the key or of the key + 0.0001. Lookup order:
    1. For a negative error, negative keys are tried first (compared by
       magnitude). This makes sign-specific entries such as -17.9906
       ("Exchange 1 F- with 1 H-atom") reachable.
    2. Otherwise, or if no negative key matched, the magnitude of the error is
       compared with every key in dictionary order.

    For a match, descriptions longer than 13 characters, or not of the form
    "<int> <name>", are returned unchanged. Short "<count> <name>" descriptions
    are turned into "Add <count> <name> to formula" (positive error) or
    "Remove <count> <name> from formula" (negative error).

    Parameters:
        error_value (float): The calculated error between the calculated and recalculated mass.
        known_errors (dict): A dictionary where keys are mass differences and values are the descriptions.
        tolerance (float): The tolerance range within which the error value should match a known difference.

    Returns:
        str: "" if the error is within the tolerance of zero, the message for
        the first matching entry, or the signed error formatted with four
        decimals (e.g. "+0.5000") if nothing matches.
    """
    # An error that is effectively zero needs no description
    if abs(error_value) <= tolerance:
        return ""

    magnitude = abs(error_value)

    def _within_window(reference):
        # Match the reference itself or the reference plus 0.0001
        return (abs(magnitude - reference) <= tolerance or
                abs(magnitude - (reference + 0.0001)) <= tolerance)

    def _message(description):
        if len(description) > 13:
            # Long entries are ready-made messages
            return description

        # Short entries are expected to look like "<count> <name>"
        parts = description.split()
        if len(parts) != 2:
            return description

        count_str, element = parts
        try:
            count = int(count_str)
        except ValueError:
            return description

        # The sign of the error decides the direction of the correction
        if error_value > 0:
            return f"Add {count} {element} to formula"
        return f"Remove {count} {element} from formula"

    # Sign-specific entries: negative keys only apply to negative errors
    if error_value < 0:
        for known_mass, description in known_errors.items():
            if known_mass < 0 and _within_window(-known_mass):
                return _message(description)

    # Sign-agnostic lookup by magnitude
    for known_mass, description in known_errors.items():
        if _within_window(known_mass):
            return _message(description)

    # If no match found, return the error value as a string with the correct sign
    return f"{error_value:+.4f}"
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
def hrms_cleanup(result, error_dictionary):
    """
    Parses HRMS data strings into table rows and describes suspicious calcd masses.

    For every line in `result` the function
    1. drops all whitespace-separated words shorter than 5 characters that do
       not contain a capital 'M',
    2. extracts the first ion notation (e.g. "[M+H]+") together with its
       charge and removes it from the line,
    3. extracts the first molecular formula, wraps it as "[formula]charge" and
       recalculates its monoisotopic mass via `Formula(...).monoisotopic_mass`
       (divided by the charge number if the ion notation carries a charge),
    4. takes the first and the second number with four decimals as calcd and
       found mass,
    5. derives an error description from the difference calcd - recalcd.

    Parameters:
    - result (list of str): The list containing HRMS data strings.
    - error_dictionary (dict): The autogenerated error dictionary with mass differences and descriptions.

    Returns:
    - list of list: One 8-element row per kept line:
        [0] formula in brackets with charge, [1] ion notation, [2] calcd mass,
        [3] found mass, [4] recalculated mass ('Error' if it could not be
        computed), [5] and [6] left empty here, [7] error description.
      Lines yielding neither a formula nor an ion notation are skipped.

    NOTE(review): the nesting of the checks inside the error section was
    reconstructed from the statement order; confirm it against the original
    indentation.
    """

    # Rows collected for the output table
    parsed_results = []

    # Ion notation: '[' + optional digits + 'M' + anything up to ']' + all
    # non-space characters attached directly behind it (e.g. the charge)
    ion_pattern = re.compile(r'\[\d*M[^]]*]\S*')

    # Formula: 'C<digits>', then 'H', 'F' or 'D' with digits, then any number
    # of element tokens or bracketed isotope tokens (e.g. "[13C]"), and an
    # optional trailing '+' or '-'
    formula_pattern = re.compile(r'C\d+(?:H\d+|F\d+|D\d+)(?:[A-Z][a-z]?\d*|\[\d+[A-Z][a-z]*\d*])*[+-]?')

    # Numbers with four digits after the decimal point
    float_pattern = re.compile(r'\d+\.\d{4}')

    # Process each line in the result list
    for line in result:
        # Remove words shorter than 5 characters that do not contain a capital 'M'
        words = line.split()
        words_filtered = [word for word in words if len(word) >= 5 or ('M' in word)]
        line = ' '.join(words_filtered)

        # Initialize a row with 8 empty elements (see the docstring for the columns)
        row = [''] * 8

        # Extract the ion notation and its charge
        ion_match = ion_pattern.search(line)
        ion_charge = ''
        if ion_match:
            ion = ion_match.group(0)
            row[1] = ion.strip()
            # Extract the charge from the end of the ion notation (e.g., ]+, ]-, ]2+).
            # The group is optional, so the search always succeeds and
            # group(1) is None when the notation does not end with a charge.
            ion_charge_match = re.search(r'(\d*[+-])?$', ion)
            if ion_charge_match:
                ion_charge = ion_charge_match.group(1)
            # Remove every occurrence of the ion notation from the line
            line = line.replace(ion, '')
        else:
            row[1] = ''

        # Now proceed to extract the formula, calcd mass, and found mass from the modified line

        # Extract the formula
        formula_match = formula_pattern.search(line)

        if formula_match:
            formula = formula_match.group(0).strip()
            # If the formula ends with ion_charge, remove ion_charge from formula
            if ion_charge and formula.endswith(ion_charge):
                formula = formula[:-len(ion_charge)].strip()
            # Check if there's a charge present in the formula
            charge_match = re.search(r'([+-]\d*)$', formula)
            if charge_match:
                charge = charge_match.group(1)
                formula_no_charge = formula.replace(charge, "")
            else:
                # Without an explicit charge fall back to the ion's charge, else to '+'
                charge = ion_charge if ion_charge else '+'
                formula_no_charge = formula

            # Enclose the formula in square brackets before recalculating the mass

            formula_in_brackets = f'[{formula_no_charge}]{charge}'
            # NOTE(review): presumably these undo placeholder spellings
            # produced by an earlier replacement step - confirm against the caller
            formula_in_brackets = formula_in_brackets.replace("H1HeXe", "[13C]")
            formula_in_brackets = formula_in_brackets.replace("C1F", "CF")
            formula_in_brackets = formula_in_brackets.replace("H1N", "HN")
            row[0] = formula_in_brackets

            # Recalculate the monoisotopic mass using molmass while keeping isotopic notation intact
            try:
                recalculated_mass = Formula(formula_in_brackets).monoisotopic_mass
                if ion_charge:
                    if ion_charge in ("+", "-"):
                        charge_number = 1
                    else:
                        charge_number = int(ion_charge[:-1])  # Extract the numeric part of the charge
                    # Divide the mass by the number of charges
                    recalculated_mass /= abs(charge_number)

                row[4] = f'{recalculated_mass:.4f}'  # Store the monoisotopic mass with 4 decimal precision
            except Exception as e:
                row[4] = 'Error'  # Handle the case where the formula is invalid for molmass
        else:
            row[0] = ''
            row[4] = ''

        # Extract all numbers with four decimal places
        floats_with_4_decimals = float_pattern.findall(line)

        # Extract the calcd mass - first occurring float with 4 decimal places
        if floats_with_4_decimals:
            calcd_mass = floats_with_4_decimals[0]
            row[2] = calcd_mass.strip()
        else:
            row[2] = ''

        # Extract the found mass - second float with 4 decimal places, if it exists
        if len(floats_with_4_decimals) >= 2:
            found_mass = floats_with_4_decimals[1]
            row[3] = found_mass.strip()
        else:
            row[3] = ''

        # Calculate the error between the calculated mass and the recalculated mass
        if row[2] and row[4] and row[2] != 'Error' and row[4] != 'Error':
            try:
                error = float(row[2]) - float(row[4])
                # Categorize the error based on the error value
                error_description = categorize_error(error, error_dictionary)
                # Run the additional heuristics only if no known error was
                # recognized (the description is still a plain number) or the
                # difference is exactly zero
                if is_float(error_description) or error==0:

                    # NOTE(review): float(row[3]) raises ValueError when no
                    # found mass was extracted; the except below then stores
                    # 'Error' for the row - confirm this is intended
                    if differ_in_single_digit_except_last_two(float(row[2]), float(row[3])):
                        error_description = "Typo (Calcd,Found)"

                    if differ_in_single_digit_except_last_two(float(row[2]), float(row[4])):
                        error_description = "Typo (Calcd,Recalcd)"

                    if have_swapped_adjacent_digits(float(row[2]), float(row[3])):
                        error_description = "Transposed digits (Calcd,Found)"

                    if have_swapped_adjacent_digits(float(row[2]), float(row[4])):
                        error_description = "Transposed digits (Calcd,Recalcd)"

                    # A deficit of about two electron masses on an anion
                    if error_description in ("-0.0010", "-0.0011", "-0.0012") and ion_charge == "-":
                        error_description = "Mass was calculated for cation"

                    if error_description in ("-0.0010", "-0.0011", "-0.0012") and "M-" in row[1]:
                        error_description = "Mass was calculated for cation"

                    # Compare the calcd mass with the value returned by
                    # calculate_molecular_weight for several formula variants
                    mw_plus = round(calculate_molecular_weight(row[0]), 4)
                    if float(row[2]) == mw_plus:
                        error_description = "Molecular weight error"

                    mw_plus_plus1 = round(mw_plus + 1, 4)
                    if float(row[2]) == mw_plus_plus1:
                        error_description = "Molecular weight error"

                    mw_plus_plus23 = round(mw_plus + 23, 4)
                    if float(row[2]) == mw_plus_plus23:
                        error_description = "Molecular weight error"

                    # Same comparison with every '+' removed from the formula string
                    formula_neutral = row[0].replace("+", "")
                    mw_neutral = round(calculate_molecular_weight(formula_neutral), 4)
                    if mw_neutral == float(row[2]):
                        error_description = "Molecular weight error (neutral)"

                    mw_neutral_plus1 = round(mw_neutral + 1, 4)
                    if mw_neutral_plus1 == float(row[2]):
                        error_description = "Molecular weight error (neutral+1)"

                    mw_neutral_plus23 = round(mw_neutral + 23, 4)
                    if mw_neutral_plus23 == float(row[2]):
                        error_description = "Molecular weight error (neutral+23)"

                    if "Na" in row[0]:
                        # Formula contains Na: weight without Na plus a nominal 23
                        formula_minus_sodium = row[0].replace("Na", "")
                        mw1 = round(calculate_molecular_weight(formula_minus_sodium), 4) + 23
                        if mw1 == float(row[2]):
                            error_description = "Molecular weight + 23.0000"
                    else:
                        # Formula without Na: weight of the bare formula plus Na
                        formula_plus_sodium = row[0].replace("[", "").replace("]", "").replace("+", "").replace("-", "")
                        formula_plus_sodium = formula_plus_sodium+"Na"
                        mw_plus_sodium = round(calculate_molecular_weight(formula_plus_sodium), 4)
                        if mw_plus_sodium == float(row[2]):
                            error_description = "Molecular weight error (Formula+Na)"

                    # Weight of the formula after decrease_element_count(..., 'H'), plus a nominal 1
                    formula_minus_h = row[0].replace("[", "").replace("]", "").replace("+", "").replace("-", "")
                    formula_minus_h = decrease_element_count(formula_minus_h, 'H')
                    mw2 = round(calculate_molecular_weight(formula_minus_h), 4) + 1
                    if mw2 == float(row[2]):
                        error_description = "Molecular weight + 1.0000"

                row[7] = error_description  # Error description, or the signed difference if nothing matched

            except ValueError:
                row[7] = 'Error'
        else:
            row[7] = 'Error'

        # An ion and both masses were found, but no formula
        if row[1] and row[2] and row[3] and not row[0]:
            row[7] = 'No formula found'

        # Skip the row if both row[0] and row[1] are empty
        if not row[0] and not row[1]:
            continue  # Do not append this row to parsed_results

        # Append the row to the parsed_results list
        parsed_results.append(row)

    return parsed_results
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
def calc_dev_calcd_and_recalcd(cleaned_results):
    """
    Fills the 'Dev (Calcd)' and 'Dev (Recalcd)' columns of every row.

    The deviation is the absolute difference between the found mass (row[3])
    and the reference mass, relative to the reference mass, in ppm, formatted
    with one decimal. row[5] uses the calcd mass (row[2]) as reference, row[6]
    the recalculated mass (row[4]). A column is set to '' when the found mass
    or the reference is missing, is not a number (e.g. 'Error'), or when the
    reference is zero.

    Parameters:
        cleaned_results (list of list): The list containing extracted data.

    Returns:
        list of list: The same list object, with its rows updated in place.
    """

    def _ppm_deviation(found_value, reference):
        # Deviation of found_value from reference in ppm, or '' if not computable
        if not reference or found_value is None:
            return ''
        try:
            reference_value = float(reference)
            deviation = abs((found_value - reference_value) / reference_value) * 1e6
        except (ValueError, ZeroDivisionError):
            # Non-numeric text such as 'Error', or a reference of 0
            return ''
        return f"{deviation:.1f}"

    for row in cleaned_results:
        found_mass = row[3]

        # Convert the found mass once; None marks "missing or not a number"
        found_mass_float = None
        if found_mass:
            try:
                found_mass_float = float(found_mass)
            except ValueError:
                found_mass_float = None

        row[5] = _ppm_deviation(found_mass_float, row[2])
        row[6] = _ppm_deviation(found_mass_float, row[4])

    return cleaned_results
|
| 1078 |
+
|
| 1079 |
+
|
| 1080 |
+
def print_aligned_table(cleaned_results):
    """
    Displays the cleaned_results as an HTML table in Streamlit.

    Columns 2-7 are right-aligned, columns 0-1 left-aligned. Deviation cells
    (columns 5 and 6) with a value greater than 10 are shown in red; error
    cells (column 7) whose text is not a plain signed number are shown in
    purple. Cell text is HTML-escaped before it is inserted, because the table
    is rendered with unsafe_allow_html=True and the cells are derived from
    parsed input text.

    Parameters:
        cleaned_results (list of list): Rows to display, one cell per column.
    """
    # Local import so that this function does not depend on the file's import block
    from html import escape

    headers = ['Formula', 'Ion', 'Calcd Mass', 'Found Mass', 'Recalcd Mass',
               'Dev (Calcd)', 'Dev (Recalcd)', 'Error']

    # Collect the fragments and join once at the end
    # Table with inline CSS for borders, padding, and monospaced font
    fragments = ['<table style="border-collapse: collapse; width: 100%; font-family: monospace;">']

    # Create header row
    fragments.append('<tr>')
    for header in headers:
        fragments.append(
            f'<th style="border: 1px solid black; padding: 4px; text-align: left;">{header}</th>'
        )
    fragments.append('</tr>')

    # Create data rows
    for row in cleaned_results:
        fragments.append('<tr>')
        for i, cell in enumerate(row):
            # Determine text alignment: right align for numeric columns
            align = 'right' if i in (2, 3, 4, 5, 6, 7) else 'left'
            style = f"text-align: {align}; border: 1px solid black; padding: 4px;"
            cell_str = str(cell)

            if i in (5, 6):
                # For deviation columns, apply red color if deviation > 10
                try:
                    if float(cell) > 10:
                        style += " color: red;"
                except (ValueError, TypeError):
                    # Empty or non-numeric cell: keep the default color
                    pass
            elif i == 7:
                # For the error column, apply purple if the cell doesn't represent a number
                if isinstance(cell, str) and not re.match(r'^[+-]?\d*\.?\d+$', cell_str):
                    style += " color: purple;"

            fragments.append(f'<td style="{style}">{escape(cell_str)}</td>')
        fragments.append('</tr>')

    fragments.append('</table>')

    # Display the table in Streamlit
    st.markdown(''.join(fragments), unsafe_allow_html=True)
|
| 1125 |
+
|
| 1126 |
+
|
| 1127 |
+
def search_calcd_with_floats(text: str) -> List[str]:
    """
    Collect text snippets around 'calcd' that contain two four-decimal numbers.

    For every case-insensitive occurrence of 'calcd', the 100 characters
    starting at the keyword must contain at least two numbers with four
    decimals. The snippet ends at the end of the second number. It starts up
    to 25 characters before the keyword, unless those characters already
    contain such a number, in which case it starts at the keyword itself.
    Snippets of 100 characters or more are discarded.

    Args:
        text (str): Input text to search

    Returns:
        List[str]: The snippets, in order of appearance
    """
    four_decimal_number = re.compile(r'\d+\.\d{4}')
    snippets = []

    for keyword in re.finditer('calcd', text, re.IGNORECASE):
        anchor = keyword.start()

        # Decide where the snippet begins: include the look-behind window
        # only if it holds no four-decimal number
        lookbehind_start = max(0, anchor - 25)
        lookbehind = text[lookbehind_start:anchor]
        if four_decimal_number.search(lookbehind) is None:
            begin = lookbehind_start
        else:
            begin = anchor

        # End offsets (relative to the keyword) of the numbers that follow
        number_ends = [m.end() for m in four_decimal_number.finditer(text[anchor:anchor + 100])]
        if len(number_ends) < 2:
            continue

        finish = anchor + number_ends[1]
        if finish - begin < 100:
            snippets.append(text[begin:finish])

    return snippets
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
def search_hrms_with_floats(text: str) -> List[str]:
    """
    Extract snippets that start at 'HRMS' and contain two four-decimal floats.

    For every (case-sensitive) occurrence of 'HRMS', the following 100
    characters are scanned for numbers written as digits, a dot and four
    decimal digits (longer decimals match on their first four digits).
    When at least two such numbers are found, one snippet starting at
    'HRMS' is produced:

    * if 'calcd' (any case) occurs within the 25 characters after the
      second float, the snippet ends right after that float;
    * otherwise up to 25 characters after the second float are included.

    A snippet never reaches more than 100 characters past the start of
    'HRMS', and surrounding whitespace is stripped.

    Args:
        text (str): Input text to search

    Returns:
        List[str]: One snippet per qualifying 'HRMS' occurrence, in text order
    """
    window = 100  # maximum snippet length, measured from the 'H' of 'HRMS'
    tail = 25     # trailing context kept / inspected after the second float
    float_re = re.compile(r'\d+\.\d{4}')
    snippets = []

    for hrms_match in re.finditer('HRMS', text):
        start = hrms_match.start()
        floats = list(float_re.finditer(text[start:start + window]))
        if len(floats) < 2:
            continue

        second_float_end = start + floats[1].end()

        # Look ahead in the full text rather than in the truncated window:
        # a 'calcd' that straddles the 100-character boundary must still be
        # recognised, otherwise its first letters end up inside this snippet.
        lookahead = text[second_float_end:second_float_end + tail]
        if 'calcd' in lookahead.lower():
            end = second_float_end
        else:
            end = second_float_end + tail

        # Never run past the text or past the 100-character window.
        end = min(len(text), end, start + window)
        snippets.append(text[start:end].strip())

    return snippets
|
| 1215 |
+
|
| 1216 |
+
def process_replacements(text: str) -> str:
    """
    Apply the fixed table of clean-up substitutions, then collapse whitespace.

    Every key of the table is used as a regular-expression pattern with
    ``re.IGNORECASE`` and the entries are applied in table order, so earlier
    substitutions are visible to later ones (e.g. 'HRESIMS' is rewritten
    before the shorter 'HRESI'). Because the keys are regex patterns, the
    dots in 'calcd.' and 'calc. ' match any character, not only a full stop.

    Args:
        text (str): Text to normalise

    Returns:
        str: The rewritten text with every whitespace run reduced to a single
        space and no leading/trailing whitespace
    """
    replacements = {
        r' is ': ' ',
        r'LCMS': 'HRMS',
        r'HRESIMS': 'HRMS',
        r'HRESI': 'HRMS',
        r'HR-MS': 'HRMS',
        r'ESI-MS': ' HRMS',
        # NOTE(review): the stored file had two mis-encoded keys mapped to '-'
        # here (UTF-8 bytes shown in a Greek code page). Their exact code
        # points are not recoverable; the Unicode hyphen/dash family and the
        # minus sign are normalised instead - confirm against the upstream file.
        '[\u2010-\u2015\u2212]': '-',
        r'MHz': '',
        r'MeOD': '',
        r'Cal': 'cal',
        r'calculated': 'calcd ',
        r'calcd.': 'calcd ',
        r'calc. ': 'calcd ',
        r'calc ': 'calcd ',
        r'chemical': '',
        r'formula': '',
        # NOTE(review): key was a space followed by an unrecoverable
        # mis-encoded character mapped to '+'; plus-like symbols whose UTF-8
        # encoding fits the garbled bytes are assumed - TODO confirm.
        ' [\u208A\u2214\u2295\u229E\u2795]': '+',
        # NOTE(review): presumably the bullet (U+2022); the trade-mark sign
        # (U+2122) fits the garbled bytes equally well - TODO confirm.
        '[\u2022\u2122]': '',
        # NOTE(review): a Latin ligature (U+FB00-U+FB06); which one is not
        # recoverable from the garbled key - TODO confirm.
        '[\uFB00-\uFB06]': '',
        # NOTE(review): two further unrecoverable keys were deleted from the
        # text here; typographic quotation marks are assumed - TODO confirm.
        '[\u2018-\u201D]': '',
        '\uFF0B': '+',  # fullwidth plus sign
        r'Observed': ' ',
        r'observed': ' ',
    }

    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # NOTE(review): indentation was lost in the stored file; the whitespace
    # collapse is applied once after all substitutions - confirm upstream.
    return ' '.join(text.split())
|
| 1253 |
+
|
| 1254 |
+
|
| 1255 |
+
|
| 1256 |
+
def main():
    """
    Build the Streamlit page and dispatch the chosen input to analyze_content.

    The page offers two tabs: one that accepts a PDF upload, whose text is
    obtained through extract_text_from_pdf, and one with a free-text area.
    Each tab has its own button; pressing it runs analyze_content on the
    corresponding text.
    """
    # The page icon is the test-tube emoji (U+1F9EA); it was stored mis-encoded.
    st.set_page_config(page_title="Chemistry Text Analyzer", page_icon="\U0001F9EA", layout="wide")

    st.title("Chemistry Text Analyzer")
    st.write("""
This app analyzes chemistry text for common errors, inconsistencies, and formatting issues.
Upload a PDF file or paste your text in the box below to analyze it.
""")

    # Create tabs for different input methods
    tab1, tab2 = st.tabs(["Upload PDF", "Text Input"])

    with tab1:
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        analyze_pdf = st.button("Analyze PDF")

        if analyze_pdf and uploaded_file is not None:
            with st.spinner("Extracting text from PDF..."):
                text_content = extract_text_from_pdf(uploaded_file)

            if text_content:
                st.success(f"Successfully extracted text from {uploaded_file.name}")
                st.write("---")
                analyze_content(text_content)
            else:
                st.error("Failed to extract text from the PDF. Please check if the PDF contains extractable text.")

    with tab2:
        # Text input area
        text_input = st.text_area("Paste your text here:", height=300)
        analyze_text = st.button("Analyze Text")

        if analyze_text:
            if not text_input:
                st.warning("Please paste some text to analyze.")
            else:
                st.write("---")
                # Replace newlines with spaces to match the original behavior
                text_content = text_input.replace('\n', ' ')
                analyze_content(text_content)
|
| 1296 |
+
|
| 1297 |
+
|
| 1298 |
+
import streamlit as st
|
| 1299 |
+
import re
|
| 1300 |
+
|
| 1301 |
+
|
| 1302 |
+
def analyze_content(text_content):
    """
    Normalise chemistry text, extract HRMS/calcd snippets and display them.

    The text is passed through a fixed sequence of clean-up helpers and
    regex rewrites (the order matters, later steps rely on earlier ones).
    Snippets are then collected with search_hrms_with_floats and, after
    those snippets are removed from the text, with search_calcd_with_floats.
    The combined snippets go through hrms_cleanup, calc_dev_calcd_and_recalcd
    and remove_sublists_with_missing_element1_positions_swapped, duplicate
    rows are dropped, and any remaining rows are rendered with
    print_aligned_table. A success message is shown when check_conditions
    returns a truthy value.

    Args:
        text_content: Raw text to analyse (pasted text or text extracted
            from a PDF).

    Returns:
        None. Output is written to the Streamlit page.
    """
    def squeeze(match):
        # Every group in the formula patterns below always participates in
        # the match, so joining them simply drops the whitespace in between.
        return ''.join(match.groups())

    # --- General clean-up ---------------------------------------------------
    text_content = remove_specific_lines_from_string(text_content)
    text_content = re.sub(r'\s+', ' ', text_content).strip()  # single spaces only
    text_content = process_replacements(text_content)
    text_content = replace_comma_with_decimal(text_content)
    text_content = adjust_space_around_decimal(text_content)
    text_content = fix_floats(text_content)
    text_content = remove_page_numbers(text_content)

    # --- Formula and adduct normalisation -----------------------------------
    # NOTE(review): '\1' is the whole "formula, adduct" group; '\2' (formula
    # only) may have been intended - behaviour left unchanged, please confirm.
    text_content = re.sub(r'\[((C\d+(?:[A-Z][a-z]?\d*)*),\s*([M+][^]]+))', r'\1 [\3]', text_content)
    # Upper-case the element symbols of C<n>H<m>. The stored file ran this
    # substitution twice with patterns that are identical under IGNORECASE;
    # the second pass could not change anything and was removed.
    text_content = re.sub(r'(C)(\d+)(h)(\d+)', lambda m: f'C{m.group(2)}H{m.group(4)}', text_content,
                          flags=re.IGNORECASE)
    text_content = re.sub(r'\b(C)(\d+)(HD)\b', r'C\2H1D', text_content)
    text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(N)\s*(\d*)\b', squeeze, text_content)
    text_content = re.sub(r'\b(C)\s*(\d*)\s*(H)\s*(\d*)\s*(O)\s*(\d*)\b', squeeze, text_content)
    text_content = text_content.replace("C2o", "C20").replace("C1o", "C10").replace("Cal", "cal")
    text_content = re.sub(r'B(\d+)H(\d+)', r'H\2B\1', text_content)
    text_content = text_content.replace('\n', ' ').replace('+-', '+').replace(':', " ")
    # NOTE(review): the stored file replaced one mis-encoded dash-like
    # character with '-' here; its exact code point is not recoverable, so
    # the Unicode hyphen/dash family and the minus sign are normalised.
    text_content = re.sub('[\u2010-\u2015\u2212]', '-', text_content)
    text_content = text_content.replace(',', " ")
    text_content = remove_spaces_within_brackets(text_content)
    # Remove nested brackets from [(M+H]]+ etc.
    text_content = re.sub(r'\(\[([^]]{1,10})]\+\)', r'[\1]+', text_content)
    text_content = re.sub(r'\[\[([^]]{1,10})]\+]', r'[\1]+', text_content)
    text_content = text_content.replace(' [[', '[').replace(']]', ']')

    replacements = {
        # Subscript digits
        "\u2081": "1", "\u2082": "2", "\u2083": "3", "\u2084": "4", "\u2085": "5",
        "\u2086": "6", "\u2087": "7", "\u2088": "8", "\u2089": "9", "\u2080": "0",
        # Superscript digits
        "\u00B9": "1", "\u00B2": "2", "\u00B3": "3", "\u2074": "4", "\u2075": "5",
        "\u2076": "6", "\u2077": "7", "\u2078": "8", "\u2079": "9", "\u2070": "0",
        # Cyrillic look-alikes of the Latin letters C and H
        "\u0421": "C", "\u041D": "H",
        # Element symbols separated from their neighbours by spaces
        "C ": "C", " H ": "H", " F ": "F", " N ": "N", " Cl ": "Cl",
        " Br ": "Br", " O ": "O", " I ": "I", " P ": "P", " B ": "B",
        " S ": "S", " NO ": "NO", " Na ": "Na", " SNa ": "SNa", " NNa ": "NNa",
        " + ": "+",
    }

    # Apply replacements and additional processing steps.
    for original, replacement in replacements.items():
        text_content = text_content.replace(original, replacement)
    text_content = remove_spaces_in_formula(text_content)
    text_content = text_content.replace('#', '')
    text_content = re.sub(r'(C\d+)', r' \1', text_content)
    text_content = transform_expressions_in_text(text_content)
    text_content = isotope_correct(text_content)
    text_content = protect_floats(text_content)
    # presumably placeholder tokens understood by the later parsing helpers
    # - TODO confirm
    text_content = text_content.replace("[13C]", "H1HeXe")
    text_content = text_content.replace("CF", "C1F")
    text_content = text_content.replace("HN", "H1N")

    # --- Snippet extraction -------------------------------------------------
    results1 = search_hrms_with_floats(text_content)
    modified_text = text_content
    for match in results1:
        modified_text = modified_text.replace(match, '')
    # Clean up any extra spaces
    # NOTE(review): indentation was lost in the stored file; the clean-up is
    # assumed to run once after the loop - confirm upstream.
    modified_text = re.sub(r'\s+', ' ', modified_text).strip()
    text_content = modified_text
    results2 = search_calcd_with_floats(text_content)

    results = results1 + results2
    cleaned_results = hrms_cleanup(results, error_dictionary)

    cleaned_results = calc_dev_calcd_and_recalcd(cleaned_results)
    cleaned_results = remove_sublists_with_missing_element1_positions_swapped(cleaned_results)

    # Remove duplicate sublists (order-preserving; a membership test is used,
    # which only needs the rows to support equality)
    cleaned_results_new = []
    for sublist in cleaned_results:
        if sublist not in cleaned_results_new:
            cleaned_results_new.append(sublist)
    cleaned_results = cleaned_results_new

    # --- Output -------------------------------------------------------------
    if cleaned_results:
        st.write(" ")
        # Use the Streamlit version of print_aligned_table to display the table
        print_aligned_table(cleaned_results)
        # NOTE(review): assumed to be nested under the non-empty check;
        # indentation was lost in the stored file - confirm upstream.
        if check_conditions(cleaned_results):
            st.success("Awesome! No mistakes!")
|
| 1396 |
+
|
| 1397 |
+
|
| 1398 |
+
# Run the Streamlit app only when this file is executed as a script.
if __name__ == '__main__':
    main()
|