diff --git "a/notebooks/peak_embedding_UMAP.ipynb" "b/notebooks/peak_embedding_UMAP.ipynb" deleted file mode 100644--- "a/notebooks/peak_embedding_UMAP.ipynb" +++ /dev/null @@ -1,1333 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8d4475ae", - "metadata": {}, - "source": [ - "Visualize peak embeddings\n", - "\n", - "peaks of the same formula same substructure should be clustered closer together" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75410b71", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b906b7f7", - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(\"/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv\", sep=\"\\t\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d83156c1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
| \n", - " | identifier | \n", - "mzs | \n", - "intensities | \n", - "smiles | \n", - "inchikey | \n", - "formula | \n", - "precursor_formula | \n", - "parent_mass | \n", - "precursor_mz | \n", - "adduct | \n", - "instrument_type | \n", - "collision_energy | \n", - "fold | \n", - "simulation_challenge | \n", - "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", - "MassSpecGymID0000001 | \n", - "91.0542,125.0233,154.0499,155.0577,185.0961,20... | \n", - "0.24524524524524524,1.0,0.08008008008008008,0.... | \n", - "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", - "VFMQMACUYWGDOJ | \n", - "C16H17NO4 | \n", - "C16H18NO4 | \n", - "287.115224 | \n", - "288.1225 | \n", - "[M+H]+ | \n", - "Orbitrap | \n", - "30.0 | \n", - "train | \n", - "True | \n", - "
| 1 | \n", - "MassSpecGymID0000002 | \n", - "91.0542,125.0233,155.0577,185.0961,229.0859,24... | \n", - "0.0990990990990991,0.28128128128128127,0.04004... | \n", - "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", - "VFMQMACUYWGDOJ | \n", - "C16H17NO4 | \n", - "C16H18NO4 | \n", - "287.115224 | \n", - "288.1225 | \n", - "[M+H]+ | \n", - "Orbitrap | \n", - "20.0 | \n", - "train | \n", - "True | \n", - "
| 2 | \n", - "MassSpecGymID0000003 | \n", - "69.0343,91.0542,125.0233,127.039,153.0699,154.... | \n", - "0.03403403403403404,0.31431431431431434,1.0,0.... | \n", - "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", - "VFMQMACUYWGDOJ | \n", - "C16H17NO4 | \n", - "C16H18NO4 | \n", - "287.115224 | \n", - "288.1225 | \n", - "[M+H]+ | \n", - "Orbitrap | \n", - "40.0 | \n", - "train | \n", - "True | \n", - "
| 3 | \n", - "MassSpecGymID0000004 | \n", - "69.0343,91.0542,110.06,111.0441,112.0393,120.0... | \n", - "0.17917917917917917,0.47347347347347346,0.0380... | \n", - "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", - "VFMQMACUYWGDOJ | \n", - "C16H17NO4 | \n", - "C16H18NO4 | \n", - "287.115224 | \n", - "288.1225 | \n", - "[M+H]+ | \n", - "Orbitrap | \n", - "55.0 | \n", - "train | \n", - "True | \n", - "
| 4 | \n", - "MassSpecGymID0000005 | \n", - "91.0542,125.0233,185.0961,229.0859,246.1125,28... | \n", - "0.07807807807807808,0.1841841841841842,0.03503... | \n", - "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", - "VFMQMACUYWGDOJ | \n", - "C16H17NO4 | \n", - "C16H18NO4 | \n", - "287.115224 | \n", - "288.1225 | \n", - "[M+H]+ | \n", - "Orbitrap | \n", - "10.0 | \n", - "train | \n", - "True | \n", - "
| \n", - " | Subformula | \n", - "Unique_SMILES | \n", - "Spectra_Indices | \n", - "n_smiles | \n", - "has_nitrogen | \n", - "has_oxygen | \n", - "
|---|---|---|---|---|---|---|
| 1046 | \n", - "C9H7NO | \n", - "[COC1=CC(=CC(=C1)NC(=O)CN2C3=CC=CC=C3C(=CC2=O)... | \n", - "[392, 396, 671, 674, 675, 677, 901, 905, 910, ... | \n", - "2345 | \n", - "True | \n", - "True | \n", - "
| 1052 | \n", - "C10H9NO | \n", - "[C1OC2=CC3=CC(=C(N=C3C=C2O1)N)C(=O)NC4=CC=CC=C... | \n", - "[392, 393, 394, 396, 397, 398, 901, 902, 904, ... | \n", - "1788 | \n", - "True | \n", - "True | \n", - "
| 2587 | \n", - "C8H7NO | \n", - "[C1CCOC(C1)CN2C=C(C=N2)NC(=O)C3=CC4=C(C=C3)NC=... | \n", - "[1696, 1698, 1699, 1704, 1705, 1706, 1707, 170... | \n", - "1706 | \n", - "True | \n", - "True | \n", - "
| 1159 | \n", - "C5H5NO | \n", - "[C1OC2=CC3=CC(=C(N=C3C=C2O1)N)C(=O)NC4=CC=CC=C... | \n", - "[421, 422, 424, 425, 1047, 1721, 1725, 1727, 1... | \n", - "1450 | \n", - "True | \n", - "True | \n", - "
| 2787 | \n", - "C4H5NO | \n", - "[C1=CC=C(C=C1)C[C@@H](C(=O)N[C@@H](CCC(=O)N)C(... | \n", - "[1779, 1780, 1787, 1789, 1790, 1791, 1793, 179... | \n", - "1380 | \n", - "True | \n", - "True | \n", - "
| ... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
| 2174 | \n", - "C10H18N2O | \n", - "[CC(C)[C@H](C(=O)NCC1=CC=C(C=C1)OC)NC(=O)C2CCN... | \n", - "[1171, 1179, 51943, 51954, 62242, 82752, 82754... | \n", - "100 | \n", - "True | \n", - "True | \n", - "
| 6157 | \n", - "C6H8NO2 | \n", - "[CC1=CC(=O)OC2=C1C=CC(=C2)NC(=O)[C@@H]3CCCN3C(... | \n", - "[4680, 6057, 7397, 8570, 9357, 21802, 23017, 2... | \n", - "100 | \n", - "True | \n", - "True | \n", - "
| 4045 | \n", - "C3H7N2O2 | \n", - "[CC(C)CS(=O)(=O)NCC(C1=COC=C1)N2CCSCC2, CCC1CN... | \n", - "[2989, 7004, 9280, 9295, 9298, 20159, 35354, 3... | \n", - "100 | \n", - "True | \n", - "True | \n", - "
| 11009 | \n", - "C15H10N2O2 | \n", - "[CC(C)N1C=C(C=N1)NC(=O)CC2=CC=C(C=C2)OC3=NC=NC... | \n", - "[10980, 10981, 10982, 10987, 36081, 36087, 360... | \n", - "100 | \n", - "True | \n", - "True | \n", - "
| 10951 | \n", - "C9H5N2O | \n", - "[CCOC(=O)C1=CN=C2C=CC(=NC2=C1Br)OC, CC(C)C1=C2... | \n", - "[10908, 10914, 10915, 10956, 10957, 10958, 115... | \n", - "100 | \n", - "True | \n", - "True | \n", - "
547 rows × 6 columns
\n", - "