OliXio commited on
Commit
5946936
·
verified ·
1 Parent(s): d65669c

Upload 5 files

Browse files

Use the new loss function and fix a bug in predict.py.

Files changed (5) hide show
  1. code/dataset.py +4 -1
  2. code/loss.py +77 -0
  3. code/modules.py +8 -5
  4. code/predict.py +130 -139
  5. code/train.py +105 -36
code/dataset.py CHANGED
@@ -44,6 +44,9 @@ class Dataset(torch.utils.data.Dataset):
44
  def __getitem__(self, idx):
45
  item = {}
46
  try:
 
 
 
47
  if 'nls' in self.data[idx]:
48
  nls = self.data[idx]['nls']
49
  else:
@@ -111,7 +114,7 @@ class PathDataset(torch.utils.data.Dataset):
111
  item = calc_feats(smi, ms, nls, self.cfg)
112
 
113
  except Exception as e:
114
- print('='*50, idx, str(e))
115
  return None
116
 
117
  return item
 
44
  def __getitem__(self, idx):
45
  item = {}
46
  try:
47
+ if 'ms_bins' in self.data[idx]:
48
+ return self.data[idx]
49
+
50
  if 'nls' in self.data[idx]:
51
  nls = self.data[idx]['nls']
52
  else:
 
114
  item = calc_feats(smi, ms, nls, self.cfg)
115
 
116
  except Exception as e:
117
+ #print('='*50, idx, str(e))
118
  return None
119
 
120
  return item
code/loss.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ def infoNCE_loss1(mol_features, ms_features, temperature=0.1, norm=True):
6
+ # Normalize features
7
+ if norm:
8
+ mol_features = F.normalize(mol_features, p=2, dim=1)
9
+ ms_features = F.normalize(ms_features, p=2, dim=1)
10
+
11
+ # Compute similarity matrix
12
+ logits = torch.mm(mol_features, ms_features.T) / temperature
13
+
14
+ # Labels: positive pairs are on the diagonal
15
+ batch_size = mol_features.size(0)
16
+ labels = torch.arange(batch_size, device=mol_features.device)
17
+
18
+ # Cross entropy loss
19
+ loss_mol = F.cross_entropy(logits, labels)
20
+ loss_trans = F.cross_entropy(logits.T, labels)
21
+ loss = (loss_mol + loss_trans) / 2
22
+
23
+ return loss
24
+
25
+ def infoNCE_loss2(mol_features, ms_features, temperature=0.1, alpha=0.75, norm=True):
26
+ """
27
+ 使用更合适的temperature (0.07是CLIP中常用的值)
28
+ 添加更多的数值稳定性措施
29
+ """
30
+ if norm:
31
+ mol_features = F.normalize(mol_features, p=2, dim=1)
32
+ ms_features = F.normalize(ms_features, p=2, dim=1)
33
+
34
+ batch_size = mol_features.size(0)
35
+
36
+ # 计算相似度矩阵
37
+ logits_ab = torch.matmul(mol_features, ms_features.T) / temperature
38
+ logits_ba = torch.matmul(ms_features, mol_features.T) / temperature
39
+
40
+ # 创建标签
41
+ labels = torch.arange(batch_size, device=mol_features.device)
42
+
43
+ # 计算损失
44
+ loss_ab = F.cross_entropy(logits_ab, labels)
45
+ loss_ba = F.cross_entropy(logits_ba, labels)
46
+
47
+ return alpha * loss_ab + (1 - alpha) * loss_ba
48
+
49
# Contrastive loss augmented with hard-negative mining.
def contrastive_loss_with_hard_negatives(features1, features2, margin=1.0, hard_negative_ratio=0.3):
    """Contrastive loss with hard-negative mining.

    Positive pairs are the diagonal of the similarity matrix; for each row
    the top-k most similar *non-matching* entries are treated as hard
    negatives and penalized when they exceed ``margin``.

    Args:
        features1: (batch, dim) embeddings (assumed pre-normalized if
            cosine semantics are desired — TODO confirm with callers).
        features2: (batch, dim) embeddings, row-aligned with features1.
        margin: negatives with similarity above this are penalized.
        hard_negative_ratio: fraction of the batch mined as hard negatives.

    Returns:
        Scalar loss tensor (mean over the batch).
    """
    batch_size = features1.shape[0]

    # Full pairwise similarity; positives on the diagonal.
    similarity = torch.matmul(features1, features2.t())
    positive_similarity = torch.diag(similarity)

    # With a single sample there are no negatives to mine.
    if batch_size < 2:
        return (1 - positive_similarity).mean()

    # Off-diagonal entries are the negative candidates.  Build the mask on
    # the same device as the features so GPU inputs work.
    mask = ~torch.eye(batch_size, dtype=torch.bool, device=similarity.device)
    negative_similarities = similarity[mask].view(batch_size, batch_size - 1)

    # Keep the top-k hardest negatives per row.  Clamp k into
    # [1, batch_size - 1]: the original int(batch_size * ratio) could be 0
    # for small batches, making topk return an empty tensor and the mean NaN.
    k = max(1, min(batch_size - 1, int(batch_size * hard_negative_ratio)))
    hard_negatives, _ = torch.topk(negative_similarities, k=k, dim=1)

    # Vectorized form of the original per-sample loop:
    #   pos_loss = 1 - sim(positive);  neg_loss = mean(relu(hard - margin))
    pos_loss = 1 - positive_similarity
    neg_loss = torch.clamp(hard_negatives - margin, min=0).mean(dim=1)

    return (pos_loss + neg_loss).mean()
code/modules.py CHANGED
@@ -8,9 +8,6 @@ import numpy as np
8
  from cliplayers import QuickGELU, Transformer as MSTsfmEncoder
9
  from GNN import layers as gly
10
 
11
- loss_func_ms = nn.CrossEntropyLoss()
12
- loss_func = nn.CrossEntropyLoss()
13
-
14
  class MolGNNEncoder(nn.Module):
15
  def __init__(self,
16
  outdim,
@@ -144,10 +141,16 @@ class FragSimiModel(nn.Module):
144
  ms_embeddings = self.ms_projection(ms_features)
145
  mol_embeddings = self.mol_projection(mol_features)
146
 
 
 
 
 
 
 
147
  # Calculating the Loss
148
  #logits = (mol_embeddings @ ms_embeddings.t())
149
  #logit_scale = self.logit_scale.exp()
150
- logits = mol_embeddings @ ms_embeddings.t()
151
 
152
  ground_truth = torch.arange(ms_features.shape[0], dtype=torch.long, device=self.cfg.device)
153
 
@@ -155,4 +158,4 @@ class FragSimiModel(nn.Module):
155
  mol_loss = loss_func(logits.t(), ground_truth)
156
  loss = (ms_loss + mol_loss) / 2.0 # shape: (batch_size)
157
 
158
- return loss.mean()
 
8
  from cliplayers import QuickGELU, Transformer as MSTsfmEncoder
9
  from GNN import layers as gly
10
 
 
 
 
11
  class MolGNNEncoder(nn.Module):
12
  def __init__(self,
13
  outdim,
 
141
  ms_embeddings = self.ms_projection(ms_features)
142
  mol_embeddings = self.mol_projection(mol_features)
143
 
144
+ # Normalize the projected embeddings
145
+ mol_embeddings = F.normalize(mol_embeddings, p=2, dim=1)
146
+ ms_embeddings = F.normalize(ms_embeddings, p=2, dim=1)
147
+
148
+ return mol_embeddings, ms_embeddings
149
+
150
  # Calculating the Loss
151
  #logits = (mol_embeddings @ ms_embeddings.t())
152
  #logit_scale = self.logit_scale.exp()
153
+ '''logits = mol_embeddings @ ms_embeddings.t()
154
 
155
  ground_truth = torch.arange(ms_features.shape[0], dtype=torch.long, device=self.cfg.device)
156
 
 
158
  mol_loss = loss_func(logits.t(), ground_truth)
159
  loss = (ms_loss + mol_loss) / 2.0 # shape: (batch_size)
160
 
161
+ return loss.mean()'''
code/predict.py CHANGED
@@ -9,105 +9,55 @@ import utils
9
  import json
10
  import pandas as pd
11
  import pickle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- MolFeatsCached = {}
14
-
15
- def calc_mol_embeddings0(model, smis, cfg):
16
- model.eval()
17
-
18
- valid_mol_embeddings = []
19
- with torch.no_grad():
20
- for smi in smis:
21
- try:
22
- mol_features = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
23
- mol_embeddings = model.mol_projection(mol_features.unsqueeze(0))
24
- valid_mol_embeddings.append(mol_embeddings.squeeze(0))
25
- except Exception as e:
26
- print(smi, e)
27
- continue
28
-
29
- return torch.stack(valid_mol_embeddings)
30
-
31
- def calc_mol_embeddings1(model, smis, cfg):
32
- model.eval()
33
- mol_embeddings = []
34
-
35
- with torch.no_grad():
36
- for smi in smis:
37
- try:
38
- if cfg.mol_encoder == 'fp':
39
- k = hash(smi + f'fp-{cfg.fptype}-{cfg.mol_embedding_dim}')
40
- if k in MolFeatsCached:
41
- feats = MolFeatsCached[k]
42
- else:
43
- feats = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
44
- MolFeatsCached[k] = feats
45
- me = model.mol_projection(feats.unsqueeze(0))
46
- mol_embeddings.append(me.squeeze(0))
47
- elif cfg.mol_encoder == 'gnn':
48
- k = hash(smi + 'gnn')
49
- if k in MolFeatsCached:
50
- gfeats = MolFeatsCached[k]
51
- else:
52
- gfeats = utils.mol_graph_featurizer(smi)
53
- MolFeatsCached[k] = gfeats
54
-
55
- bat = {'A': gfeats['A'].unsqueeze(0).to(cfg.device),
56
- 'V': gfeats['V'].unsqueeze(0).to(cfg.device),
57
- 'mol_size': gfeats['mol_size'].unsqueeze(0).to(cfg.device)}
58
-
59
- feats = model.mol_gnn_encoder(bat)
60
- me = model.mol_projection(feats)
61
- mol_embeddings.append(me.squeeze(0))
62
- except Exception as e:
63
- print(smi, e)
64
- continue
65
-
66
- return torch.stack(mol_embeddings)
67
 
68
  def calc_mol_embeddings(model, smis, cfg):
69
  model.eval()
70
  fp_featsl = []
71
  gnn_featsl = []
72
  fm_featsl = []
 
73
 
74
- for smi in smis:
 
75
  try:
76
  if 'gnn' in cfg.mol_encoder:
77
- k = hash(smi + 'gnn')
78
- if k in MolFeatsCached:
79
- gnn_feats = MolFeatsCached[k]
80
- if gnn_feats is None:
81
- continue
82
- else:
83
- gnn_feats = utils.mol_graph_featurizer(smi)
84
- MolFeatsCached[k] = gnn_feats
85
- if gnn_feats is None:
86
- continue
87
  gnn_featsl.append(gnn_feats)
88
  if 'fp' in cfg.mol_encoder:
89
- k = hash(smi + f'fp-{cfg.fptype}-{cfg.mol_embedding_dim}')
90
- if k in MolFeatsCached:
91
- fp_feats = MolFeatsCached[k]
92
- if fp_feats is None:
93
- continue
94
- else:
95
- fp_feats = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
96
- MolFeatsCached[k] = fp_feats
97
  fp_featsl.append(fp_feats)
98
  if 'fm' in cfg.mol_encoder:
99
- k = hash(smi + f'fm-{cfg.fptype}-{cfg.mol_embedding_dim}')
100
- if k in MolFeatsCached:
101
- fm_feats = MolFeatsCached[k]
102
- if fm_feats is None:
103
- continue
104
- else:
105
- fm_feats = utils.smi2fmvec(smi).to(cfg.device)
106
- MolFeatsCached[k] = fm_feats
107
  fm_featsl.append(fm_feats)
 
108
  except Exception as e:
109
  print(smi, e)
110
- MolFeatsCached[k] = None
111
  continue
112
 
113
  mol_feat_list = []
@@ -136,6 +86,8 @@ def calc_mol_embeddings(model, smis, cfg):
136
 
137
  mol_feat_list.append(model.mol_gnn_encoder(bat))
138
 
 
 
139
  if 'fp' in cfg.mol_encoder:
140
  mol_feat_list.append(torch.stack(fp_featsl).to(cfg.device))
141
 
@@ -150,62 +102,85 @@ def calc_mol_embeddings(model, smis, cfg):
150
  with torch.no_grad():
151
  mol_embeddings = model.mol_projection(mol_features)
152
 
153
- return mol_embeddings
154
 
155
- def find_matches(model, ms, smis, cfg, n=10):
 
 
156
  model.eval()
157
  with torch.no_grad():
158
  ms_features = utils.ms_binner(ms, min_mz=cfg.min_mz, max_mz=cfg.max_mz, bin_size=cfg.bin_size, add_nl=cfg.add_nl, binary_intn=cfg.binary_intn).to(cfg.device)
159
  ms_features = ms_features.unsqueeze(0)
160
- ms_embeddings = model.ms_projection(ms_features).squeeze(0)
161
-
162
- #print(43, ms_features.shape, ms_embeddings.shape)
163
 
164
- mol_embeddings = calc_mol_embeddings(model, smis, cfg)
165
-
166
- mol_embeddings_n = F.normalize(mol_embeddings, p=2, dim=-1)
167
- ms_embeddings_n = F.normalize(ms_embeddings, p=2, dim=-1)
168
- dot_similarity = mol_embeddings_n @ ms_embeddings_n.t()
169
 
170
- if n == -1 or n > len(mol_embeddings):
171
- n = len(mol_embeddings)
172
-
173
- values, indices = torch.topk(dot_similarity.squeeze(0), n)
174
-
175
- matchsmis = [smis[idx] for idx in indices]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- return matchsmis, values.to('cpu').data.numpy()*100, indices.to('cpu').data.numpy()
178
 
179
- def calc(models, datal, cfg, saveout=True):
180
  dicall = {}
181
  coridxd = {}
182
 
183
  for idx, model in enumerate(models):
184
  for nn, data in enumerate(datal):
185
  print(f'Calculating {nn}-th MS...')
186
- #smipool = [d[1] for d in data['candidates'][:50]]
187
- smipool = [d[1] for d in data['candidates']]
188
 
189
  try:
190
- smis, scores, indices = find_matches(model, data['ms'], smipool, cfg, 50)
191
  except Exception as e:
192
  print(131, e)
193
  continue
194
 
195
  dic = {}
196
- for n, smi in enumerate(smis):
 
197
  if smi in dic:
198
- dic[smi]['score'] += scores[n]
199
- dic[smi]['iscor'] = data['candidates'][indices[n]][-1]
200
- dic[smi]['idx'] = data['candidates'][indices[n]][0]
201
  else:
202
- dic[smi] = {'score': scores[n], 'iscor': data['candidates'][indices[n]][-1], 'idx': data['candidates'][indices[n]][0]}
 
 
 
 
 
203
 
204
- ikey = data['ikey']
205
  if ikey in dicall:
206
  for k, v in dic.items():
207
  if k in dicall[ikey]:
208
  dicall[ikey][k]['score'] += v['score']
 
209
  else:
210
  dicall[ikey][k] = v
211
  else:
@@ -223,11 +198,14 @@ def calc(models, datal, cfg, saveout=True):
223
  n = len(scorel)
224
 
225
  values, indices = torch.topk(scoretsor, n)
226
-
227
- scorel = values
228
- smis = [smis[i] for i in indices]
229
- iscorl = [iscorl[i] for i in indices]
230
- indexl = [indexl[i] for i in indices]
 
 
 
231
 
232
  try:
233
  i = iscorl.index(True)
@@ -253,23 +231,42 @@ def calc(models, datal, cfg, saveout=True):
253
  if not k in dc:
254
  dc[k] = [0]
255
 
256
- '''if saveout:
257
- df0 = pd.DataFrame(dc)
258
- df0.to_csv('summary.csv', index=False)
259
 
260
- df = pd.DataFrame({
261
- 'MSFn': ikeysl,
262
- 'Item': iteml,
263
- 'Index': smisidl,
264
- 'Smiles': smis,
265
- 'Score': scoresl,
266
- 'IsCorrect': iscorl})
267
 
268
- df.to_csv('predicted.csv', index=False)'''
 
 
 
 
269
 
270
- return sumtop3, dc, dicall
 
 
 
 
 
 
 
 
 
 
271
 
272
- def test(modelfnl, datal, datafn=''):
 
 
 
 
 
 
 
 
 
 
 
 
273
  maxtop3 = 0
274
  maxoutt = ''
275
 
@@ -281,9 +278,8 @@ def test(modelfnl, datal, datafn=''):
281
 
282
  model = FragSimiModel(CFG).to(CFG.device)
283
  model.load_state_dict(d['state_dict'])
284
- model.to(CFG.device)
285
 
286
- sumtop3, dc, dicall = calc([model], datal, CFG, saveout=False)
287
 
288
  sumtop10 = 0
289
  for k in ['Hit %.3d' %(i+1) for i in range(10)]:
@@ -313,12 +309,12 @@ def test(modelfnl, datal, datafn=''):
313
  maxtop3 = sumtop3
314
  maxoutt = outt
315
 
316
- dicall['testdata'] = datafn
317
- dicall['testrlt'] = outt
318
- pickle.dump(dicall, open(fn.replace('.pth', f'-{os.path.basename(datafn).split(".")[0]}-tstrlt.pkl'), 'wb'))
319
 
320
  df = pd.DataFrame(tops)
321
- df.to_csv(fn.replace('.pth', f'-{os.path.basename(datafn).split(".")[0]}-tstrlt.csv'), index=False)
322
 
323
  return maxoutt, maxtop3
324
 
@@ -326,17 +322,12 @@ def main(datafn, fnl):
326
  outl = []
327
 
328
  datal = json.load(open(datafn))
329
- logfn = f'predict_results.csv'
330
-
331
- if not os.path.exists(logfn):
332
- open(logfn, 'w').write('Index,Results,Model,Data\n')
333
 
334
  n = 0
335
  for n, fn in enumerate(fnl):
336
- out, _ = test([fn], datal, datafn)
337
  print(out, os.path.basename(fn))
338
  outl.append(out)
339
- open(logfn, 'a').write(f'{n},"{out}",{fn},{datafn}\n')
340
 
341
  print(outl)
342
 
 
9
  import json
10
  import pandas as pd
11
  import pickle
12
+ from rdkit import Chem
13
+ from rdkit.Chem import inchi
14
+
15
def smiles_to_inchikey(smiles, nostereo=True):
    """Convert a SMILES string to an InChIKey.

    When ``nostereo`` is true the InChI is generated with the "-SNon"
    option so stereochemistry is ignored (MS generally cannot distinguish
    stereoisomers).  Returns None if parsing or conversion fails.
    """
    try:
        # Parse the SMILES into an RDKit molecule object.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        # Drop stereo layers when requested, then derive the InChI string.
        if nostereo:
            inchi_string = inchi.MolToInchi(mol, options="-SNon")
        else:
            inchi_string = inchi.MolToInchi(mol)

        if not inchi_string:
            return None

        return inchi.InchiToInchiKey(inchi_string)

    except Exception as e:
        print(f"转换失败: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def calc_mol_embeddings(model, smis, cfg):
40
  model.eval()
41
  fp_featsl = []
42
  gnn_featsl = []
43
  fm_featsl = []
44
+ valid_smis = []
45
 
46
+ for smil in smis:
47
+ smi = smil[1]
48
  try:
49
  if 'gnn' in cfg.mol_encoder:
50
+ gnn_feats = utils.mol_graph_featurizer(smi)
 
 
 
 
 
 
 
 
 
51
  gnn_featsl.append(gnn_feats)
52
  if 'fp' in cfg.mol_encoder:
53
+ fp_feats = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
 
 
 
 
 
 
 
54
  fp_featsl.append(fp_feats)
55
  if 'fm' in cfg.mol_encoder:
56
+ fm_feats = utils.smi2fmvec(smi).to(cfg.device)
 
 
 
 
 
 
 
57
  fm_featsl.append(fm_feats)
58
+ valid_smis.append(smil)
59
  except Exception as e:
60
  print(smi, e)
 
61
  continue
62
 
63
  mol_feat_list = []
 
86
 
87
  mol_feat_list.append(model.mol_gnn_encoder(bat))
88
 
89
+ del bat
90
+
91
  if 'fp' in cfg.mol_encoder:
92
  mol_feat_list.append(torch.stack(fp_featsl).to(cfg.device))
93
 
 
102
  with torch.no_grad():
103
  mol_embeddings = model.mol_projection(mol_features)
104
 
105
+ del mol_features, mol_feat_list
106
 
107
+ return mol_embeddings, valid_smis
108
+
109
def find_matches(model, ms, smis, cfg, n=10, batch_size=64):
    """Rank candidate molecules against one mass spectrum.

    Embeds the spectrum once, embeds candidates in mini-batches (skipping
    ones that fail featurization), and returns the top-n by cosine
    similarity.

    Args:
        model: trained FragSimiModel (uses ms_projection + mol encoders).
        ms: raw spectrum, passed to utils.ms_binner.
        smis: candidate records; each is a sequence whose [1] element is
            the SMILES string (see calc_mol_embeddings).
        cfg: config with binning / device settings.
        n: number of matches to return; -1 (or n > #valid) means all.
        batch_size: candidates embedded per forward pass.

    Returns:
        (matchsmis, scores, indices): top-n candidate records, their
        similarities scaled by 100 (numpy array), and their indices into
        the valid-candidate list.
    """
    model.eval()
    with torch.no_grad():
        ms_features = utils.ms_binner(ms, min_mz=cfg.min_mz, max_mz=cfg.max_mz, bin_size=cfg.bin_size, add_nl=cfg.add_nl, binary_intn=cfg.binary_intn).to(cfg.device)
        ms_features = ms_features.unsqueeze(0)
        ms_embeddings = model.ms_projection(ms_features)
        ms_embeddings_n = F.normalize(ms_embeddings, p=2, dim=1)

        # Embed candidates batch-by-batch; featurization failures are
        # dropped inside calc_mol_embeddings, so track the survivors.
        all_valid_smis = []
        all_embeddings = []
        for i in tqdm(range(0, len(smis), batch_size)):
            batch_smis = smis[i:i + batch_size]
            batch_embeddings, valid_smis = calc_mol_embeddings(model, batch_smis, cfg)
            all_embeddings.append(batch_embeddings)
            all_valid_smis.extend(valid_smis)

            del batch_embeddings

        # Normalize globally.  (cosine_similarity re-normalizes, so this is
        # numerically redundant but harmless; kept for clarity.)
        all_embeddings = torch.cat(all_embeddings, dim=0)
        all_embeddings_n = F.normalize(all_embeddings, p=2, dim=1)

        # Similarity of every candidate to the (broadcast) spectrum.
        similarities = F.cosine_similarity(all_embeddings_n, ms_embeddings_n, dim=1)

        if n == -1 or n > len(all_valid_smis):
            n = len(all_valid_smis)

        values, topk_indices = torch.topk(similarities, n)

        topk_indices_list = topk_indices.cpu().tolist()
        matchsmis = [all_valid_smis[idx] for idx in topk_indices_list]

    return matchsmis, values.cpu().numpy() * 100, topk_indices_list
 
150
+ def calc(models, datal, cfg):
151
  dicall = {}
152
  coridxd = {}
153
 
154
  for idx, model in enumerate(models):
155
  for nn, data in enumerate(datal):
156
  print(f'Calculating {nn}-th MS...')
 
 
157
 
158
  try:
159
+ smis, scores, indices = find_matches(model, data['ms'], data['candidates'], cfg, 50)
160
  except Exception as e:
161
  print(131, e)
162
  continue
163
 
164
  dic = {}
165
+ for n, smil in enumerate(smis):
166
+ smi = smil[1]
167
  if smi in dic:
168
+ dic[smi]['score'] = scores[n]
169
+ dic[smi]['iscor'] = smis[n][-1]
170
+ dic[smi]['idx'] = smis[n][0]
171
  else:
172
+ dic[smi] = {'score': scores[n], 'iscor': smis[n][-1], 'idx': smis[n][0]}
173
+
174
+ # 计算去除立体构型分子的inchikey,由于质谱很难区分立体构型,我们认为分子的不同立体构型都算正确匹配
175
+ ikey = smiles_to_inchikey(data['smiles'], True)
176
+ if ikey is None:
177
+ ikey = data['ikey']
178
 
 
179
  if ikey in dicall:
180
  for k, v in dic.items():
181
  if k in dicall[ikey]:
182
  dicall[ikey][k]['score'] += v['score']
183
+ dicall[ikey][k]['score'] /= 2
184
  else:
185
  dicall[ikey][k] = v
186
  else:
 
198
  n = len(scorel)
199
 
200
  values, indices = torch.topk(scoretsor, n)
201
+
202
+ # 修复:将张量转换为Python列表
203
+ indices_list = indices.cpu().tolist()
204
+
205
+ scorel = values.cpu().numpy()
206
+ smis = [smis[i] for i in indices_list]
207
+ iscorl = [iscorl[i] for i in indices_list]
208
+ indexl = [indexl[i] for i in indices_list]
209
 
210
  try:
211
  i = iscorl.index(True)
 
231
  if not k in dc:
232
  dc[k] = [0]
233
 
234
+ return sumtop3, dc, dicall
 
 
235
 
236
def calc_rank(dicall):
    """Rank candidates per query and report the hit position of the truth.

    For every query (keyed by InChIKey) the candidates are sorted by score
    and at most the top 100 are kept.  Queries whose correct candidate is
    present get an entry {'Hit': 1-based rank, 'Rank': list of
    "score:smiles:inchikey" strings}; queries whose correct candidate is
    absent are omitted from the result.

    Args:
        dicall: {ikey: {smiles: {'score': float, 'iscor': bool, 'idx': ...}}}

    Returns:
        {ikey: {'Hit': int, 'Rank': [str, ...]}}
    """
    rankd = {}

    for ikey, dic in dicall.items():
        smis = list(dic.keys())
        scorel = [d['score'] for d in dic.values()]
        iscorl = [d['iscor'] for d in dic.values()]

        # Keep at most the 100 best-scoring candidates.
        k = min(100, len(scorel))
        values, indices = torch.topk(torch.tensor(scorel), k)

        # Convert tensors to plain Python values: the original interpolated
        # 0-dim tensors into the f-string below, rendering "tensor(…)".
        scorel = values.tolist()
        order = indices.tolist()
        smis = [smis[i] for i in order]
        iscorl = [iscorl[i] for i in order]

        sl = []
        for rank, smi in enumerate(smis):
            sl.append(f'{scorel[rank]}:{smi}:{smiles_to_inchikey(smi)}')

        try:
            i = iscorl.index(True)  # first (best-ranked) correct candidate
            rankd[ikey] = {'Hit': i + 1, 'Rank': sl}
        except ValueError:
            # Correct molecule not among the kept candidates: skip query.
            pass

    return rankd
268
+
269
+ def predict(modelfnl, datal, datafn=''):
270
  maxtop3 = 0
271
  maxoutt = ''
272
 
 
278
 
279
  model = FragSimiModel(CFG).to(CFG.device)
280
  model.load_state_dict(d['state_dict'])
 
281
 
282
+ sumtop3, dc, dicall = calc([model], datal, CFG)
283
 
284
  sumtop10 = 0
285
  for k in ['Hit %.3d' %(i+1) for i in range(10)]:
 
309
  maxtop3 = sumtop3
310
  maxoutt = outt
311
 
312
+ basefn = fn.replace('.pth', f'-{os.path.basename(datafn).split(".")[0]}')
313
+ rank = calc_rank(dicall)
314
+ json.dump(rank, open(basefn + '-predict-rank.json', 'w'), indent=2)
315
 
316
  df = pd.DataFrame(tops)
317
+ df.to_csv(basefn + '-predict-summary.csv', index=False)
318
 
319
  return maxoutt, maxtop3
320
 
 
322
  outl = []
323
 
324
  datal = json.load(open(datafn))
 
 
 
 
325
 
326
  n = 0
327
  for n, fn in enumerate(fnl):
328
+ out, _ = predict([fn], datal, datafn)
329
  print(out, os.path.basename(fn))
330
  outl.append(out)
 
331
 
332
  print(outl)
333
 
code/train.py CHANGED
@@ -11,6 +11,9 @@ from dataset import *
11
  import torch.utils.data
12
  import copy, json, pickle
13
  import itertools as it
 
 
 
14
 
15
  def make_next_record_dir(basedir, prefix=''):
16
  path = '%s/%%s001/' %basedir
@@ -97,55 +100,91 @@ def build_loaders(inp, mode, cfg, num_workers):
97
  return dataloader
98
 
99
  def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
 
100
  loss_meter = AvgMeter()
101
  tqdm_object = tqdm(train_loader, total=len(train_loader))
 
102
 
103
  for batch in tqdm_object:
104
  for k, v in batch.items():
105
  batch[k] = v.to(CFG.device)
106
 
107
- loss = model(batch)
108
  optimizer.zero_grad()
 
 
 
 
 
109
  loss.backward()
110
  optimizer.step()
 
 
 
 
 
 
 
 
111
  if step == "batch":
112
  lr_scheduler.step()
113
 
114
  count = batch["ms_bins"].size(0)
115
  loss_meter.update(loss.item(), count)
116
 
117
- tqdm_object.set_postfix(train_loss=loss_meter.avg, lr=get_lr(optimizer))
118
- return loss_meter
 
 
 
 
 
 
 
119
 
120
  def valid_epoch(model, valid_loader):
 
121
  loss_meter = AvgMeter()
 
122
 
123
- tqdm_object = tqdm(valid_loader, total=len(valid_loader))
124
- for batch in tqdm_object:
125
- for k, v in batch.items():
126
- batch[k] = v.to(CFG.device)
 
127
 
128
- loss = model(batch)
129
 
130
- count = batch["ms_bins"].size(0)
131
- loss_meter.update(loss.item(), count)
132
 
133
- tqdm_object.set_postfix(valid_loss=loss_meter.avg)
 
 
 
134
 
135
- return loss_meter
136
 
137
- def main(data, cfg=CFG, savedir='data/train', encmodel=None, ratio=1):
 
 
 
 
 
 
 
 
138
  setup_seed(cfg.seed)
139
 
140
  train_set, valid_set = make_train_valid(data, valid_ratio=cfg.valid_ratio, seed=cfg.seed)
141
 
 
 
142
  n = len(train_set)
143
  if ratio < 1:
144
  train_set = random.sample(train_set, int(n*ratio))
145
  print(f'Ratio {ratio}, lenall {n}, newtrainset {len(train_set)}')
146
 
147
- train_loader = build_loaders(train_set, "train", cfg, 10)
148
- valid_loader = build_loaders(valid_set, "valid", cfg, 10)
149
 
150
  step = "epoch"
151
 
@@ -155,14 +194,6 @@ def main(data, cfg=CFG, savedir='data/train', encmodel=None, ratio=1):
155
 
156
  model = FragSimiModel(cfg).to(cfg.device)
157
 
158
- if not encmodel is None:
159
- model.mol_gnn_encoder.load_state_dict(encmodel.mol_gnn_encoder.state_dict())
160
- # fraze mol_gnn_encoder weights
161
- '''for name, param in model.named_parameters():
162
- if 'mol_gnn_encoder' in name:
163
- print(152, 'fraze mol_gnn_encoder weights')
164
- param.requires_grad = False'''
165
-
166
  print(model)
167
 
168
  optimizer = torch.optim.AdamW(
@@ -173,27 +204,59 @@ def main(data, cfg=CFG, savedir='data/train', encmodel=None, ratio=1):
173
  optimizer, mode="min", patience=cfg.patience, factor=cfg.factor
174
  )
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  for epoch in range(cfg.epochs):
177
  print(f"Epoch: {epoch + 1}/{cfg.epochs}")
178
- model.train()
179
- train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, step)
180
- model.eval()
181
- with torch.no_grad():
182
- valid_loss = valid_epoch(model, valid_loader)
 
183
 
184
- if valid_loss.avg < best_loss:
185
  best_loss = valid_loss.avg
186
- best_model_fn = f"{savedir}/model-tloss{round(train_loss.avg, 3)}-vloss{round(valid_loss.avg, 3)}-epoch{epoch}.pth"
187
  best_model_fn_base = best_model_fn.replace('.pth', '')
188
  n = 1
189
  while os.path.exists(best_model_fn):
190
  best_model_fn = best_model_fn_base + f'-{n}.pth'
191
  n += 1
192
 
193
- checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'config': dict(CFG)}
194
  best_model_fns.append(best_model_fn)
195
- torch.save(checkpoint, best_model_fn)
196
- print("Saved Best Model!")
 
 
 
 
 
 
 
 
 
 
197
 
198
  best_model_fnl = []
199
  for fn in best_model_fns:
@@ -209,6 +272,8 @@ def main(data, cfg=CFG, savedir='data/train', encmodel=None, ratio=1):
209
  return best_model_fnl, best_loss
210
 
211
  if __name__ == "__main__":
 
 
212
  try:
213
  conffn = sys.argv[1]
214
  if conffn.endswith('.json'):
@@ -229,7 +294,10 @@ if __name__ == "__main__":
229
 
230
  os.system('mkdir -p %s' %savedir)
231
 
232
- mg = None
 
 
 
233
 
234
  print(CFG)
235
 
@@ -237,6 +305,7 @@ if __name__ == "__main__":
237
  data = [os.path.join(CFG.dataset_path, i) for i in os.listdir(CFG.dataset_path) if i.endswith('mgf')]
238
  elif os.path.isfile(CFG.dataset_path):
239
  if CFG.dataset_path.endswith('.pkl'):
 
240
  data = pickle.load(open(CFG.dataset_path, 'rb'))
241
  else:
242
  data = json.load(open(CFG.dataset_path))
@@ -244,8 +313,8 @@ if __name__ == "__main__":
244
  if not os.path.exists(pklfn):
245
  pickle.dump(data, open(pklfn, 'wb'))
246
 
247
- subdir = make_next_record_dir(savedir, f'train-')
248
  os.system(f'cp -a *py {subdir}; cp -a GNN {subdir}')
249
  CFG.save(f'{subdir}/config.json')
250
 
251
- modelfnl, _ = main(data, CFG, subdir, mg)
 
11
  import torch.utils.data
12
  import copy, json, pickle
13
  import itertools as it
14
+ import loss
15
+
16
+ loss_func = loss.infoNCE_loss2
17
 
18
  def make_next_record_dir(basedir, prefix=''):
19
  path = '%s/%%s001/' %basedir
 
100
  return dataloader
101
 
102
def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
    """Run one training epoch.

    Returns:
        (loss_meter, mean_cos_sim): the running AvgMeter of the contrastive
        loss and the batch-averaged cosine similarity of matched mol/ms
        embedding pairs (a training-progress diagnostic).
    """
    model.train()
    loss_meter = AvgMeter()
    tqdm_object = tqdm(train_loader, total=len(train_loader))
    total_cos_sim = 0.0

    for batch in tqdm_object:
        # Move every tensor in the batch to the configured device.
        for k, v in batch.items():
            batch[k] = v.to(CFG.device)

        optimizer.zero_grad()

        # The model returns the two projected embeddings; the contrastive
        # loss is computed externally via the module-level loss_func.
        mol_features, ms_features = model(batch)

        loss = loss_func(mol_features, ms_features)

        loss.backward()
        optimizer.step()

        # Diagnostic only: cosine similarity of matched pairs.
        with torch.no_grad():
            cos_sim = F.cosine_similarity(
                mol_features.detach(),
                ms_features.detach()
            ).mean().item()
            total_cos_sim += cos_sim

        if step == "batch":
            lr_scheduler.step()

        count = batch["ms_bins"].size(0)
        loss_meter.update(loss.item(), count)

        tqdm_object.set_postfix(train_loss=loss_meter.avg, train_cos_sim=round(cos_sim, 4), lr=get_lr(optimizer))

        # Release tensors eagerly to reduce peak GPU memory between batches.
        del mol_features, ms_features, loss, cos_sim

        for k in list(batch.keys()):
            del batch[k]
        del batch

    # Guard against an empty loader so we never divide by zero.
    return loss_meter, total_cos_sim / max(len(train_loader), 1)
143
 
144
def valid_epoch(model, valid_loader):
    """Run one validation epoch (no gradients).

    Returns:
        (loss_meter, mean_cos_sim): running AvgMeter of the contrastive
        loss and the batch-averaged cosine similarity of matched pairs.
    """
    model.eval()
    loss_meter = AvgMeter()
    total_cos_sim = 0.0

    with torch.no_grad():
        tqdm_object = tqdm(valid_loader, total=len(valid_loader))
        for batch in tqdm_object:
            # Move every tensor in the batch to the configured device.
            for k, v in batch.items():
                batch[k] = v.to(CFG.device)

            mol_features, ms_features = model(batch)

            loss = loss_func(mol_features, ms_features)

            count = batch["ms_bins"].size(0)
            loss_meter.update(loss.item(), count)
            cos_sim = F.cosine_similarity(mol_features.detach(), ms_features.detach()).mean().item()
            total_cos_sim += cos_sim

            tqdm_object.set_postfix(valid_loss=loss_meter.avg, valid_cos_sim=round(cos_sim, 4))

            # Release tensors eagerly to reduce peak GPU memory.
            del mol_features, ms_features, loss, cos_sim

            for k in list(batch.keys()):
                del batch[k]
            del batch

    # Guard against an empty loader so we never divide by zero.
    return loss_meter, total_cos_sim / max(len(valid_loader), 1)
173
+
174
+ def main(data, cfg=CFG, savedir='data/train', model_path=None, ratio=1):
175
  setup_seed(cfg.seed)
176
 
177
  train_set, valid_set = make_train_valid(data, valid_ratio=cfg.valid_ratio, seed=cfg.seed)
178
 
179
+ log_file = f'{savedir}/trainlog.txt'
180
+
181
  n = len(train_set)
182
  if ratio < 1:
183
  train_set = random.sample(train_set, int(n*ratio))
184
  print(f'Ratio {ratio}, lenall {n}, newtrainset {len(train_set)}')
185
 
186
+ train_loader = build_loaders(train_set, "train", cfg, 1)
187
+ valid_loader = build_loaders(valid_set, "valid", cfg, 1)
188
 
189
  step = "epoch"
190
 
 
194
 
195
  model = FragSimiModel(cfg).to(cfg.device)
196
 
 
 
 
 
 
 
 
 
197
  print(model)
198
 
199
  optimizer = torch.optim.AdamW(
 
204
  optimizer, mode="min", patience=cfg.patience, factor=cfg.factor
205
  )
206
 
207
+ # Load pre-trained model if path is provided
208
+ if model_path and os.path.exists(model_path):
209
+ print(f"Loading model from {model_path}")
210
+ checkpoint = torch.load(model_path, map_location=cfg.device)
211
+ model.load_state_dict(checkpoint['state_dict'])
212
+
213
+ '''if 'optimizer' in checkpoint:
214
+ optimizer.load_state_dict(checkpoint['optimizer'])
215
+ print("Loaded optimizer state")'''
216
+
217
+ print(f"Resuming training")
218
+ del checkpoint
219
+
220
+ # write training log
221
+ with open(log_file, 'a', encoding='utf8') as f:
222
+ f.write(f'Start training:\n')
223
+ f.write(f'Data path: {cfg.dataset_path}, valid ratio: {cfg.valid_ratio}\n')
224
+ if model_path:
225
+ f.write(f'Resuming from: {model_path}\n')
226
+ print(model, file=f)
227
+ f.write(f'\n')
228
+
229
  for epoch in range(cfg.epochs):
230
  print(f"Epoch: {epoch + 1}/{cfg.epochs}")
231
+ train_loss, t_cos_sim = train_epoch(model, train_loader, optimizer, lr_scheduler, step)
232
+ valid_loss, v_cos_sim = valid_epoch(model, valid_loader)
233
+
234
+ txt = f"Train Loss: {train_loss.avg:.4f} | Val Loss: {valid_loss.avg:.4f} | Train cos sim: {t_cos_sim:.4f} | Val cos sim: {v_cos_sim:.4f}"
235
+ print(txt)
236
+ open(log_file, 'a').write(f"Epoch {epoch + 1}/{cfg.epochs}: {txt}\n")
237
 
238
+ if True: #valid_loss.avg < best_loss:
239
  best_loss = valid_loss.avg
240
+ best_model_fn = f"{savedir}/model-tloss{round(train_loss.avg, 3)}-vloss{round(valid_loss.avg, 3)}-tcos{round(t_cos_sim, 3)}-vcos{round(v_cos_sim, 3)}-epoch{epoch}.pth"
241
  best_model_fn_base = best_model_fn.replace('.pth', '')
242
  n = 1
243
  while os.path.exists(best_model_fn):
244
  best_model_fn = best_model_fn_base + f'-{n}.pth'
245
  n += 1
246
 
 
247
  best_model_fns.append(best_model_fn)
248
+
249
+ torch.save({'epoch': epoch,
250
+ 'state_dict': model.state_dict(),
251
+ 'optimizer': optimizer.state_dict(),
252
+ 'config': dict(CFG),
253
+ 'train_loss': train_loss.avg,
254
+ 'valid_loss': valid_loss.avg,
255
+ 'train_cos_sim': t_cos_sim,
256
+ 'val_cos_sim': v_cos_sim
257
+ }, best_model_fn)
258
+
259
+ print("Saved new best model!")
260
 
261
  best_model_fnl = []
262
  for fn in best_model_fns:
 
272
  return best_model_fnl, best_loss
273
 
274
  if __name__ == "__main__":
275
+ import pickle
276
+ from tqdm import tqdm
277
  try:
278
  conffn = sys.argv[1]
279
  if conffn.endswith('.json'):
 
294
 
295
  os.system('mkdir -p %s' %savedir)
296
 
297
+ try:
298
+ prev_model_pth = sys.argv[3]
299
+ except:
300
+ prev_model_pth = None
301
 
302
  print(CFG)
303
 
 
305
  data = [os.path.join(CFG.dataset_path, i) for i in os.listdir(CFG.dataset_path) if i.endswith('mgf')]
306
  elif os.path.isfile(CFG.dataset_path):
307
  if CFG.dataset_path.endswith('.pkl'):
308
+ print(f'loading data from {CFG.dataset_path} ...')
309
  data = pickle.load(open(CFG.dataset_path, 'rb'))
310
  else:
311
  data = json.load(open(CFG.dataset_path))
 
313
  if not os.path.exists(pklfn):
314
  pickle.dump(data, open(pklfn, 'wb'))
315
 
316
+ subdir = make_next_record_dir(savedir, f'train-neg-')
317
  os.system(f'cp -a *py {subdir}; cp -a GNN {subdir}')
318
  CFG.save(f'{subdir}/config.json')
319
 
320
+ modelfnl, _ = main(data, CFG, subdir, prev_model_pth)