jinysun commited on
Commit
64e9ead
·
verified ·
1 Parent(s): dbaa85f

Upload 46 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tool/dap/util/data/bindingdb_kd.tab filter=lfs diff=lfs merge=lfs -text
37
+ tool/dap/util/data/davis.tab filter=lfs diff=lfs merge=lfs -text
38
+ tool/rag/index.faiss filter=lfs diff=lfs merge=lfs -text
tool/ImageAnalysis.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sat Oct 26 15:35:19 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+ from langchain_community.embeddings import OllamaEmbeddings
9
+ from langchain.tools import BaseTool
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.messages import HumanMessage, SystemMessage
12
+ from langchain.base_language import BaseLanguageModel
13
+ import base64
14
+ from io import BytesIO
15
+ from PIL import Image
16
+
17
+
18
+
19
+ def convert_to_base64(pil_image):
20
+ buffered = BytesIO()
21
+ pil_image.save(buffered, format="PNG")
22
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
23
+ return img_str
24
+
25
+
26
+ class Imageanalysis(BaseTool):
27
+ name: str = "Imageanalysis"
28
+ description: str = (
29
+ "Useful to answer questions according to the image, figure, diagram or graph. "
30
+ "Useful to analysis the information in the image, figure, diagram or graph. "
31
+ "Input query about image/figure/graph/diagram, return the response"
32
+ )
33
+ return_direct: bool = True
34
+ llm: BaseLanguageModel = None
35
+ path : str = None
36
+
37
+ def __init__(self, path):
38
+ super().__init__( )
39
+ self.llm = ChatOpenAI(model="gpt-4o-2024-11-20",api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
40
+ base_url="https://www.dmxapi.com/v1")
41
+ self.path = path
42
+ # api keys
43
+
44
+ def _run(self, query ) -> str:
45
+ try:
46
+ pil_image = Image.open(self.path)
47
+ rgb_im = pil_image.convert('RGB')
48
+ image_b64 = convert_to_base64(pil_image)
49
+ message = HumanMessage(
50
+ content=[
51
+ {"type": "text", "text": query},
52
+ {
53
+ "type": "image_url",
54
+ "image_url": {"url":f"data:image/jpeg;base64,{image_b64}"},
55
+ },
56
+ ],)
57
+ response = self.llm.invoke([message])
58
+ return response.content
59
+
60
+ except Exception as e:
61
+ return str(e)
62
+
63
+
64
+ async def _arun(self, query) -> str:
65
+ """Use the tool asynchronously."""
66
+ raise NotImplementedError("this tool does not support async")
67
+
68
+
tool/PCE.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Created on Wed Sep 11 10:27:20 2024
5
+
6
+ @author: BM109X32G-10GPU-02
7
+ """
8
+
9
+ from langchain.tools import BaseTool
10
+ from rdkit import Chem
11
+ from rdkit.Chem import rdMolDescriptors
12
+ from rdkit.Chem import Descriptors
13
+ from .deepacceptor import RF
14
+ from .deepdonor import sm, pm
15
+ from .dap import run, screen
16
+ import pandas as pd
17
+
18
+ class acceptor_predictor(BaseTool):
19
+ name:str = "acceptor_predictor"
20
+ description:str = (
21
+ "Input acceptor SMILES , returns the score of the acceptor."
22
+ )
23
+
24
+ def __init__(self):
25
+ super().__init__()
26
+ def _run(self, smiles: str) -> str:
27
+ mol = Chem.MolFromSmiles(smiles)
28
+ if mol is None:
29
+ return "Invalid SMILES string"
30
+ smiles = Chem.MolToSmiles(mol)
31
+ pce = RF.main( str(smiles) )
32
+ return f'The power conversion efficiency (PCE) is predicted to be {pce} (predicted by DeepAcceptor) '
33
+
34
+ async def _arun(self, smiles: str) -> str:
35
+ """Use the tool asynchronously."""
36
+ raise NotImplementedError()
37
+
38
+ class donor_predictor(BaseTool):
39
+ name:str = "donor_predictor"
40
+ description:str = (
41
+ "Input donor SMILES , returns the score of the donor."
42
+ )
43
+
44
+ def __init__(self):
45
+ super().__init__()
46
+ def _run(self, smiles: str) -> str:
47
+ mol = Chem.MolFromSmiles(smiles)
48
+ if mol is None:
49
+ return "Invalid SMILES string"
50
+
51
+ mol = Chem.MolFromSmiles(smiles)
52
+ if mol is None:
53
+ return "Invalid SMILES string"
54
+ sdpce = sm.main( str(smiles) )
55
+ pdpce = pm.main( str(smiles) )
56
+ return f'The power conversion efficiency (PCE) of the given molecule is predicted to be {sdpce} as a small molecule donor , and {pdpce} as a polymer donor(predicted by DeepDonor) '
57
+
58
+ async def _arun(self, smiles: str) -> str:
59
+ """Use the tool asynchronously."""
60
+ raise NotImplementedError()
61
+
62
+
63
+
64
+ class dap_predictor(BaseTool):
65
+ name:str = "dap_predictor"
66
+ description :str = (
67
+ "Input SMILES of D/A pairs(separated by '.') , returns the performance of the D/A pairs ."
68
+ )
69
+
70
+
71
+ def __init__(self):
72
+ super().__init__()
73
+
74
+ def _run(self, smiles_pair: str) -> str:
75
+ smi_list = smiles_pair.split(".")
76
+ if len(smi_list) != 2:
77
+
78
+ return "Input error, please input two smiles strings separated by '.'"
79
+
80
+ else:
81
+ smiles1, smiles2 = smi_list
82
+
83
+
84
+ pce = run.smiles_aas_test( str(smiles1 ), str(smiles2) )
85
+
86
+ return pce
87
+
88
+ async def _arun(self, smiles_pair: str) -> str:
89
+ """Use the tool asynchronously."""
90
+ raise NotImplementedError()
91
+
92
+
93
+
94
+ class dap_screen(BaseTool):
95
+ name:str = "dap_screen"
96
+ description :str = (
97
+ "Input dataset path containing D/A pairs, returns the files of prediction results."
98
+ )
99
+ return_direct: bool = True
100
+ def __init__(self):
101
+ super().__init__()
102
+
103
+ def _run(self, file_path: str) -> str:
104
+ smi_list = screen.smiles_aas_test(file_path)
105
+
106
+ return smi_list
107
+
108
+ async def _arun(self, smiles_pair: str) -> str:
109
+ """Use the tool asynchronously."""
110
+ raise NotImplementedError()
111
+
112
+
113
+ from .comget import generator
114
+
115
+ class molgen(BaseTool):
116
+ name: str = "donorgen"
117
+ description: str = (
118
+
119
+ "Useful to generate polymer donor molecules with required PCE. "
120
+ "Input the values of PCE , return the SMILES"
121
+ )
122
+
123
+
124
+ def __init__(self
125
+ ):
126
+ super().__init__( )
127
+
128
+
129
+ def _run(self, value ) -> str:
130
+ try:
131
+ results = generator.generation(value)
132
+ for i in results['smiles']:
133
+ pdpce = pm.main( str(i) )
134
+ if abs(pdpce-float(value))<1.0:
135
+ return f"The SMILES of generated donor is {i}, its predicted PCE is {pdpce}."
136
+ break
137
+
138
+
139
+
140
+ except Exception as e:
141
+ return str(e)
142
+
143
+
144
+ async def _arun(self, query) -> str:
145
+ """Use the tool asynchronously."""
146
+ raise NotImplementedError("this tool does not support async")
tool/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """load all tools."""
2
+
3
+ from .coder import *
4
+ from .property import *
5
+ from .search import *
6
+ from .PCE import *
7
+ from .converters import *
8
+ from .orbital import *
9
+ from .graphconverter import *
10
+ from .ImageAnalysis import *
11
+ from .pdfreader import *
12
+ from .rag import *
13
+ from .browsersearch import *
tool/browsersearch.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from browser_use import Agent
3
+ import asyncio
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+ from langchain.tools import BaseTool
7
+
8
+ async def main(task):
9
+ agent = Agent(
10
+ task=task,
11
+ llm=ChatOpenAI(model="gpt-4o-2024-11-20",api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
12
+ base_url="https://www.dmxapi.com/v1"),
13
+ )
14
+ result = await agent.run()
15
+ return result
16
+
17
+ class browseruse(BaseTool):
18
+ name: str = "browseruse"
19
+ description: str = ("Calling the browser to search for information in specific website"
20
+ "input query, return the searching results")
21
+
22
+ def __init__(
23
+ self,
24
+ ):
25
+ super().__init__()
26
+
27
+ def _run(self, task: str) -> str:
28
+ result = asyncio.run(main(task))
29
+ return result
30
+
31
+ async def _arun(self, smiles: str) -> str:
32
+ """Use the tool asynchronously."""
33
+ raise NotImplementedError()
34
+
35
+
36
+
tool/chemspace.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import molbloom
4
+ import pandas as pd
5
+ import requests
6
+ from langchain.tools import BaseTool
7
+
8
+ from utils import is_smiles
9
+
10
+
11
+ class ChemSpace:
12
+ def __init__(self, chemspace_api_key=None):
13
+ self.chemspace_api_key = chemspace_api_key
14
+ self._renew_token() # Create token
15
+
16
+ def _renew_token(self):
17
+ self.chemspace_token = requests.get(
18
+ url="https://api.chem-space.com/auth/token",
19
+ headers={
20
+ "Accept": "application/json",
21
+ "Authorization": f"Bearer {self.chemspace_api_key}",
22
+ },
23
+ ).json()["access_token"]
24
+
25
+ def _make_api_request(
26
+ self,
27
+ query,
28
+ request_type,
29
+ count,
30
+ categories,
31
+ ):
32
+ """
33
+ Make a generic request to chem-space API.
34
+
35
+ Categories request.
36
+ CSCS: Custom Request: Could be useful for requesting whole synthesis
37
+ CSMB: Make-On-Demand Building Blocks
38
+ CSSB: In-Stock Building Blocks
39
+ CSSS: In-stock Screening Compounds
40
+ CSMS: Make-On-Demand Screening Compounds
41
+ """
42
+
43
+ def _do_request():
44
+ data = requests.request(
45
+ "POST",
46
+ url=f"https://api.chem-space.com/v3/search/{request_type}?count={count}&page=1&categories={categories}",
47
+ headers={
48
+ "Accept": "application/json; version=3.1",
49
+ "Authorization": f"Bearer {self.chemspace_token}",
50
+ },
51
+ data={"SMILES": f"{query}"},
52
+ ).json()
53
+ return data
54
+
55
+ data = _do_request()
56
+
57
+ # renew token if token is invalid
58
+ if "message" in data.keys():
59
+ if data["message"] == "Your request was made with invalid credentials.":
60
+ self._renew_token()
61
+
62
+ data = _do_request()
63
+ return data
64
+
65
+ def _convert_single(self, query, search_type: str):
66
+ """Do query for a single molecule"""
67
+ data = self._make_api_request(query, "exact", 1, "CSCS,CSMB,CSSB")
68
+ if data["count"] > 0:
69
+ return data["items"][0][search_type]
70
+ else:
71
+ return "No data was found for this compound."
72
+
73
+ def convert_mol_rep(self, query, search_type: str = "smiles"):
74
+ if ", " in query:
75
+ query_list = query.split(", ")
76
+ else:
77
+ query_list = [query]
78
+ smi = ""
79
+ try:
80
+ for q in query_list:
81
+ smi += f"{query}'s {search_type} is: {str(self._convert_single(q, search_type))}"
82
+ return smi
83
+ except Exception:
84
+ return "The input provided is wrong. Input either a single molecule, or multiple molecules separated by a ', '"
85
+
86
+ def buy_mol(
87
+ self,
88
+ smiles,
89
+ request_type="exact",
90
+ count=1,
91
+ ):
92
+ """
93
+ Get data about purchasing compounds.
94
+
95
+ smiles: smiles string of the molecule you want to buy
96
+ request_type: one of "exact", "sim" (search by similarity), "sub" (search by substructure).
97
+ count: retrieve data for this many substances max.
98
+ """
99
+
100
+ def purchasable_check(
101
+ s,
102
+ ):
103
+ if not is_smiles(s):
104
+ try:
105
+ s = self.convert_mol_rep(s, "smiles")
106
+ except:
107
+ return "Invalid SMILES string."
108
+
109
+ """Checks if molecule is available for purchase (ZINC20)"""
110
+ try:
111
+ r = molbloom.buy(s, canonicalize=True)
112
+ except:
113
+ print("invalid smiles")
114
+ return False
115
+ if r:
116
+ return True
117
+ else:
118
+ return False
119
+
120
+ purchasable = purchasable_check(smiles)
121
+
122
+ if request_type == "exact":
123
+ categories = "CSMB,CSSB"
124
+ elif request_type in ["sim", "sub"]:
125
+ categories = "CSSS,CSMS"
126
+
127
+ data = self._make_api_request(smiles, request_type, count, categories)
128
+
129
+ try:
130
+ if data["count"] == 0:
131
+ if purchasable:
132
+ return "Compound is purchasable, but price is unknown."
133
+ else:
134
+ return "Compound is not purchasable."
135
+ except KeyError:
136
+ return "Invalid query, try something else. "
137
+
138
+ print(f"Obtaining data for {data['count']} substances.")
139
+
140
+ dfs = []
141
+ # Convert this data into df
142
+ for item in data["items"]:
143
+ dfs_tmp = []
144
+ smiles = item["smiles"]
145
+ offers = item["offers"]
146
+
147
+ for off in offers:
148
+ df_tmp = pd.DataFrame(off["prices"])
149
+ df_tmp["vendorName"] = off["vendorName"]
150
+ df_tmp["time"] = off["shipsWithin"]
151
+ df_tmp["purity"] = off["purity"]
152
+
153
+ dfs_tmp.append(df_tmp)
154
+
155
+ df_this = pd.concat(dfs_tmp)
156
+ df_this["smiles"] = smiles
157
+ dfs.append(df_this)
158
+
159
+ df = pd.concat(dfs).reset_index(drop=True)
160
+
161
+ df["quantity"] = df["pack"].astype(str) + df["uom"]
162
+ df["time"] = df["time"].astype(str) + " days"
163
+
164
+ df = df.drop(columns=["pack", "uom"])
165
+ # Remove all entries that are not numbers
166
+ df = df[df["priceUsd"].astype(str).str.isnumeric()]
167
+
168
+ cheapest = df.iloc[df["priceUsd"].astype(float).idxmin()]
169
+ return f"{cheapest['quantity']} of this molecule cost {cheapest['priceUsd']} USD and can be purchased at {cheapest['vendorName']}."
170
+
171
+
172
+ class GetMoleculePrice(BaseTool):
173
+ name :str = "GetMoleculePrice"
174
+ description :str = "Get the cheapest available price of a molecule."
175
+ chemspace_api_key: str = None
176
+ url: str = None
177
+
178
+ def __init__(self, chemspace_api_key: str = None):
179
+ super().__init__()
180
+ self.chemspace_api_key = chemspace_api_key
181
+ self.url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
182
+
183
+ def _run(self, query: str) -> str:
184
+ if not self.chemspace_api_key:
185
+ return "No Chemspace API key found. This tool may not be used without a Chemspace API key."
186
+ try:
187
+ chemspace = ChemSpace(self.chemspace_api_key)
188
+ price = chemspace.buy_mol(query)
189
+ return price
190
+ except Exception as e:
191
+ return str(e)
192
+
193
+ async def _arun(self, query: str) -> str:
194
+ """Use the tool asynchronously."""
195
+ raise NotImplementedError()
tool/coder.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sat Oct 26 15:35:19 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+ from langchain_community.embeddings import OllamaEmbeddings
9
+ from langchain.tools import BaseTool
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.messages import HumanMessage, SystemMessage
12
+ from langchain.base_language import BaseLanguageModel
13
+
14
+
15
+ class codewriter(BaseTool):
16
+ name:str = "codewriter"
17
+ description:str = (
18
+ "Useful to answer questions that require writing codes "
19
+ "return the usage and instruction of codes"
20
+ )
21
+
22
+ llm: BaseLanguageModel = None
23
+ def __init__(self):
24
+ super().__init__()
25
+ self.llm = ChatOpenAI(model="gpt-4o-2024-11-20",api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
26
+ base_url="https://www.dmxapi.com/v1")
27
+ # api keys
28
+
29
+ def _run(self, query) -> str:
30
+ messages = [
31
+ SystemMessage(content="You are an expert at writing code, write the corresponding code based on the inputs"),
32
+ HumanMessage(content=query),
33
+ ]
34
+
35
+ response = self.llm.invoke(messages)
36
+ return response
37
+
38
+ async def _arun(self, query) -> str:
39
+ """Use the tool asynchronously."""
40
+ raise NotImplementedError("this tool does not support async")
41
+
42
+
43
+
tool/comget/dataset.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+ from utils import SmilesEnumerator
4
+ import numpy as np
5
+ import re
6
+
7
+ class SmileDataset(Dataset):
8
+
9
+ def __init__(self, args, data, content, block_size, aug_prob = 0.5, prop = None, scaffold = None, scaffold_maxlen = None):
10
+ chars = sorted(list(set(content)))
11
+ data_size, vocab_size = len(data), len(chars)
12
+ print('data has %d smiles, %d unique characters.' % (data_size, vocab_size))
13
+
14
+ self.stoi = { ch:i for i,ch in enumerate(chars) }
15
+ self.itos = { i:ch for i,ch in enumerate(chars) }
16
+ self.max_len = block_size
17
+ self.vocab_size = vocab_size
18
+ self.data = data
19
+ self.prop = prop
20
+ self.sca = scaffold
21
+ self.scaf_max_len = scaffold_maxlen
22
+ self.debug = args.debug
23
+ self.tfm = SmilesEnumerator()
24
+ self.aug_prob = aug_prob
25
+
26
+ def __len__(self):
27
+ if self.debug:
28
+ return math.ceil(len(self.data) / (self.max_len + 1))
29
+ else:
30
+ return len(self.data)
31
+
32
+ def __getitem__(self, idx):
33
+ smiles, prop, scaffold = self.data[idx], self.prop[idx], self.sca[idx] # self.prop.iloc[idx, :].values --> if multiple properties
34
+ smiles = smiles.strip()
35
+ scaffold = scaffold.strip()
36
+
37
+ p = np.random.uniform()
38
+ if p < self.aug_prob:
39
+ smiles = self.tfm.randomize_smiles(smiles)
40
+
41
+ pattern = "(\[[^\]]+]|<|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
42
+ regex = re.compile(pattern)
43
+ smiles += str('<')*(self.max_len - len(regex.findall(smiles)))
44
+
45
+ if len(regex.findall(smiles)) > self.max_len:
46
+ smiles = smiles[:self.max_len]
47
+
48
+ smiles=regex.findall(smiles)
49
+
50
+ scaffold += str('<')*(self.scaf_max_len - len(regex.findall(scaffold)))
51
+
52
+ if len(regex.findall(scaffold)) > self.scaf_max_len:
53
+ scaffold = scaffold[:self.scaf_max_len]
54
+
55
+ scaffold=regex.findall(scaffold)
56
+
57
+ dix = [self.stoi[s] for s in smiles]
58
+ sca_dix = [self.stoi[s] for s in scaffold]
59
+
60
+ sca_tensor = torch.tensor(sca_dix, dtype=torch.long)
61
+ x = torch.tensor(dix[:-1], dtype=torch.long)
62
+ y = torch.tensor(dix[1:], dtype=torch.long)
63
+ # prop = torch.tensor([prop], dtype=torch.long)
64
+ prop = torch.tensor([prop], dtype = torch.float)
65
+ return x, y, prop, sca_tensor
tool/comget/generator.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import pandas as pd
3
+
4
+ import math
5
+ from tqdm import tqdm
6
+ import argparse
7
+ from .model import GPT, GPTConfig
8
+ import pandas as pd
9
+ import torch
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ #import seaborn as sns
13
+ from .moses.utils import get_mol
14
+ import re
15
+
16
+ import json
17
+ from rdkit.Chem import RDConfig
18
+
19
+ import selfies as sf
20
+ import os
21
+ import sys
22
+ sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
23
+ from .utils import sample, canonic_smiles
24
+ import sascorer
25
+ from rdkit import Chem
26
+ from rdkit.Chem.rdMolDescriptors import CalcTPSA
27
+ import os
28
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
29
+
30
+ def get_selfie_and_smiles_encodings_for_dataset(smiles):
31
+ """
32
+ Returns encoding, alphabet and length of largest molecule in SMILES and
33
+ SELFIES, given a file containing SMILES molecules.
34
+
35
+ input:
36
+ csv file with molecules. Column's name must be 'smiles'.
37
+ output:
38
+ - selfies encoding
39
+ - selfies alphabet
40
+ - longest selfies string
41
+ - smiles encoding (equivalent to file content)
42
+ - smiles alphabet (character based)
43
+ - longest smiles string
44
+ """
45
+
46
+ smiles_list = np.asanyarray(smiles)
47
+
48
+ smiles_alphabet = list(set("".join(smiles_list)))
49
+ smiles_alphabet.append(" ") # for padding
50
+
51
+ largest_smiles_len = len(max(smiles_list, key=len))
52
+
53
+ print("--> Translating SMILES to SELFIES...")
54
+ selfies_list = list(map(sf.encoder, smiles_list))
55
+
56
+ all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
57
+ all_selfies_symbols.add("[nop]")
58
+ selfies_alphabet = list(all_selfies_symbols)
59
+
60
+ largest_selfies_len = max(sf.len_selfies(s) for s in selfies_list)
61
+
62
+ print("Finished translating SMILES to SELFIES.")
63
+
64
+ return selfies_list, selfies_alphabet, largest_selfies_len, \
65
+ smiles_list, smiles_alphabet, largest_smiles_len
66
+
67
+ def generation(value):
68
+ parser = argparse.ArgumentParser()
69
+ #parser.add_argument('--model_weight', type=str, help="path of model weights", required=True)
70
+ parser.add_argument('--scaffold', action='store_true', default=False, help='condition on scaffold')
71
+ parser.add_argument('--lstm', action='store_true', default=False, help='use lstm for transforming scaffold')
72
+ #parser.add_argument('--csv_name', type=str, help="name to save the generated mols in csv format", required=True)
73
+ parser.add_argument('--data_name', type=str, default = 'moses2', help="name of the dataset to train on", required=False)
74
+ parser.add_argument('--batch_size', type=int, default = 512, help="batch size", required=False)
75
+ parser.add_argument('--gen_size', type=int, default = 10000, help="number of times to generate from a batch", required=False)
76
+ parser.add_argument('--vocab_size', type=int, default = 26, help="number of layers", required=False) # previously 28 .... 26 for moses. 94 for guacamol
77
+ parser.add_argument('--block_size', type=int, default = 54, help="number of layers", required=False) # previously 57... 54 for moses. 100 for guacamol.
78
+ # parser.add_argument('--num_props', type=int, default = 0, help="number of properties to use for condition", required=False)
79
+ parser.add_argument('--props', nargs="+", default = [], help="properties to be used for condition", required=False)
80
+ parser.add_argument('--n_layer', type=int, default = 8, help="number of layers", required=False)
81
+ parser.add_argument('--n_head', type=int, default = 8, help="number of heads", required=False)
82
+ parser.add_argument('--n_embd', type=int, default = 256, help="embedding dimension", required=False)
83
+ parser.add_argument('--lstm_layers', type=int, default = 2, help="number of layers in lstm", required=False)
84
+
85
+ args = parser.parse_args()
86
+ args.data_name = 'ppcenos'
87
+ args.vocab_size = 29 #
88
+ args.block_size = 196 #max_len
89
+ args.gen_size = 20
90
+ args.batch_size = 5
91
+ args.csv_name = 'ppcenos'
92
+ args.props = ['pce']
93
+ context = "[C]"
94
+ args.scaffold = False
95
+
96
+
97
+ pattern = "(\[[^\]]+]|<|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
98
+ regex = re.compile(pattern)
99
+
100
+ if ('moses' in args.data_name) and args.scaffold:
101
+ scaffold_max_len=48
102
+ elif ('guacamol' in args.data_name):
103
+ scaffold_max_len = 107
104
+ else:
105
+ scaffold_max_len = 181
106
+
107
+
108
+ stoi = json.load(open('tool/comget/' + f'{args.data_name}.json', 'r'))
109
+
110
+ # itos = { i:ch for i,ch in enumerate(chars) }
111
+ itos = { i:ch for ch,i in stoi.items() }
112
+
113
+
114
+ print(len(itos))
115
+
116
+
117
+ num_props = len(args.props)
118
+ mconf = GPTConfig(args.vocab_size, args.block_size, num_props = num_props,
119
+ n_layer=args.n_layer, n_head=args.n_head, n_embd=args.n_embd, scaffold = args.scaffold, scaffold_maxlen = scaffold_max_len,
120
+ lstm = args.lstm, lstm_layers = args.lstm_layers)
121
+ model = GPT(mconf)
122
+
123
+ args.model_weight = f'{args.csv_name}.pt'
124
+ model.load_state_dict(torch.load('tool/comget/' + args.model_weight))
125
+ model.to('cuda')
126
+ print('Model loaded')
127
+
128
+ gen_iter = math.ceil(args.gen_size / args.batch_size)
129
+ # gen_iter = 2
130
+
131
+ if 'guacamol1' in args.data_name:
132
+ prop2value = {'qed': [0.3, 0.5, 0.7], 'sas': [2.0, 3.0, 4.0], 'logp': [2.0, 4.0, 6.0], 'tpsa': [40.0, 80.0, 120.0],
133
+ 'tpsa_logp': [[40.0, 2.0], [80.0, 2.0], [120.0, 2.0], [40.0, 4.0], [80.0, 4.0], [120.0, 4.0], [40.0, 6.0], [80.0, 6.0], [120.0, 6.0]],
134
+ 'sas_logp': [[2.0, 2.0], [2.0, 4.0], [2.0, 6.0], [3.0, 2.0], [3.0, 4.0], [3.0, 6.0], [4.0, 2.0], [4.0, 4.0], [4.0, 6.0]],
135
+ 'tpsa_sas': [[40.0, 2.0], [80.0, 2.0], [120.0, 2.0], [40.0, 3.0], [80.0, 3.0], [120.0, 3.0], [40.0, 4.0], [80.0, 4.0], [120.0, 4.0]],
136
+ 'tpsa_logp_sas': [[40.0, 2.0, 2.0], [40.0, 2.0, 4.0], [40.0, 6.0, 4.0], [40.0, 6.0, 2.0], [80.0, 6.0, 4.0], [80.0, 2.0, 4.0], [80.0, 2.0, 2.0], [80.0, 6.0, 2.0]]}
137
+ else:
138
+ prop2value = { 'pce': [float(value)]}
139
+
140
+
141
+ prop_condition = None
142
+ if len(args.props) > 0:
143
+ prop_condition = prop2value['_'.join(args.props)]
144
+
145
+ scaf_condition = None
146
+
147
+
148
+ all_dfs = []
149
+ all_metrics = []
150
+
151
+
152
+ count = 0
153
+
154
+ if prop_condition is not None and scaf_condition is None :
155
+
156
+ for c in prop_condition:
157
+ molecules = []
158
+ selfies = []
159
+ count += 1
160
+ for i in tqdm(range(gen_iter)):
161
+ x = torch.tensor([stoi[s] for s in regex.findall(context)], dtype=torch.long)[None,...].repeat(args.batch_size, 1).to('cuda')
162
+ p = None
163
+ if len(args.props) == 1:
164
+ p = torch.tensor([c]).repeat(args.batch_size, 1).to('cuda') # for single condition
165
+ else:
166
+ p = torch.tensor([c]).repeat(args.batch_size, 1).unsqueeze(1).to('cuda') # for multiple conditions
167
+ sca = None
168
+ y = sample(model, x, 300, temperature= 1.0, sample=True, top_k = 10, prop = p, scaffold = sca)
169
+ for gen_mol in y:
170
+ completion = ''.join([itos[int(i)] for i in gen_mol])
171
+ completion = completion.replace('<', '')
172
+ selfies.append(completion)
173
+ file = pd.DataFrame(selfies)
174
+
175
+ for ind, i in enumerate( file[0]):
176
+
177
+ smi = (sf.decoder(eval(repr(i))))
178
+ mol = get_mol(smi)
179
+ # gen_smiles.append(completion)
180
+
181
+ if mol:
182
+
183
+ molecules.append(mol)
184
+ else:
185
+ print(ind)
186
+ print(i)
187
+
188
+ "Valid molecules % = {}".format(len(molecules))
189
+
190
+ mol_dict = []
191
+
192
+
193
+ for i in molecules:
194
+ mol_dict.append({'molecule' : i, 'smiles': Chem.MolToSmiles(i)})
195
+
196
+ # for i in gen_smiles:
197
+ # mol_dict.append({'temperature' : temp, 'smiles': i})
198
+
199
+
200
+ results = pd.DataFrame(mol_dict)
201
+
202
+ all_dfs.append(results)
203
+
204
+ results = pd.concat(all_dfs)
205
+
206
+ return results
tool/comget/model.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPT model:
3
+ - the initial stem consists of a combination of token encoding and a positional encoding
4
+ - the meat of it is a uniform sequence of Transformer blocks
5
+ - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block
6
+ - all blocks feed into a central residual pathway similar to resnets
7
+ - the final decoder is a linear projection into a vanilla Softmax classifier
8
+ """
9
+
10
+ import math
11
+ import logging
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from torch.nn import functional as F
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    # Default dropout rates shared by every variant.
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, block_size, **kwargs):
        self.vocab_size = vocab_size
        self.block_size = block_size
        # Every extra keyword argument becomes an attribute on the config.
        for name, value in kwargs.items():
            setattr(self, name, value)
31
class GPT1Config(GPTConfig):
    """ GPT-1 like network roughly 125M params """
    # Transformer dimensions matching the original GPT-1 architecture;
    # inherits vocab/block sizes and dropout rates from GPTConfig.
    n_layer = 12
    n_head = 12
    n_embd = 768
37
class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization.

    Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
    https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
    """

    def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
        super().__init__()
        # Learnable per-feature gain; initialized to 1 (identity scaling).
        self.scale = nn.Parameter(torch.ones(size))
        self.eps = eps
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Mean of squares is computed in float32 for numerical stability,
        # then the result is cast back to the input dtype.
        mean_sq = x.to(torch.float32).pow(2).mean(dim=self.dim, keepdim=True)
        normalized = x * torch.rsqrt(mean_sq + self.eps)
        return (self.scale * normalized).type_as(x)
60
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.
    """

    def __init__(self, config):
        super().__init__()
        # Head dimension must divide the embedding evenly.
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        # NOTE(review): self.key/self.query/self.value and self.proj below are
        # never used by forward() (q_proj/kv_proj/c_proj are used instead) —
        # they look like dead parameters kept for checkpoint compatibility; confirm.
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        self.q_proj = nn.Linear(
            config.n_embd ,
            config.n_embd ,
            bias=False,
        )
        # key, value projections (fused: output is split into k and v in forward)
        self.kv_proj = nn.Linear(
            config.n_embd ,
            2 * config.n_embd ,
            bias=False,
        )
        # output projection
        self.c_proj = nn.Linear(
            config.n_embd ,
            config.n_embd ,
            bias=False,
        )
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        # The mask is enlarged by `num` to cover the extra conditioning tokens
        # (property flag + scaffold length) prepended to the sequence.
        num = int(bool(config.num_props)) + int(config.scaffold_maxlen) #int(config.lstm_layers) # int(config.scaffold)
        # num = 1
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size + num, config.block_size + num))
                                     .view(1, 1, config.block_size + num, config.block_size + num))

        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x, layer_past=None):
        # x: (batch B, sequence T, embedding C)
        B, T, C = x.size()

        q = self.q_proj(x)
        k, v = self.kv_proj(x).split(self.n_embd, dim=2)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        # y = F.scaled_dot_product_attention(
        #     q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True
        # )
        # Scaled dot-product scores, masked so position t only sees <= t.
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        # Keep the pre-dropout attention map so callers can inspect it.
        attn_save = att
        att = self.attn_drop(att)
        y = att @ v
        # Re-assemble all head outputs side by side: (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.c_proj(y)

        # Returns (output, attention weights before dropout).
        return y, attn_save
138
def find_multiple(n , k ) :
    """Return the smallest multiple of k that is greater than or equal to n."""
    remainder = n % k
    return n if remainder == 0 else n + (k - remainder)
142
+
143
+
144
class MLP(nn.Module):
    """SwiGLU-style feed-forward block: silu(W1 x) * (W2 x), projected back."""

    def __init__(self, config ) :
        super().__init__()
        # Hidden width: 2/3 of the base width, rounded up to a multiple of 256
        # (LLaMA-style sizing; base width here is 4 * n_embd * n_head).
        base_width = 4 * config.n_embd * config.n_head
        n_hidden = find_multiple(int(2 * base_width / 3), 256)

        self.c_fc1 = nn.Linear(config.n_embd, n_hidden, bias=False)
        self.c_fc2 = nn.Linear(config.n_embd, n_hidden, bias=False)
        self.c_proj = nn.Linear(n_hidden, config.n_embd, bias=False)

    def forward(self, x):
        gated = F.silu(self.c_fc1(x)) * self.c_fc2(x)
        return self.c_proj(gated)
165
+
166
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.rms_1 = RMSNorm(config.n_embd )
        self.rms_2 = RMSNorm(config.n_embd )
        # NOTE: the LayerNorms below are registered but forward() only uses
        # the RMSNorm instances; kept as-is for state-dict compatibility.
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-norm residual attention, then pre-norm residual MLP.
        attn_out, attn_weights = self.attn(self.rms_1(x))
        x = x + attn_out
        x = x + self.mlp(self.rms_2(x))
        return x, attn_weights
182
+
183
class GPT(nn.Module):
    """ the full GPT language model, with a context size of block_size """

    def __init__(self, config):
        super().__init__()

        # input embedding stem
        self.config = config
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        # type embedding distinguishes conditioning tokens (type 0) from
        # ordinary sequence tokens (type 1); see forward().
        self.type_emb = nn.Embedding(2, config.n_embd)
        if config.num_props:
            # projects the property vector into the embedding space
            self.prop_nn = nn.Linear(config.num_props, config.n_embd)

        # learned absolute position embeddings
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        # decoder head
        self.ln_f = RMSNorm(config.n_embd )
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.block_size = config.block_size

        if config.lstm:
            # optional LSTM encoder used to summarise scaffold tokens
            self.lstm = nn.LSTM(input_size = config.n_embd, hidden_size = config.n_embd, num_layers = config.lstm_layers, dropout = 0.3, bidirectional = False)
        self.apply(self._init_weights)

        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def get_block_size(self):
        # maximum sequence length supported by pos_emb and the causal mask
        return self.block_size

    def _init_weights(self, module):
        # GPT-2-style init, scaled down with depth
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(
                module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer)
            )
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(
                module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer)
            )

    def configure_optimizers(self, parameters, train_config):
        # plain AdamW over the given parameter set (no weight-decay grouping)
        optimizer = torch.optim.AdamW(parameters, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None, prop = None, scaffold = None):
        """Run the model.

        Args:
            idx: (b, t) token indices.
            targets: optional (b, t) token indices for the LM loss.
            prop: optional property vector(s) prepended as conditioning.
            scaffold: optional scaffold token indices prepended as conditioning.

        Returns:
            (logits, loss, attn_maps); logits covers only the real sequence
            positions — conditioning slots are stripped before the loss.
        """
        b, t = idx.size()

        assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        if self.config.num_props:
            assert prop.size(-1) == self.config.num_props, "Num_props should be equal to last dim of property vector"

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        # ordinary sequence tokens get type id 1
        type_embeddings = self.type_emb(torch.ones((
            b,t), dtype = torch.long, device = idx.device))
        x = self.drop(token_embeddings + position_embeddings + type_embeddings)

        if self.config.num_props:
            # conditioning tokens get type id 0 and are prepended to x
            type_embd = self.type_emb(torch.zeros((b, 1), dtype = torch.long, device = idx.device))
            if prop.ndim == 2:
                p = self.prop_nn(prop.unsqueeze(1)) # for single property
            else:
                p = self.prop_nn(prop) # for multiproperty
            p += type_embd
            x = torch.cat([p, x], 1)

        if self.config.scaffold:
            type_embd = self.type_emb(torch.zeros((b, 1), dtype = torch.long, device = idx.device))

            scaffold_embeds = self.tok_emb(scaffold) # .mean(1, keepdim = True)
            if self.config.lstm:
                # use the LSTM final hidden state(s) as the scaffold encoding
                scaffold_embeds = self.lstm(scaffold_embeds.permute(1,0,2))[1][0]
                scaffold_embeds = scaffold_embeds.permute(1,0,2) # mean(0, keepdim = True)
            scaffold_embeds += type_embd
            x = torch.cat([scaffold_embeds, x], 1)

        # run the transformer blocks, collecting per-layer attention maps
        attn_maps = []

        for layer in self.blocks:
            x, attn = layer(x)
            attn_maps.append(attn)

        x = self.ln_f(x)
        logits = self.head(x)

        # number of conditioning slots prepended above; strip them from logits
        if self.config.num_props and self.config.scaffold:
            num = int(bool(self.config.num_props)) + int(self.config.scaffold_maxlen)
        elif self.config.num_props:
            num = int(bool(self.config.num_props))
        elif self.config.scaffold:
            num = int(self.config.scaffold_maxlen)
        else:
            num = 0

        logits = logits[:, num:, :]

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.view(-1))

        return logits, loss, attn_maps # (num_layers, batch_size, num_heads, max_seq_len, max_seq_len)
tool/comget/ppcenos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<": 0, "[#Branch1]": 1, "[#Branch2]": 2, "[#C]": 3, "[#N]": 4, "[=Branch1]": 5, "[=Branch2]": 6, "[=C]": 7, "[=N]": 8, "[=O]": 9, "[=Ring1]": 10, "[=Ring2]": 11, "[=S]": 12, "[Branch1]": 13, "[Branch2]": 14, "[C]": 15, "[Cl]": 16, "[F]": 17, "[GeH2]": 18, "[Ge]": 19, "[NH1]": 20, "[N]": 21, "[O]": 22, "[P]": 23, "[Ring1]": 24, "[Ring2]": 25, "[S]": 26, "[Se]": 27, "[nop]": 28}
tool/comget/ppcenos.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddee4e16df14ee00e9736c66755977774edf1259c46afb03469c99ca7659fbf5
3
+ size 160173846
tool/comget/utils.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from .moses.utils import get_mol
7
+ from rdkit import Chem
8
+
9
+ import numpy as np
10
+ import threading
11
+
12
def set_seed(seed):
    """Seed the python, numpy and torch (all GPUs) RNGs for reproducibility."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
17
+
18
def top_k_logits(logits, k):
    """Mask everything below the k-th largest logit per row with -inf."""
    topk_vals, _ = torch.topk(logits, k)
    threshold = topk_vals[:, [-1]]  # k-th largest value, kept as a column
    clipped = logits.clone()
    clipped[clipped < threshold] = -float('Inf')
    return clipped
23
+
24
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None, prop = None, scaffold = None):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time. Clearly the sampling
    has quadratic complexity unlike an RNN that is only linear, and has a finite context window
    of block_size, unlike an RNN that has an infinite context window.

    Args:
        model: GPT-style model returning (logits, loss, attn_maps).
        x: (b, t) conditioning token indices.
        steps: number of new tokens to generate.
        temperature: logit divisor; >1 flattens, <1 sharpens the distribution.
        sample: if True sample from the distribution, else take the argmax.
        top_k: if set, restrict to the k most likely tokens before sampling.
        prop, scaffold: optional conditioning forwarded to the model.

    Returns:
        x extended with `steps` generated tokens, shape (b, t + steps).
    """
    block_size = model.get_block_size()
    model.eval()

    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        logits, _, _ = model(x_cond, prop = prop, scaffold = scaffold) # for liggpt
        # logits, _, _ = model(x_cond) # for char_rnn
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)

    return x
55
+
56
def check_novelty(gen_smiles, train_smiles): # gen: say 788, train: 120803
    """Return the percentage of generated SMILES not present in the training set.

    Args:
        gen_smiles: list of generated SMILES strings.
        train_smiles: collection of training SMILES strings.

    Returns:
        Novelty ratio in percent (0.0 when nothing was generated).
    """
    if len(gen_smiles) == 0:
        novel_ratio = 0.
    else:
        # Build a set once so each membership test is O(1) instead of scanning
        # the (potentially very large) training list per generated molecule.
        train_set = set(train_smiles)
        duplicates = sum(1 for mol in gen_smiles if mol in train_set) # e.g. 45
        novel = len(gen_smiles) - duplicates # 788-45=743
        novel_ratio = novel*100./len(gen_smiles) # 743*100/788=94.289
    print("novelty: {:.3f}%".format(novel_ratio))
    return novel_ratio
65
+
66
def canonic_smiles(smiles_or_mol):
    """Canonicalise a SMILES string or Mol; return None if it cannot be parsed."""
    mol = get_mol(smiles_or_mol)
    return None if mol is None else Chem.MolToSmiles(mol)
71
+
72
+ #Experimental Class for Smiles Enumeration, Iterator and SmilesIterator adapted from Keras 1.2.2
73
+
74
class Iterator(object):
    """Abstract base class for data iterators.
    # Arguments
        n: Integer, total number of samples in the dataset to loop over.
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seeding for data shuffling.

    Subclasses must provide a `next()` method (see SmilesIterator); __next__
    delegates to it.
    """

    def __init__(self, n, batch_size, shuffle, seed):
        self.n = n
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.batch_index = 0          # index of the next batch within the current epoch
        self.total_batches_seen = 0   # monotonically increases across epochs
        self.lock = threading.Lock()  # guards index generation across threads
        self.index_generator = self._flow_index(n, batch_size, shuffle, seed)
        if n < batch_size:
            raise ValueError('Input data length is shorter than batch_size\nAdjust batch_size')

    def reset(self):
        # Restart the epoch bookkeeping (does not reset total_batches_seen).
        self.reset.__doc__
        self.batch_index = 0

    def _flow_index(self, n, batch_size=32, shuffle=False, seed=None):
        # Infinite generator yielding (index_array_slice, current_index,
        # current_batch_size); the last batch of an epoch may be short.
        # Ensure self.batch_index is 0.
        self.reset()
        while 1:
            if seed is not None:
                # Re-seed per batch so shuffling is reproducible yet varies
                # across batches.
                np.random.seed(seed + self.total_batches_seen)
            if self.batch_index == 0:
                # start of a new epoch: build (optionally shuffled) index order
                index_array = np.arange(n)
                if shuffle:
                    index_array = np.random.permutation(n)

            current_index = (self.batch_index * batch_size) % n
            if n > current_index + batch_size:
                current_batch_size = batch_size
                self.batch_index += 1
            else:
                # final (possibly short) batch of the epoch
                current_batch_size = n - current_index
                self.batch_index = 0
            self.total_batches_seen += 1
            yield (index_array[current_index: current_index + current_batch_size],
                   current_index, current_batch_size)

    def __iter__(self):
        # Needed if we want to do something like:
        # for x, y in data_gen.flow(...):
        return self

    def __next__(self, *args, **kwargs):
        # Delegates to the subclass-provided next() (python 2 style API).
        return self.next(*args, **kwargs)
126
+
127
+
128
+
129
+
130
class SmilesIterator(Iterator):
    """Iterator yielding data from a SMILES array.
    # Arguments
        x: Numpy array of SMILES input data.
        y: Numpy array of targets data.
        smiles_data_generator: Instance of `SmilesEnumerator`
            to use for random SMILES generation.
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seed for data shuffling.
        dtype: dtype to use for returned batch. Set to keras.backend.floatx if using Keras
    """

    def __init__(self, x, y, smiles_data_generator,
                 batch_size=32, shuffle=False, seed=None,
                 dtype=np.float32
                 ):
        if y is not None and len(x) != len(y):
            raise ValueError('X (images tensor) and y (labels) '
                             'should have the same length. '
                             'Found: X.shape = %s, y.shape = %s' %
                             (np.asarray(x).shape, np.asarray(y).shape))

        self.x = np.asarray(x)

        if y is not None:
            self.y = np.asarray(y)
        else:
            self.y = None
        self.smiles_data_generator = smiles_data_generator
        self.dtype = dtype
        super(SmilesIterator, self).__init__(x.shape[0], batch_size, shuffle, seed)

    def next(self):
        """For python 2.x.
        # Returns
            The next batch: one-hot array (batch, pad, charlen), optionally
            paired with the matching slice of y.
        """
        # Keeps under lock only the mechanism which advances
        # the indexing of each batch.
        with self.lock:
            index_array, current_index, current_batch_size = next(self.index_generator)
        # The transformation of images is not under thread lock
        # so it can be done in parallel
        batch_x = np.zeros(tuple([current_batch_size] + [ self.smiles_data_generator.pad, self.smiles_data_generator._charlen]), dtype=self.dtype)
        for i, j in enumerate(index_array):
            # slice (not index) so transform() receives a 1-element array
            smiles = self.x[j:j+1]
            x = self.smiles_data_generator.transform(smiles)
            batch_x[i] = x

        if self.y is None:
            return batch_x
        batch_y = self.y[index_array]
        return batch_x, batch_y
184
+
185
+
186
class SmilesEnumerator(object):
    """SMILES Enumerator, vectorizer and devectorizer

    #Arguments
        charset: string containing the characters for the vectorization
          can also be generated via the .fit() method
        pad: Length of the vectorization
        leftpad: Add spaces to the left of the SMILES
        isomericSmiles: Generate SMILES containing information about stereogenic centers
        enum: Enumerate the SMILES during transform
        canonical: use canonical SMILES during transform (overrides enum)
    """
    def __init__(self, charset = '@C)(=cOn1S2/H[N]\\', pad=120, leftpad=True, isomericSmiles=True, enum=True, canonical=False):
        self._charset = None
        # assignment goes through the property setter, building lookup tables
        self.charset = charset
        self.pad = pad
        self.leftpad = leftpad
        self.isomericSmiles = isomericSmiles
        self.enumerate = enum
        self.canonical = canonical

    @property
    def charset(self):
        return self._charset

    @charset.setter
    def charset(self, charset):
        # Rebuild char<->index lookup tables whenever the charset changes.
        self._charset = charset
        self._charlen = len(charset)
        self._char_to_int = dict((c,i) for i,c in enumerate(charset))
        self._int_to_char = dict((i,c) for i,c in enumerate(charset))

    def fit(self, smiles, extra_chars=[], extra_pad = 5):
        """Performs extraction of the charset and length of a SMILES datasets and sets self.pad and self.charset

        #Arguments
            smiles: Numpy array or Pandas series containing smiles as strings
            extra_chars: List of extra chars to add to the charset (e.g. "\\\\" when "/" is present)
            extra_pad: Extra padding to add before or after the SMILES vectorization
        """
        # extra_chars has a mutable default, but it is never mutated here.
        charset = set("".join(list(smiles)))
        self.charset = "".join(charset.union(set(extra_chars)))
        self.pad = max([len(smile) for smile in smiles]) + extra_pad

    def randomize_smiles(self, smiles):
        """Perform a randomization of a SMILES string
        must be RDKit sanitizable"""
        # Shuffle atom order and re-emit; gives an alternative valid SMILES
        # for the same molecule.
        m = Chem.MolFromSmiles(smiles)
        ans = list(range(m.GetNumAtoms()))
        np.random.shuffle(ans)
        nm = Chem.RenumberAtoms(m,ans)
        return Chem.MolToSmiles(nm, canonical=self.canonical, isomericSmiles=self.isomericSmiles)

    def transform(self, smiles):
        """Perform an enumeration (randomization) and vectorization of a Numpy array of smiles strings
        #Arguments
            smiles: Numpy array or Pandas series containing smiles as strings

        Returns a one-hot array of shape (len(smiles), pad, charlen).
        """
        one_hot = np.zeros((smiles.shape[0], self.pad, self._charlen),dtype=np.int8)

        if self.leftpad:
            for i,ss in enumerate(smiles):
                if self.enumerate: ss = self.randomize_smiles(ss)
                l = len(ss)
                diff = self.pad - l
                # left-pad: characters written at the right end of the row
                for j,c in enumerate(ss):
                    one_hot[i,j+diff,self._char_to_int[c]] = 1
            return one_hot
        else:
            for i,ss in enumerate(smiles):
                if self.enumerate: ss = self.randomize_smiles(ss)
                for j,c in enumerate(ss):
                    one_hot[i,j,self._char_to_int[c]] = 1
            return one_hot


    def reverse_transform(self, vect):
        """ Performs a conversion of a vectorized SMILES to a smiles strings
        charset must be the same as used for vectorization.
        #Arguments
            vect: Numpy array of vectorized SMILES.
        """
        smiles = []
        for v in vect:
            # mask v: drop padding rows (rows whose one-hot sum is not 1)
            v=v[v.sum(axis=1)==1]
            # Find one hot encoded index with argmax, translate to char and join to string
            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
            smiles.append(smile)
        return np.array(smiles)
tool/converters.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.tools import BaseTool
2
+
3
+ from tool.chemspace import ChemSpace
4
+ import pandas as pd
5
+
6
+ from utils import (
7
+ is_multiple_smiles,
8
+ is_smiles,
9
+ pubchem_query2smiles,
10
+ query2cas,
11
+ smiles2name,
12
+ )
13
+
14
+
15
class Query2CAS(BaseTool):
    # Tool metadata consumed by the LangChain agent.
    name:str = "Mol2CAS"
    description:str = "Input molecule (name or SMILES), returns CAS number."
    url_cid: str = None
    url_data: str = None


    def __init__(
        self,
    ):
        super().__init__()
        # PubChem REST endpoints: the first resolves a CID, the second fetches
        # the full compound record for that CID.
        self.url_cid = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/{}/cids/JSON"
        )
        self.url_data = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/JSON"
        )

    def _run(self, query: str) -> str:
        """Resolve a molecule name or SMILES to a CAS number via PubChem."""
        try:
            # if query is smiles
            smiles = None
            if is_smiles(query):
                smiles = query
            try:
                cas = query2cas(query, self.url_cid, self.url_data)
            except ValueError as e:
                return str(e)
            if smiles is None:
                # NOTE(review): url=None is passed to pubchem_query2smiles and
                # the resolved SMILES is unused, so this round-trip only
                # validates the CAS — confirm the helper accepts a None url.
                try:
                    smiles = pubchem_query2smiles(cas, None)
                except ValueError as e:
                    return str(e)

            return cas
        except ValueError:
            return "CAS number not found"

    async def _arun(self, query: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+ raise NotImplementedError()
56
+
57
+
58
class Query2SMILES(BaseTool):
    name:str = "CAS2SMILES"
    description :str = "Input a CAS number, returns SMILES."
    url: str = None
    chemspace_api_key: str = None

    def __init__(self, chemspace_api_key: str = None):
        super().__init__()
        self.chemspace_api_key = chemspace_api_key
        self.url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"

    def _run(self, query: str) -> str:
        """Resolve a molecule name / CAS number to a SMILES string.

        Tries PubChem first; on failure falls back to ChemSpace when an API
        key was supplied, otherwise returns the PubChem error message.
        """
        if is_smiles(query) and is_multiple_smiles(query):
            return "Multiple SMILES strings detected, input one molecule at a time."
        try:
            smi = pubchem_query2smiles(query, self.url)
        except Exception as e:
            if not self.chemspace_api_key:
                # BUG FIX: the original referenced an undefined `chemspace`
                # variable here, which always raised a NameError that was then
                # swallowed; without an API key the only sensible behavior is
                # to report the PubChem error (which is what effectively
                # happened before).
                return str(e)
            try:
                chemspace = ChemSpace(self.chemspace_api_key)
                smi = chemspace.convert_mol_rep(query, "smiles")
                smi = smi.split(":")[1]
            except Exception:
                return str(e)

        return smi

    async def _arun(self, query: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
98
+
99
class Mol2SMILES(BaseTool):
    name:str = "Mol2SMILES"
    description :str = "Input a molecular name , returns SMILES."

    def __init__(self, chemspace_api_key: str = None):
        super().__init__()

    def _run(self, query: str) -> str:
        """Resolve a molecule name to SMILES via PubChem, falling back to a local CSV.

        Only query with one specific name at a time.
        """
        if is_smiles(query) and is_multiple_smiles(query):
            return "Multiple SMILES strings detected, input one molecule at a time."
        try:
            smi = pubchem_query2smiles(query )
            return smi
        except Exception as e:
            # Fallback: local Name -> SMILES lookup table.
            try:
                csv_data = pd.read_csv('tool/dataset.csv',encoding='ISO-8859-1')
                relevant_rows = csv_data[csv_data['Name']==(query)]
                if not relevant_rows.empty:
                    # Get the most relevant answer (assuming we return the first match)
                    return relevant_rows.iloc[0]['SMILES']
            except Exception:
                # BUG FIX: was a bare `except:`; narrowed so KeyboardInterrupt
                # and SystemExit are not swallowed.
                pass
            # BUG FIX: the original fell through and implicitly returned None
            # when the CSV had no matching row; surface the PubChem error
            # message instead, matching the declared `-> str` contract.
            return str(e)

    async def _arun(self, query: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
127
+
128
class SMILES2Name(BaseTool):
    name:str = "SMILES2Name"
    description:str = "Input SMILES, returns molecule name."



    def __init__(self):
        super().__init__()

    def _run(self, query: str) -> str:
        """Convert a SMILES string (or a name resolvable to SMILES) to a molecule name."""
        try:
            if not is_smiles(query):
                # The input may be a name/CAS; try to resolve it to SMILES first.
                try:
                    query2smiles = Query2SMILES()
                    query = query2smiles.run(query)
                except Exception:
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.
                    raise ValueError("Invalid molecule input, no Pubchem entry")
            name = smiles2name(query)

            return name
        except Exception as e:
            return "Error: " + str(e)

    async def _arun(self, query: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
154
+ raise NotImplementedError()
tool/csv_search.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Dec 23 16:18:29 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+ from langchain.tools import BaseTool
9
+ import pandas as pd
10
+
11
class search_csv(BaseTool):
    # BUG FIX: BaseTool is a pydantic model, so class fields must carry type
    # annotations; the original left `name`/`description` unannotated and
    # annotated `llm` with BaseLanguageModel, which is not imported in this
    # file and raised a NameError at import time. The unused `llm` field is
    # removed; the API-key fields are kept (annotated) for compatibility.
    name: str = "csvsearch"
    description: str = (
        "input name, return the SMILES of materials "
        "convert name to SMILES."
    )
    openai_api_key: str = None
    semantic_scholar_api_key: str = None

    def __init__(self):
        super().__init__()

    def _run(self, smiles: str) -> str:
        """Look up a material name in dataset.csv and return its SMILES (or None)."""
        csv_data = pd.read_csv('dataset.csv',encoding='ISO-8859-1')
        # BUG FIX: the original filtered on an undefined `query` variable;
        # the tool's input parameter is `smiles` (the material name).
        relevant_rows = csv_data[csv_data['Name']==(smiles)]

        if not relevant_rows.empty:
            # Get the most relevant answer (assuming we return the first match)
            return relevant_rows.iloc[0]['SMILES']
        return None

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
34
+
tool/dap/.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
tool/dap/OSC/test.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1081233b2f0b3c77752a98b3c9e4ae065cb21aae4e3e5d31f8d673a1c2069ded
3
+ size 81596523
tool/dap/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # database
tool/dap/config/config_hparam.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "name": "biomarker_log",
2
+
3
+ "d_model_name" : "DeepChem/ChemBERTa-10M-MTR",
4
+ "p_model_name" : "DeepChem/ChemBERTa-77M-MLM",
5
+ "gpu_ids" : "0",
6
+ "model_mode" : "train",
7
+ "load_checkpoint" : "./checkpoint/bindingDB/test.ckpt",
8
+
9
+ "prot_maxlength" : 360,
10
+ "layer_limit" : true,
11
+
12
+ "max_epoch": 16,
13
+ "batch_size": 40,
14
+ "num_workers": 0,
15
+
16
+ "task_name" : "OSC",
17
+ "lr": 1e-4,
18
+ "layer_features" : [512, 128, 64, 1],
19
+ "dropout" : 0.1,
20
+ "loss_fn" : "MSE",
21
+
22
+ "traindata_rate" : 1.0,
23
+ "pretrained": {"chem":true, "prot":true},
24
+ "num_seed" : 111
25
+ }
26
+
tool/dap/config/predict.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "name": "biomarker_log",
2
+
3
+ "d_model_name" : "DeepChem/ChemBERTa-10M-MLM",
4
+ "p_model_name" : "DeepChem/ChemBERTa-10M-MTR",
5
+ "gpu_ids" : "0",
6
+ "model_mode" : "test",
7
+ "load_checkpoint" : "tool/dap/OSC/test.ckpt",
8
+
9
+ "prot_maxlength" : 360,
10
+ "layer_limit" : true,
11
+
12
+ "max_epoch": 16,
13
+ "batch_size": 40,
14
+ "num_workers": 0,
15
+
16
+ "task_name" : "OSC",
17
+ "lr": 1e-4,
18
+ "layer_features" : [128, 128, 128, 1],
19
+ "dropout" : 0.1,
20
+ "loss_fn" : "MSE",
21
+
22
+ "traindata_rate" : 1.0,
23
+ "pretrained": {"chem":true, "prot":true},
24
+ "num_seed" : 111
25
+ }
26
+
tool/dap/requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair
2
+ streamlit
3
+ streamlit-ketcher
4
+ torch
5
+ tqdm
6
+ transformers
7
+ pytorch_lightning
8
+ scipy
9
+ pandas
10
+ rdkit
11
+ scikit-learn
12
+ matplotlib
13
+ easydict
14
+ wandb
15
+ networkx
16
+ seaborn
17
+
18
+
tool/dap/run.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+
4
+ import torch
5
+ from torch.nn import functional as F
6
+ from transformers import AutoTokenizer
7
+
8
+ from .util.utils import *
9
+ from rdkit import Chem
10
+ from tqdm import tqdm
11
+ from .train import markerModel
12
+
13
# Pin CUDA device enumeration to PCI bus order and expose GPU 0 only.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# BUG FIX: the value previously carried a trailing space ('0 '), which some
# CUDA builds reject when parsing the visible-device list.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

device_count = torch.cuda.device_count()
device_biomarker = torch.device('cuda' if torch.cuda.is_available() else "cpu")

device = torch.device('cpu')
a_model_name = 'DeepChem/ChemBERTa-10M-MLM'
d_model_name = 'DeepChem/ChemBERTa-10M-MTR'

tokenizer = AutoTokenizer.from_pretrained(a_model_name)
d_tokenizer = AutoTokenizer.from_pretrained(d_model_name)

#-- biomarker model: hyper-parameters come from the JSON config file.
config = load_hparams('tool/dap/config/predict.json')
config = DictX(config)

# Load weights directly from the checkpoint. Constructing a fresh markerModel
# first (as before) only downloaded pretrained encoders that were immediately
# discarded when this assignment replaced the instance.
model = markerModel.load_from_checkpoint(config.load_checkpoint, strict=False)
model.eval()
model.freeze()

if device_biomarker.type == 'cuda':
    model = torch.nn.DataParallel(model)
39
+
40
def get_marker(drug_inputs, prot_inputs):
    """Run the globally-loaded marker model on a tokenized pair and return the
    squeezed predictions as plain Python values."""
    raw_preds = model(drug_inputs, prot_inputs)
    return torch.squeeze(raw_preds).tolist()
50
+
51
+
52
def marker_prediction(smiles, aas):
    """Tokenize acceptor SMILES (`smiles`) and donor SMILES (`aas`), run the
    marker model, and return the squeezed prediction(s).

    Returns a float (single pair) or a list of floats; on failure an
    {'Error_message': str} dict is returned instead.
    """
    try:
        # Donor SMILES are space-separated per character before tokenization.
        aas_input = [' '.join(list(entry)) for entry in aas]

        a_tok = tokenizer(smiles, padding='max_length', max_length=510, truncation=True, return_tensors="pt")
        a_inputs = {'input_ids': a_tok['input_ids'].to(device),
                    'attention_mask': a_tok['attention_mask'].to(device)}

        d_tok = d_tokenizer(aas_input, padding='max_length', max_length=510, truncation=True, return_tensors="pt")
        d_inputs = {'input_ids': d_tok['input_ids'].to(device),
                    'attention_mask': d_tok['attention_mask'].to(device)}

        return get_marker(a_inputs, d_inputs)

    except Exception as e:
        print(e)
        # BUG FIX: stringify the exception so the error payload is serializable
        # (the exception object itself is not JSON/CSV friendly).
        return {'Error_message': str(e)}
78
+
79
+
80
def smiles_aas_test(smile_acc, smile_don):
    """Predict the marker value for a single acceptor/donor SMILES pair.

    Both SMILES are canonicalized with RDKit before prediction. Returns the
    scalar prediction from marker_prediction (or its error dict).
    Raises ValueError for SMILES that RDKit cannot parse.
    """
    mola = Chem.MolFromSmiles(smile_acc)
    if mola is None:
        # BUG FIX guard: MolFromSmiles returns None on invalid input, which
        # previously surfaced as an opaque AttributeError from MolToSmiles.
        raise ValueError(f"Invalid acceptor SMILES: {smile_acc!r}")
    mold = Chem.MolFromSmiles(smile_don)
    if mold is None:
        raise ValueError(f"Invalid donor SMILES: {smile_don!r}")

    smile_acc = Chem.MolToSmiles(mola, canonical=True)
    smile_don = Chem.MolToSmiles(mold, canonical=True)

    # The original batching machinery (batch_size=1 over exactly one pair)
    # always produced a single batch, so the loop collapses to one call.
    return marker_prediction([smile_acc], [smile_don])
118
+
119
+
120
+
121
# Smoke test: predict for one hard-coded acceptor/donor SMILES pair.
if __name__ == '__main__':

    a = smiles_aas_test('CC(C)CCCC(C)CCC1=C(/C=C2\C(=O)C3=C(C=C(F)C(F)=C3)C2=C(C#N)C#N)SC2=C1N(CCC(C)CCCC(C)C)C1=C2C2=NSN=C2C2=C1N(CCC(C)CCCC(C)C)C1=C2SC(/C=C2\C(=O)C3=C(C=C(F)C(F)=C3)C2=C(C#N)C#N)=C1CCC(C)CCCC(C)C','CCCCC(CC)CC1=C(F)C=C(C2=C3C=C(C4=CC=C(C5=C6C(=O)C7=C(CC(CC)CCCC)SC(CC(CC)CCCC)=C7C(=O)C6=C(C6=CC=C(C)S6)S5)S4)SC3=C(C3=CC(F)=C(CC(CC)CCCC)S3)C3=C2SC(C)=C3)S1')
124
+
tool/dap/screen.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+
4
+ import torch
5
+ from torch.nn import functional as F
6
+ from transformers import AutoTokenizer
7
+ from rdkit import Chem
8
+ from .util.utils import *
9
+
10
+ from tqdm import tqdm
11
+ from .train import markerModel
12
+
13
# Pin CUDA device enumeration to PCI bus order and expose GPU 0 only.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# BUG FIX: the value previously carried a trailing space ('0 '), which some
# CUDA builds reject when parsing the visible-device list.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

device_count = torch.cuda.device_count()
device_biberta = torch.device('cuda' if torch.cuda.is_available() else "cpu")

device = torch.device('cpu')
a_model_name = 'DeepChem/ChemBERTa-10M-MLM'
d_model_name = 'DeepChem/ChemBERTa-10M-MTR'

tokenizer = AutoTokenizer.from_pretrained(a_model_name)
d_tokenizer = AutoTokenizer.from_pretrained(d_model_name)

#-- biberta model: hyper-parameters come from the JSON config file.
config = load_hparams('tool/dap/config/predict.json')
config = DictX(config)

# Load weights straight from the checkpoint. Building a fresh markerModel
# first (as before) only downloaded pretrained encoders that were immediately
# discarded when this assignment replaced the instance.
model = markerModel.load_from_checkpoint(config.load_checkpoint, strict=False)
model.eval()
model.freeze()

if device_biberta.type == 'cuda':
    model = torch.nn.DataParallel(model)
+ model = torch.nn.DataParallel(model)
40
+
41
def get_biberta(drug_inputs, prot_inputs):
    """Forward a tokenized acceptor/donor pair through the loaded model and
    return the squeezed predictions as plain Python values."""
    raw_output = model(drug_inputs, prot_inputs)
    return torch.squeeze(raw_output).tolist()
51
+
52
+
53
def biberta_prediction(smiles, aas):
    """Tokenize acceptor SMILES (`smiles`) and donor SMILES (`aas`), run the
    model, and return one {'acceptor', 'donor', 'predict'} dict per pair.

    On failure an {'Error_message': str} dict is returned instead.
    """
    try:
        # Donor SMILES are space-separated per character before tokenization.
        aas_input = [' '.join(list(entry)) for entry in aas]

        a_tok = tokenizer(smiles, padding='max_length', max_length=510, truncation=True, return_tensors="pt")
        a_inputs = {'input_ids': a_tok['input_ids'].to(device),
                    'attention_mask': a_tok['attention_mask'].to(device)}

        d_tok = d_tokenizer(aas_input, padding='max_length', max_length=510, truncation=True, return_tensors="pt")
        d_inputs = {'input_ids': d_tok['input_ids'].to(device),
                    'attention_mask': d_tok['attention_mask'].to(device)}

        output_predict = get_biberta(a_inputs, d_inputs)

        return [{'acceptor': smiles[i], 'donor': aas[i], 'predict': output_predict[i]}
                for i in range(len(aas))]

    except Exception as e:
        print(e)
        # BUG FIX: stringify the exception so the error payload is serializable.
        return {'Error_message': str(e)}
80
+
81
+
82
def smiles_aas_test(file):
    """Batch-predict for a CSV file with 'donor' and 'acceptor' SMILES columns.

    Every SMILES is canonicalized with RDKit before prediction. Returns a list
    of {'acceptor', 'donor', 'predict'} dicts, or an {'Error_message': str}
    dict on failure.
    """
    try:
        smiles_aas = pd.read_csv(file)

        donor, acceptor = [], []
        for smi in smiles_aas['donor']:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # Guard: MolFromSmiles returns None for unparsable SMILES.
                raise ValueError(f"Invalid donor SMILES: {smi!r}")
            donor.append(Chem.MolToSmiles(mol))
        for smi in smiles_aas['acceptor']:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"Invalid acceptor SMILES: {smi!r}")
            acceptor.append(Chem.MolToSmiles(mol))

        # Single prediction call; the unused bookkeeping from the original
        # (batch_se, biberta_list, biberta_datas, the dead DataFrame) is gone.
        return biberta_prediction(acceptor, donor)

    except Exception as e:
        print(e)
        # BUG FIX: stringify the exception so the error payload is serializable.
        return {'Error_message': str(e)}
118
+
tool/dap/train.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
3
+
4
+ import gc, os
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from scipy.stats import pearsonr
9
+ from .util.utils import *
10
+ #from .util.attention_flow import *
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+
15
+ import sklearn as sk
16
+ from torch.utils.data import Dataset, DataLoader
17
+
18
+ import pytorch_lightning as pl
19
+ from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
20
+ from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
21
+ from transformers import AutoConfig, AutoTokenizer, RobertaModel, BertModel
22
+ from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
23
+
24
class markerDataset(Dataset):
    """Torch Dataset pairing acceptor/donor SMILES with a float regression label.

    Each item is tokenized on the fly with the two HuggingFace tokenizers and
    padded/truncated to a fixed length of 400 tokens.
    """

    def __init__(self, list_IDs, labels, df_dti, d_tokenizer, p_tokenizer):
        """Store the index list, label array, source dataframe and tokenizers."""
        self.labels = labels          # array-like of float labels, indexed by dataframe row
        self.list_IDs = list_IDs      # row indices of df_dti served by this dataset
        self.df = df_dti              # dataframe with 'acceptor' and 'donor' SMILES columns

        self.d_tokenizer = d_tokenizer   # tokenizer for acceptor SMILES
        self.p_tokenizer = p_tokenizer   # tokenizer for donor SMILES



    def convert_data(self, acc_data, don_data):
        # NOTE(review): unused helper; it also tokenizes don_data with
        # d_tokenizer rather than p_tokenizer — confirm before reusing.

        d_inputs = self.d_tokenizer(acc_data, return_tensors="pt")
        p_inputs = self.d_tokenizer(don_data, return_tensors="pt")

        acc_input_ids = d_inputs['input_ids']
        acc_attention_mask = d_inputs['attention_mask']
        acc_inputs = {'input_ids': acc_input_ids, 'attention_mask': acc_attention_mask}

        don_input_ids = p_inputs['input_ids']
        don_attention_mask = p_inputs['attention_mask']
        don_inputs = {'input_ids': don_input_ids, 'attention_mask': don_attention_mask}

        return acc_inputs, don_inputs

    def tokenize_data(self, acc_data, don_data):
        # Unused helper: token lists wrapped in BERT-style [CLS]/[SEP] markers.
        tokenize_acc = ['[CLS]'] + self.d_tokenizer.tokenize(acc_data) + ['[SEP]']

        tokenize_don = ['[CLS]'] + self.p_tokenizer.tokenize(don_data) + ['[SEP]']

        return tokenize_acc, tokenize_don

    def __len__(self):
        """Denotes the total number of samples."""
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Generate one sample: token ids/masks for both molecules plus the label."""
        index = self.list_IDs[index]
        acc_data = self.df.iloc[index]['acceptor']
        don_data = self.df.iloc[index]['donor']

        d_inputs = self.d_tokenizer(acc_data, padding='max_length', max_length=400, truncation=True, return_tensors="pt")
        p_inputs = self.p_tokenizer(don_data, padding='max_length', max_length=400, truncation=True, return_tensors="pt")

        # squeeze() drops the batch dimension the tokenizer adds, so the
        # DataLoader can re-batch the samples itself.
        d_input_ids = d_inputs['input_ids'].squeeze()
        d_attention_mask = d_inputs['attention_mask'].squeeze()
        p_input_ids = p_inputs['input_ids'].squeeze()
        p_attention_mask = p_inputs['attention_mask'].squeeze()

        labels = torch.as_tensor(self.labels[index], dtype=torch.float)

        dataset = [d_input_ids, d_attention_mask, p_input_ids, p_attention_mask, labels]
        return dataset
82
+
83
+
84
class markerDataModule(pl.LightningDataModule):
    """LightningDataModule serving acceptor/donor SMILES pairs from CSV splits.

    Builds one tokenizer per molecule role and wraps the train/val(/test)
    dataframes in markerDataset instances.
    """

    def __init__(self, task_name, acc_model_name, don_model_name, num_workers, batch_size, traindata_rate = 1.0):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.task_name = task_name

        # Fraction of the train/val splits actually used (e.g. for ablations).
        self.traindata_rate = traindata_rate

        self.d_tokenizer = AutoTokenizer.from_pretrained(acc_model_name)
        self.p_tokenizer = AutoTokenizer.from_pretrained(don_model_name)

        self.df_train = None
        self.df_val = None
        self.df_test = None

        self.load_testData = True

        self.train_dataset = None
        self.valid_dataset = None
        self.test_dataset = None

    def get_task(self, task_name):
        """Map a task name to its dataset folder (case-insensitively)."""
        # BUG FIX: task_name.lower() was compared against the uppercase
        # literal 'OSC', so that branch could never match and the method
        # silently returned None for the OSC task.
        if task_name.lower() == 'osc':
            return './dataset/OSC/'

        elif task_name.lower() == 'merge':
            self.load_testData = False
            return './dataset/MergeDataset'

    def prepare_data(self):
        # Use this method to do things that might write to disk or that need
        # to be done only from a single process in distributed settings.
        # NOTE(review): the folder is hard-coded rather than derived from
        # get_task(self.task_name) — confirm that is intentional.
        dataFolder = './dataset/OSC'

        self.df_train = pd.read_csv(dataFolder + '/train.csv')
        self.df_val = pd.read_csv(dataFolder + '/val.csv')

        ## -- Apply the data length rate to both splits -- ##
        traindata_length = int(len(self.df_train) * self.traindata_rate)
        validdata_length = int(len(self.df_val) * self.traindata_rate)

        self.df_train = self.df_train[:traindata_length]
        self.df_val = self.df_val[:validdata_length]

        if self.load_testData is True:
            self.df_test = pd.read_csv(dataFolder + '/test.csv')

    def setup(self, stage=None):
        """Materialize the markerDataset objects for the requested stage."""
        if stage == 'fit' or stage is None:
            self.train_dataset = markerDataset(self.df_train.index.values, self.df_train.Label.values, self.df_train,
                                               self.d_tokenizer, self.p_tokenizer)
            self.valid_dataset = markerDataset(self.df_val.index.values, self.df_val.Label.values, self.df_val,
                                               self.d_tokenizer, self.p_tokenizer)

        if self.load_testData is True:
            self.test_dataset = markerDataset(self.df_test.index.values, self.df_test.Label.values, self.df_test,
                                              self.d_tokenizer, self.p_tokenizer)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
151
+
152
+
153
class markerModel(pl.LightningModule):
    """Dual-RoBERTa regression model for acceptor/donor SMILES pairs.

    The [CLS] embeddings from both encoders are concatenated and passed
    through an MLP decoder producing one regression output per pair.
    """

    def __init__(self, acc_model_name, don_model_name, lr, dropout, layer_features, loss_fn = "smooth", layer_limit = True, d_pretrained=True, p_pretrained=True):
        super().__init__()
        self.lr = lr
        self.loss_fn = loss_fn                         # 'MSE' -> MSELoss, otherwise SmoothL1Loss
        self.criterion = torch.nn.MSELoss()
        self.criterion_smooth = torch.nn.SmoothL1Loss()

        #-- Acceptor encoder: pretrained weights or a fresh RoBERTa config.
        acc_config = AutoConfig.from_pretrained("seyonec/SMILES_BPE_PubChem_100k_shard00")
        if d_pretrained is False:
            self.d_model = RobertaModel(acc_config)
            print('acceptor model without pretraining')
        else:
            self.d_model = RobertaModel.from_pretrained(acc_model_name, num_labels=2,
                                                        output_hidden_states=True,
                                                        output_attentions=True)

        #-- Donor encoder: same scheme.
        don_config = AutoConfig.from_pretrained("seyonec/SMILES_BPE_PubChem_100k_shard00")
        if p_pretrained is False:
            self.p_model = RobertaModel(don_config)
            print('donor model without pretraining')
        else:
            self.p_model = RobertaModel.from_pretrained(don_model_name,
                                                        output_hidden_states=True,
                                                        output_attentions=True)

        #-- Decoder head: Linear -> ReLU [-> Dropout] per hidden width, then a
        #   final Linear to layer_features[-1] outputs. (The original if/else
        #   appended ReLU on both branches, so it is collapsed here.)
        layers = []
        firstfeature = self.d_model.config.hidden_size + self.p_model.config.hidden_size
        for feature_idx in range(0, len(layer_features) - 1):
            layers.append(nn.Linear(firstfeature, layer_features[feature_idx]))
            firstfeature = layer_features[feature_idx]
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(firstfeature, layer_features[-1]))

        self.decoder = nn.Sequential(*layers)

        self.save_hyperparameters()

    def forward(self, acc_inputs, don_inputs):
        """Encode both molecules and decode the concatenated [CLS] vectors."""
        d_outputs = self.d_model(acc_inputs['input_ids'], acc_inputs['attention_mask'])
        p_outputs = self.p_model(don_inputs['input_ids'], don_inputs['attention_mask'])

        outs = torch.cat((d_outputs.last_hidden_state[:, 0], p_outputs.last_hidden_state[:, 0]), dim=1)
        return self.decoder(outs)

    def attention_output(self, acc_inputs, don_inputs):
        """Like forward, but also return both encoders' attention maps."""
        d_outputs = self.d_model(acc_inputs['input_ids'], acc_inputs['attention_mask'])
        p_outputs = self.p_model(don_inputs['input_ids'], don_inputs['attention_mask'])

        outs = torch.cat((d_outputs.last_hidden_state[:, 0], p_outputs.last_hidden_state[:, 0]), dim=1)
        outs = self.decoder(outs)

        return d_outputs['attentions'], p_outputs['attentions'], outs

    def _compute_loss(self, logits, labels):
        """Select MSE or SmoothL1 according to the configured loss_fn."""
        if self.loss_fn == 'MSE':
            return self.criterion(logits, labels)
        return self.criterion_smooth(logits, labels)

    def training_step(self, batch, batch_idx):
        acc_inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        don_inputs = {'input_ids': batch[2], 'attention_mask': batch[3]}
        labels = batch[4]

        logits = self(acc_inputs, don_inputs).squeeze(dim=1)
        loss = self._compute_loss(logits, labels)

        self.log("train_loss", loss, on_step=False, on_epoch=True, logger=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        acc_inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        don_inputs = {'input_ids': batch[2], 'attention_mask': batch[3]}
        labels = batch[4]

        logits = self(acc_inputs, don_inputs).squeeze(dim=1)
        loss = self._compute_loss(logits, labels)

        self.log("valid_loss", loss, on_step=False, on_epoch=True, logger=True)
        return {"logits": logits, "labels": labels}

    def validation_step_end(self, outputs):
        return {"logits": outputs['logits'], "labels": outputs['labels']}

    def validation_epoch_end(self, outputs):
        preds = self.convert_outputs_to_preds(outputs)
        # BUG FIX: labels were cast to torch.int, truncating the float
        # regression targets and corrupting the mae/mse/r2 metrics.
        labels = torch.as_tensor(torch.cat([output['labels'] for output in outputs], dim=0), dtype=torch.float)

        mae, mse, r2, r = self.log_score(preds, labels)

        self.log("mae", mae, on_step=False, on_epoch=True, logger=True)
        self.log("mse", mse, on_step=False, on_epoch=True, logger=True)
        self.log("r2", r2, on_step=False, on_epoch=True, logger=True)

    def test_step(self, batch, batch_idx):
        acc_inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        don_inputs = {'input_ids': batch[2], 'attention_mask': batch[3]}
        labels = batch[4]

        logits = self(acc_inputs, don_inputs).squeeze(dim=1)
        loss = self._compute_loss(logits, labels)

        self.log("test_loss", loss, on_step=False, on_epoch=True, logger=True)
        return {"logits": logits, "labels": labels}

    def test_step_end(self, outputs):
        return {"logits": outputs['logits'], "labels": outputs['labels']}

    def test_epoch_end(self, outputs):
        preds = self.convert_outputs_to_preds(outputs)
        # BUG FIX: same torch.int truncation as validation_epoch_end.
        labels = torch.as_tensor(torch.cat([output['labels'] for output in outputs], dim=0), dtype=torch.float)

        mae, mse, r2, r = self.log_score(preds, labels)

        self.log("mae", mae, on_step=False, on_epoch=True, logger=True)
        self.log("mse", mse, on_step=False, on_epoch=True, logger=True)
        self.log("r2", r2, on_step=False, on_epoch=True, logger=True)
        self.log("r", r, on_step=False, on_epoch=True, logger=True)

    def configure_optimizers(self):
        """AdamW with weight decay disabled for bias/gamma/beta parameters."""
        param_optimizer = list(self.named_parameters())

        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay_rate": 0.0001
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay_rate": 0.0
            },
        ]
        return torch.optim.AdamW(optimizer_grouped_parameters, lr=self.lr)

    def convert_outputs_to_preds(self, outputs):
        """Concatenate the per-step logits into one prediction tensor."""
        return torch.cat([output['logits'] for output in outputs], dim=0)

    def log_score(self, preds, labels):
        """Compute and print mae/mse/r2 and the Pearson correlation."""
        y_pred = preds.detach().cpu().numpy()
        y_label = labels.detach().cpu().numpy()

        mae = mean_absolute_error(y_label, y_pred)
        mse = mean_squared_error(y_label, y_pred)
        r2 = r2_score(y_label, y_pred)
        # BUG FIX: pearsonr returns (statistic, p-value); keep only the
        # statistic so self.log("r", r) receives a scalar, not a tuple.
        r = pearsonr(y_label, y_pred)[0]
        print(f'\nmae : {mae}')
        print(f'mse : {mse}')
        print(f'r2 : {r2}')
        print(f'r : {r}')

        return mae, mse, r2, r
343
+
344
+
345
def main_wandb(config=None):
    """Train/evaluate with Weights & Biases experiment tracking.

    Any exception is printed rather than propagated (best-effort runner).
    """
    try:
        import wandb  # BUG FIX: wandb was referenced but never imported

        if config is not None:
            wandb.init(config=config, project=project_name)
        else:
            wandb.init(settings=wandb.Settings(console='off'))

        config = wandb.config
        pl.seed_everything(seed=config.num_seed)

        # BUG FIX: markerDataModule takes (task_name, d_model, p_model,
        # num_workers, batch_size, traindata_rate); the extra prot_maxlength
        # argument raised a TypeError that the broad except silently printed.
        dm = markerDataModule(config.task_name, config.d_model_name, config.p_model_name,
                              config.num_workers, config.batch_size, config.traindata_rate)
        dm.prepare_data()
        dm.setup()

        model_type = str(config.pretrained['chem'])+"To"+str(config.pretrained['prot'])
        #model_logger = WandbLogger(project=project_name)
        # BUG FIX: mae is an error metric — keep the checkpoint with the
        # *lowest* value, so mode must be "min", not "max".
        checkpoint_callback = ModelCheckpoint(f"{config.task_name}_{model_type}_{config.lr}_{config.num_seed}", save_top_k=1, monitor="mae", mode="min")

        trainer = pl.Trainer(
            max_epochs=config.max_epoch,
            precision=16,
            #logger=model_logger,
            callbacks=[checkpoint_callback],
            accelerator='cpu', log_every_n_steps=40
        )

        if config.model_mode == "train":
            model = markerModel(config.d_model_name, config.p_model_name,
                                config.lr, config.dropout, config.layer_features, config.loss_fn, config.layer_limit, config.pretrained['chem'], config.pretrained['prot'])
            model.train()
            trainer.fit(model, datamodule=dm)

            model.eval()
            trainer.test(model, datamodule=dm)

        else:
            model = markerModel.load_from_checkpoint(config.load_checkpoint)

            model.eval()
            trainer.test(model, datamodule=dm)

    except Exception as e:
        print(e)
390
+
391
+
392
def main_default(config):
    """Train/evaluate locally (no wandb) from a plain config dict.

    Any exception is printed rather than propagated (best-effort runner).
    """
    try:
        config = DictX(config)
        pl.seed_everything(seed=config.num_seed)

        dm = markerDataModule(config.task_name, config.d_model_name, config.p_model_name,
                              config.num_workers, config.batch_size, config.traindata_rate)

        dm.prepare_data()
        dm.setup()
        model_type = str(config.pretrained['chem'])+"To"+str(config.pretrained['prot'])
        # model_logger = TensorBoardLogger("./log", name=f"{config.task_name}_{model_type}_{config.num_seed}")
        # BUG FIX: mse is an error metric — keep the checkpoint with the
        # *lowest* value, so mode must be "min", not "max".
        checkpoint_callback = ModelCheckpoint(f"{config.task_name}_{model_type}_{config.lr}_{config.num_seed}", save_top_k=1, monitor="mse", mode="min")

        trainer = pl.Trainer(
            max_epochs=config.max_epoch,
            precision=32,
            # logger=model_logger,
            callbacks=[checkpoint_callback],
            accelerator='cpu', log_every_n_steps=40
        )

        if config.model_mode == "train":
            model = markerModel(config.d_model_name, config.p_model_name,
                                config.lr, config.dropout, config.layer_features, config.loss_fn, config.layer_limit, config.pretrained['chem'], config.pretrained['prot'])

            model.train()

            trainer.fit(model, datamodule=dm)

            model.eval()
            trainer.test(model, datamodule=dm)

        else:
            model = markerModel.load_from_checkpoint(config.load_checkpoint)

            model.eval()
            trainer.test(model, datamodule=dm)
    except Exception as e:
        print(e)
433
+
434
+
435
if __name__ == '__main__':
    # Toggle between a wandb-tracked run and a plain local run.
    using_wandb = False

    if using_wandb == True:
        #-- hyper param config file Load --##
        config = load_hparams('config/config_hparam.json')
        project_name = config["name"]

        main_wandb(config)

        ##-- wandb Sweep Hyper Param Tuning --##
        # config = load_hparams('config/config_sweep_bindingDB.json')
        # project_name = config["name"]
        # sweep_id = wandb.sweep(config, project=project_name)
        # wandb.agent(sweep_id, main_wandb)

    else:
        config = load_hparams('config/config_hparam.json')

        main_default(config)
tool/dap/util/attention_flow.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import networkx as nx
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+
5
+ import matplotlib.pyplot as plt
6
+
7
+ import seaborn as sns
8
+ import itertools
9
+ import matplotlib as mpl
10
+ # import cugraph as cnx
11
+
12
# Global matplotlib styling for all attention plots: base font sizes and a
# thin axis line applied process-wide.
rc={'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10.0,
    'axes.titlesize': 32, 'xtick.labelsize': 20, 'ytick.labelsize': 16}
plt.rcParams.update(**rc)
mpl.rcParams['axes.linewidth'] = .5 #set the value globally
16
+
17
+
18
def plot_attention_heatmap(att, s_position, t_positions, input_tokens):
    """Heatmap of attention from s_position to t_positions across layers
    (layers on the y-axis, deepest layer at the top)."""
    cls_att = np.flip(att[:, s_position, t_positions], axis=0)
    xticklb = [tok for i, tok in enumerate(input_tokens) if i in t_positions]
    yticklb = ['' if i % 2 else str(i) for i in np.arange(att.shape[0], 0, -1)]
    return sns.heatmap(cls_att, xticklabels=xticklb, yticklabels=yticklb, cmap="YlOrRd")
26
+
27
def convert_adjmat_tomats(adjmat, n_layers, l):
    """Slice the stacked layer-graph adjacency matrix back into per-layer
    (l, l) blocks, returned as a float array of shape (n_layers, l, l)."""
    out = np.zeros((n_layers, l, l))
    for layer in range(n_layers):
        row_lo, row_hi = (layer + 1) * l, (layer + 2) * l
        col_lo, col_hi = layer * l, (layer + 1) * l
        out[layer] = adjmat[row_lo:row_hi, col_lo:col_hi]
    return out
34
+
35
def make_residual_attention(attentions):
    """Average attention heads, add an identity residual, and row-normalize.

    Takes a sequence of per-layer attention tensors shaped
    (batch, heads, seq, seq); only batch element 0 is used. Returns
    (raw_per_head, residual_normalized) numpy arrays.
    """
    stacked = np.asarray([layer.detach().cpu().numpy() for layer in attentions])
    attentions_mat = stacked[:, 0]                   # keep batch element 0 only
    head_mean = attentions_mat.mean(axis=1)          # == sum over heads / n_heads
    with_residual = head_mean + np.eye(head_mean.shape[1])[None, ...]
    res_att_mat = with_residual / with_residual.sum(axis=-1)[..., None]
    return attentions_mat, res_att_mat
44
+
45
+ ## -------------------------------------------------------- ##
46
+ ## -- Make flow network (No Print Node - edge Connection)-- ##
47
+ ## -------------------------------------------------------- ##
48
+
49
def make_flow_network(mat, input_tokens):
    """Build a directed layer graph whose edge capacities are attention
    weights, plus a label->node-index map, for networkx max-flow routines.
    """
    n_layers, length, _ = mat.shape
    adj_mat = np.zeros(((n_layers + 1) * length, (n_layers + 1) * length))
    labels_to_index = {}
    for k in range(length):
        labels_to_index[str(k) + "_" + input_tokens[k]] = k

    # Node (i, k_f) connects back to every node of the previous layer with
    # the corresponding attention weight.
    for i in range(1, n_layers + 1):
        for k_f in range(length):
            index_from = i * length + k_f
            labels_to_index["L" + str(i) + "_" + str(k_f)] = index_from
            for k_t in range(length):
                adj_mat[index_from][(i - 1) * length + k_t] = mat[i - 1][k_f][k_t]

    # BUG FIX: nx.from_numpy_matrix was removed in networkx 3.0;
    # from_numpy_array is the drop-in replacement (available since 2.x).
    net_graph = nx.from_numpy_array(adj_mat, create_using=nx.DiGraph())
    for i in range(adj_mat.shape[0]):
        for j in range(adj_mat.shape[1]):
            nx.set_edge_attributes(net_graph, {(i, j): adj_mat[i, j]}, 'capacity')

    return net_graph, labels_to_index
71
+
72
+
73
def make_input_node(attention_mat, res_labels_to_index):
    """Return the labels belonging to the input (layer-0) token positions,
    i.e. those whose node index is below the sequence length."""
    seq_len = attention_mat.shape[-1]
    return [label for label, idx in res_labels_to_index.items() if idx < seq_len]
80
+ ## ------------------------------------------------ ##
81
+ ## -- Draw Attention flow node - Edge Connection -- ##
82
+ ## ------------------------------------------------ ##
83
+
84
+ ##-- networkx graph Initation and Calculation flow --##
85
def get_adjmat(mat, input_tokens):
    """Flatten a (n_layers, length, length) attention stack into one big
    adjacency matrix over (n_layers+1)*length nodes, plus a label->index map.

    Layer-0 nodes are labeled "<pos>_<token>"; deeper nodes "L<layer>_<pos>".
    """
    n_layers, length, _ = mat.shape
    total_nodes = (n_layers + 1) * length
    adj_mat = np.zeros((total_nodes, total_nodes))

    labels_to_index = {f"{k}_{input_tokens[k]}": k for k in range(length)}

    for layer in range(1, n_layers + 1):
        for k_f in range(length):
            index_from = layer * length + k_f
            labels_to_index[f"L{layer}_{k_f}"] = index_from
            # One vectorized row assignment replaces the inner k_t loop.
            adj_mat[index_from, (layer - 1) * length:layer * length] = mat[layer - 1][k_f]

    return adj_mat, labels_to_index
102
+
103
def draw_attention_graph(adjmat, labels_to_index, n_layers, length):
    """Draw the layered attention graph with networkx/matplotlib and return it.

    Nodes are laid out in columns (one per layer); edge width encodes the
    attention weight. NOTE(review): nx.from_numpy_matrix was removed in
    networkx 3.0 — this function requires networkx < 3; confirm the pinned
    version before upgrading.
    """
    A = adjmat
    net_graph=nx.from_numpy_matrix(A, create_using=nx.DiGraph())
    for i in np.arange(A.shape[0]):
        for j in np.arange(A.shape[1]):
            nx.set_edge_attributes(net_graph, {(i,j): A[i,j]}, 'capacity')

    # Column layout: x grows with the layer index, y runs top-down over tokens.
    pos = {}
    label_pos = {}
    for i in np.arange(n_layers+1):
        for k_f in np.arange(length):
            pos[i*length+k_f] = ((i+0.4)*2, length - k_f)
            label_pos[i*length+k_f] = (i*2, length - k_f)

    # Only layer-0 nodes keep a visible token label; deeper nodes are blank.
    index_to_labels = {}
    for key in labels_to_index:
        index_to_labels[labels_to_index[key]] = key.split("_")[-1]
        if labels_to_index[key] >= length:
            index_to_labels[labels_to_index[key]] = ''

    #plt.figure(1,figsize=(20,12))
    nx.draw_networkx_nodes(net_graph,pos,node_color='green', labels=index_to_labels, node_size=50)
    nx.draw_networkx_labels(net_graph,pos=label_pos, labels=index_to_labels, font_size=18)

    all_weights = []
    #4 a. Iterate through the graph nodes to gather all the weights
    for (node1,node2,data) in net_graph.edges(data=True):
        all_weights.append(data['weight']) #we'll use this when determining edge thickness

    #4 b. Get unique weights
    unique_weights = list(set(all_weights))

    #4 c. Plot the edges - one by one!
    for weight in unique_weights:
        #4 d. Form a filtered list with just the weight you want to draw
        weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in net_graph.edges(data=True) if edge_attr['weight']==weight]
        #4 e. I think multiplying by [num_nodes/sum(all_weights)] makes the graphs edges look cleaner

        w = weight #(weight - min(all_weights))/(max(all_weights) - min(all_weights))
        width = w
        nx.draw_networkx_edges(net_graph,pos,edgelist=weighted_edges,width=width, edge_color='darkblue')

    return net_graph
146
+
147
def compute_flows(G, labels_to_index, input_nodes, length):
    """Max-flow from every non-input node down to each input node.

    Each row of the returned matrix is normalized to sum to 1; flows are
    recorded against the previous layer's column block.
    """
    n = len(labels_to_index)
    flow_values = np.zeros((n, n))
    for label in tqdm(labels_to_index, desc="flow algorithms", total=len(labels_to_index)):
        if label in input_nodes:
            continue
        u = labels_to_index[label]
        pre_layer = int(u / length) - 1
        for inp_label in input_nodes:
            v = labels_to_index[inp_label]
            flow_values[u][pre_layer * length + v] = nx.maximum_flow_value(
                G, u, v, flow_func=nx.algorithms.flow.edmonds_karp)
        flow_values[u] /= flow_values[u].sum()

    return flow_values
163
+
164
def compute_node_flow(G, labels_to_index, input_nodes, output_nodes, length):
    """Same as compute_flows, but restricted to the given output nodes."""
    n = len(labels_to_index)
    flow_values = np.zeros((n, n))
    for label in output_nodes:
        if label in input_nodes:
            continue
        u = labels_to_index[label]
        pre_layer = int(u / length) - 1
        for inp_label in input_nodes:
            v = labels_to_index[inp_label]
            flow_values[u][pre_layer * length + v] = nx.maximum_flow_value(
                G, u, v, flow_func=nx.algorithms.flow.edmonds_karp)
        flow_values[u] /= flow_values[u].sum()

    return flow_values
179
+
180
def compute_joint_attention(att_mat, add_residual=True):
    """Roll attention up through the layers: joint[i] = aug[i] @ joint[i-1].

    With add_residual, an identity matrix is added per layer and rows are
    re-normalized before the cumulative product.
    """
    if add_residual:
        identity = np.eye(att_mat.shape[1])[None, ...]
        augmented = att_mat + identity
        augmented = augmented / augmented.sum(axis=-1)[..., None]
    else:
        augmented = att_mat

    joint = np.zeros(augmented.shape)
    joint[0] = augmented[0]
    for layer in range(1, augmented.shape[0]):
        joint[layer] = augmented[layer] @ joint[layer - 1]

    return joint
tool/dap/util/attention_plot.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+
6
def make_attention_table(att, tokens, numb, token_idx = 0, layerNumb = -1):
    """Save and plot one query token's attention over the sequence as a bar chart.

    Writes the raw values to ``amino_acid_seq_attention_{numb}.csv`` and the
    rendered figure to ``figures/Amino_acid_seq_{numb}.png``.

    Args:
        att: attention indexed as [layer, query_token, key_token]; key token 0
            is skipped — presumably a special/CLS token, TODO confirm.
        tokens: token strings aligned with the key axis of ``att``.
        numb: identifier appended to the output file names.
        token_idx: query token whose attention row is plotted (default 0).
        layerNumb: layer to visualise (default -1 = last layer).
    """
    # Attention of the chosen query token over all keys except token 0.
    token_att = att[layerNumb, token_idx, range(1, len(tokens))]

    token_label=[]
    token_numb=[]
    for idx, token in enumerate(tokens[1:]) :
        token_label.append(f"<b>{token}</b>")  # bold HTML tick labels for plotly
        token_numb.append(f"{idx}")

    pair = list(zip(token_numb, token_att))

    df = pd.DataFrame(pair, columns=["Amino acid", "Attention rate"])
    df.to_csv(f"amino_acid_seq_attention_{numb}.csv", index=None)

    # Indices of the three highest-attention residues, highlighted in crimson.
    top3_idx = sorted(range(len(token_att)), key=lambda i: token_att[i], reverse=True)[:3]

    colors = ['cornflowerblue', ] * len(token_numb)

    for i in top3_idx:
        colors[i] = 'crimson'

    fig = go.Figure(data=[go.Bar(
        x=df["Amino acid"],
        y=df["Attention rate"],
        # range_y=[min(token_att), max(token_att)],
        marker_color=colors # marker color can be a single color value or an iterable
    )])

    # fig = px.histogram(df, x="Amino acid", y="Attention rate", range_y=[min(token_att), max(token_att)])

    # White background with subtle axis lines/grid.
    fig.update_layout(plot_bgcolor="white")
    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
    fig.update_layout(title={'text': "<b>Attention rate of amino acid sequence token</b>",
                             'font':{'size':40},
                             'y': 0.96,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},

                      # Replace numeric tick values with the bold token labels.
                      xaxis=dict(tickmode='array',
                                 tickvals=token_numb,
                                 ticktext=token_label
                                 ),

                      xaxis_title={'text': "Amino acid sequence",
                                   'font':{'size':30}},
                      yaxis_title={'text': "Attention rate",
                                   'font':{'size':30}},

                      font=dict(family="Calibri, monospace",
                                size=17
                                ))

    # NOTE(review): assumes a ./figures directory already exists.
    fig.write_image(f'figures/Amino_acid_seq_{numb}.png', width=1.5*1200, height=0.75*1200, scale=2)
    fig.show()
62
+
63
+
64
def read_attention():
    """Re-plot a previously saved attention CSV as a bar chart.

    Reads ``../amino_acid_seq_attention.csv`` (as written by
    ``make_attention_table``-style runs) and writes the figure to
    ``figures/Amino_acid_seq.png``.
    """
    df = pd.read_csv("../amino_acid_seq_attention.csv")
    # d_flow_values = np.asarray(d_read_flow_values)

    fig = px.bar(df, x="Amino acid", y="Attention rate", range_y=[min(df["Attention rate"]), max(df["Attention rate"])])

    # White background with subtle axis lines/grid (same styling as make_attention_table).
    fig.update_layout(plot_bgcolor="white")
    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
    fig.update_layout(title={'text': "<b>Attention rate of amino acid sequence token</b>",
                             'font':{'size':40},
                             'y': 0.96,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},

                      xaxis_title={'text': "Amino acid sequence",
                                   'font':{'size':30}},
                      yaxis_title={'text': "Attention rate",
                                   'font':{'size':30}},

                      font=dict(family="Calibri, monospace",
                                size=17
                                ))

    # NOTE(review): assumes a ./figures directory already exists.
    fig.write_image('figures/Amino_acid_seq.png', width=1.5*1200, height=0.75*1200, scale=2)
    fig.show()

if __name__ == '__main__':
    # Script entry point: regenerate the figure from the saved CSV.
    read_attention()
tool/dap/util/boxplot.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from scipy import stats
5
+ import plotly.express as px
6
+
7
+ from plotly.subplots import make_subplots
8
+ import plotly.graph_objects as go
9
+
10
+ ROC = 1
11
+ PR = 2
12
+
13
def add_p_value_annotation(fig, array_columns, subplot=None, _format=dict(interline=0.03, text_height=1.03, color='black')):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)

    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: dict
        format characteristics for the lines

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    # Specify in what y_range to plot for each pair of columns.
    # y values are in *axis domain* coordinates (0..1), stacked just above the
    # plot area, one bracket per compared pair.
    y_range = np.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.03+i*_format['interline'], 1.04+i*_format['interline']]

    # Get values from figure
    fig_dict = fig.to_dict()

    # Get indices if working with subplots
    if subplot:
        if subplot == 1:
            # Plotly names the first subplot's axis 'x', later ones 'x2', 'x3', ...
            subplot_str = ''
        else:
            subplot_str =str(subplot)
        indices = [] #Change the box index to the indices of the data for that subplot
        for index, data in enumerate(fig_dict['data']):
            #print(index, data['xaxis'], 'x' + subplot_str)
            if data['xaxis'] == 'x' + subplot_str:
                indices = np.append(indices, index)
        indices = [int(i) for i in indices]
        print((indices))
    else:
        subplot_str = ''

    # Print the p-values
    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Mare sure it is selecting the data and subplot you want
        #print('0:', fig_dict['data'][data_pair[0]]['name'], fig_dict['data'][data_pair[0]]['xaxis'])
        #print('1:', fig_dict['data'][data_pair[1]]['name'], fig_dict['data'][data_pair[1]]['xaxis'])

        # Get the p-value: Welch's two-sided t-test on the two traces' y data.
        pvalue = stats.ttest_ind(
            fig_dict['data'][data_pair[0]]['y'],
            fig_dict['data'][data_pair[1]]['y'],
            equal_var=False,
        )[1]
        # Conventional significance stars: ns / * / ** / ***.
        if pvalue >= 0.05:
            symbol = 'ns'
        elif pvalue >= 0.01:
            symbol = '*'
        elif pvalue >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        # Vertical line (left leg of the bracket)
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][0],
            x1=column_pair[0], y1=y_range[index][1],
            line=dict(color=_format['color'], width=1.5,)
        )
        # Horizontal line (top of the bracket)
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][1],
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=1.5,)
        )
        # Vertical line (right leg of the bracket)
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[1], y0=y_range[index][0],
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=1.5,)
        )
        ## add text at the correct x, y coordinates
        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_range[index][1]*_format['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x"+subplot_str,
            yref="y"+subplot_str+" domain"
        ))
    return fig
115
+
116
+
117
def box_plot(df):
    """Grouped box plot of test AUROC per dataset/model, with p-value brackets.

    Expects ``df`` to have columns ``Task_name``, ``test_auroc`` and ``Model``
    (as produced by the wandb export read in ``__main__``). Writes the figure
    to ``../figures/box_plot_integration.png`` and shows it.
    """
    fig = px.box(df, x = 'Task_name', y='test_auroc', color="Model")

    # White background with subtle axis lines/grid.
    fig.update_layout(plot_bgcolor="white")
    fig.update_xaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0)',mirror=False)
    fig.update_yaxes(linecolor='rgba(0,0,0,0.25)', gridcolor='rgba(0,0,0,0.07)',mirror=False)
    fig.update_layout(title={'text': "<b>ROC-AUC score distribution</b>",
                             'font':{'size':40},
                             'y': 0.96,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},

                      xaxis_title={'text': "Datasets",
                                   'font':{'size':30}},
                      yaxis_title={'text': "ROC-AUC",
                                   'font':{'size':30}},

                      font=dict(family="Calibri, monospace",
                                size=17
                                ))

    # Annotate selected model pairs (trace indices 0/3/6 vs 7) with t-test stars.
    fig = add_p_value_annotation(fig, [[0,7], [3,7], [6,7]], subplot=1)

    fig.write_image('../figures/box_plot_integration.png', width=1.5*1200, height=0.75*1200, scale=2)
    fig.show()
144
+
145
+
146
+
147
def go_box_plot(df, metric = ROC):
    """Three-panel box plot (one subplot per dataset) of model scores.

    Args:
        df: results table with columns ``Task_name``, ``Model`` and the
            selected metric column (``test_auroc`` / ``test_auprc``).
        metric: ``ROC`` (default) or ``PR`` — selects metric column and
            output file name.

    Writes the figure to ``../figures/boxplot_auroc.png`` or
    ``../figures/boxplot_auprc.png``.
    """
    dataset_list = ['BIOSNAP', 'DAVIS', 'BindingDB']
    model_list = ['LR', 'DNN', 'GNN-CPI', 'DeepDTI', 'DeepDTA', 'DeepConv-DTI', 'Moltrans', 'ours']
    clr_list = ['red', 'orange', 'green', 'indianred', 'lightseagreen', 'goldenrod', 'magenta', 'blue']

    if metric == ROC:
        # fig_title = "<b>ROC-AUC score distribution</b>"
        file_title = "boxplot_auroc.png"
        select_metric = "test_auroc"
    else:
        # fig_title = "<b>PR-AUC score distribution</b>"
        file_title = "boxplot_auprc.png"
        select_metric = "test_auprc"

    fig = make_subplots(rows=1, cols=3, subplot_titles=[c for c in dataset_list])

    groups = df.groupby(df.Task_name)
    # NOTE(review): local name is a typo for "Legend"; kept to avoid code changes.
    # Only the first subplot shows the legend so models aren't listed three times.
    Legand = True

    for dataset_idx, dataset in enumerate(dataset_list):
        df_modelgroup = groups.get_group(dataset)
        model_groups = df_modelgroup.groupby(df_modelgroup.Model)
        if dataset_idx != 0:
            Legand = False
        for model_idx, model in enumerate(model_list):
            df_data = model_groups.get_group(model)
            fig.append_trace(go.Box(y=df_data[select_metric],
                                    name=model,
                                    marker_color=clr_list[model_idx],
                                    showlegend = Legand
                                    ),
                             row=1,
                             col=dataset_idx+1)




    # fig.update_layout(title={'text': fig_title,
    #                          'font':{'size':25},
    #                          'y': 0.98,
    #                          'x': 0.46,
    #                          'xanchor': 'center',
    #                          'yanchor': 'top'})

    # fig = add_p_value_annotation(fig, [[0,7], [3,7], [6,7]], subplot=1)
    # fig = add_p_value_annotation(fig, [[0,7], [3,7], [6,7]], subplot=2)
    # fig = add_p_value_annotation(fig, [[0,7], [3,7], [6,7]], subplot=3)

    fig.write_image(f'../figures/{file_title}', width=1.5*1200, height=0.75*1200, scale=2)
    fig.show()


if __name__ == '__main__':
    # Script entry point: plot the wandb export with the single-panel variant.
    df = pd.read_csv("../dataset/wandb_export_boxplotdata.csv")
    box_plot(df)
tool/dap/util/data/bindingdb_kd.tab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b72a38ae07a75d5d4c269d2776b6e62e0edde29ff7cf8a323158c08951f808d1
3
+ size 54432102
tool/dap/util/data/davis.tab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d4c6809dcb7c5da2b91a32d594d6935b75484940bde4d18055eb5e1059262f4
3
+ size 21376712
tool/dap/util/emetric.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
def get_cindex(Y, P):
    """Concordance index (c-index) of predictions *P* against labels *Y*.

    Counts all ordered pairs where Y[i] > Y[j]; a pair is concordant (weight 1)
    when the predictions agree on the order, and tied predictions count 0.5.

    Args:
        Y: sequence of ground-truth affinity values.
        P: sequence of predicted values, aligned with ``Y``.

    Returns:
        c-index in [0, 1], or 0 when no comparable pair exists.
    """
    summ = 0
    pair = 0

    # j < i always holds, so the original redundant `i is not j` guard is gone.
    for i in range(1, len(Y)):
        for j in range(0, i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])

    # Fixed: the original compared integers with `is not`, which tests object
    # identity (and is only accidentally true for small ints; SyntaxWarning
    # since Python 3.8). Value equality is what is meant.
    if pair != 0:
        return summ / pair
    return 0
19
+
20
+
21
def r_squared_error(y_obs, y_pred):
    """Return the squared Pearson correlation between observations and predictions."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)

    # Deviations from the respective means.
    obs_dev = obs - np.mean(obs)
    pred_dev = pred - np.mean(pred)

    covariance = np.sum(pred_dev * obs_dev)
    obs_var = np.sum(obs_dev * obs_dev)
    pred_var = np.sum(pred_dev * pred_dev)

    return covariance * covariance / float(obs_var * pred_var)
34
+
35
+
36
def get_k(y_obs, y_pred):
    """Slope of the least-squares regression line through the origin (obs ~ k * pred)."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)
    return np.sum(obs * pred) / float(np.sum(pred * pred))
41
+
42
+
43
def squared_error_zero(y_obs, y_pred):
    """R0^2: coefficient of determination of the through-origin fit ``k * y_pred``."""
    k = get_k(y_obs, y_pred)

    obs = np.array(y_obs)
    pred = np.array(y_pred)

    # Residuals of the zero-intercept model vs. total variation around the mean.
    residual = obs - k * pred
    total = obs - np.mean(obs)

    return 1 - np.sum(residual * residual) / float(np.sum(total * total))
53
+
54
+
55
def get_rm2(ys_orig, ys_line):
    """Modified squared correlation metric r_m^2 combining r^2 and R0^2."""
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    penalty = np.sqrt(np.absolute(r2 * r2 - r02 * r02))
    return r2 * (1 - penalty)
tool/dap/util/load_dataset.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tdc.multi_pred import DTI
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
if __name__ == '__main__':
    # Download DTI benchmarks via TDC, log-transform the affinities, and dump
    # the train/valid/test splits as CSV under ../dataset_kd/.
    bindingDB_data = DTI(name = 'BindingDB_Kd')
    davis_data = DTI(name = 'DAVIS')

    # Collapse duplicated drug-target pairs in BindingDB to the max affinity.
    bindingDB_data.harmonize_affinities(mode = 'max_affinity')

    bindingDB_data.convert_to_log(form = 'binding')
    davis_data.convert_to_log(form = 'binding')

    split_bindingDB = bindingDB_data.get_split()
    split_davis = davis_data.get_split()

    dataset_list = ["train", "valid", "test"]
    for dataset_type in dataset_list:
        df_bindingDB = pd.DataFrame(split_bindingDB[dataset_type])
        df_davis = pd.DataFrame(split_davis[dataset_type])

        df_bindingDB.to_csv(f"../dataset_kd/bindingDB_{dataset_type}.csv", index=False)
        df_davis.to_csv(f"../dataset_kd/davis_{dataset_type}.csv", index=False)


    # NOTE(review): after the loop these frames hold only the last ("test")
    # split. Y_davis_log is never used, and the labels were already
    # log-transformed above — this looks like leftover exploration code.
    Y_bindingDB = np.array(df_bindingDB.Y)
    Y_davis = np.array(df_davis.Y)

    Y_davis_log = [np.log10(Y_davis)]
+
32
+
tool/dap/util/make_external_validation.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
if __name__ == '__main__':
    # Build an external validation set as the Cartesian product of every
    # SMILES with every amino-acid sequence, writing it to
    # ../dataset/external_dataset.csv.
    smiles = pd.read_csv("../dataset/external_smiles.csv")
    ass = pd.read_csv("../dataset/external_aas.csv")

    smiles_data = list(np.array(smiles['smiles']))
    # NOTE(review): smiles_label is computed but never used below.
    smiles_label = list(np.array(smiles['label'].tolist()))
    smiles_label = [x.split() for x in smiles_label]

    ass_data = list(np.array(ass['aas']))
    cyp_type = list(np.array(ass['CYP_type']))

    # Pair every compound with every protein (all-vs-all).
    external_dataset = []
    for smiles_idx in range(0, len(smiles_data)):
        for ass_idx in range(0, len(ass_data)):

            external_data = [smiles_data[smiles_idx], ass_data[ass_idx], cyp_type[ass_idx]]
            external_dataset.append(external_data)

    df = pd.DataFrame(external_dataset, columns=['smiles', 'aas', 'CYP_type'])
    df.to_csv('../dataset/external_dataset.csv', index=False)


    # Quick sanity check of the first entries.
    print(smiles['smiles'][0])
    print(ass['CYP_type'][0])
tool/dap/util/utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, copy
2
+ from easydict import EasyDict
3
+
4
+ import torch.nn as nn
5
+
6
class DictX(dict):
    """dict subclass exposing keys as attributes (``d.key`` <-> ``d['key']``).

    Missing keys raise AttributeError from attribute access, matching normal
    attribute semantics while keeping full dict behaviour.
    """

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as err:
            # Re-raise as AttributeError so getattr()/hasattr() behave normally.
            raise AttributeError(err)

    def __setattr__(self, key, value):
        # All attribute writes go straight into the mapping.
        self[key] = value

    def __delattr__(self, key):
        try:
            del self[key]
        except KeyError as err:
            raise AttributeError(err)

    def __repr__(self):
        return f"<DictX {dict.__repr__(self)}>"
24
+
25
+
26
def load_hparams(file_path):
    """Load hyper-parameters from a JSON file.

    Returns the parsed JSON object (a plain dict for a JSON object). The
    original dead ``EasyDict()`` initialiser was removed: it was immediately
    overwritten by ``json.load``, so the function has always returned the raw
    parsed value, not an EasyDict.
    """
    with open(file_path, 'r') as f:
        return json.load(f)
31
+
32
+
33
def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full bert model
    """Return a deep copy of *model* whose encoder keeps only the first layers.

    The kept ModuleList still references the ORIGINAL layer modules (the deep
    copy's own layers are discarded by the reassignment), matching the
    original implementation's behaviour. The input model is left untouched.
    """
    # Collect references to the first num_layers_to_keep original layers.
    kept_layers = nn.ModuleList(model.encoder.layer[i] for i in range(num_layers_to_keep))

    # Copy the model, then swap in the truncated layer list.
    trimmed_model = copy.deepcopy(model)
    trimmed_model.encoder.layer = kept_layers

    return trimmed_model
tool/dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
tool/deepacceptor/RF.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Sep 4 10:38:59 2023
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+
9
+ from sklearn.metrics import confusion_matrix
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ from rdkit.Chem import AllChem
13
+ from sklearn.datasets import make_blobs
14
+ import json
15
+ import numpy as np
16
+ import math
17
+
18
+ from scipy import sparse
19
+ from sklearn.metrics import median_absolute_error,r2_score, mean_absolute_error,mean_squared_error
20
+ import pickle
21
+ from tqdm import tqdm
22
+
23
+ import pandas as pd
24
+ import matplotlib.pyplot as plt
25
+ from rdkit import Chem
26
+
27
+ from sklearn.ensemble import RandomForestRegressor
28
+
29
+
30
def split_string(string):
    """Return the characters of *string* as a list.

    Idiom fix: the manual append loop is exactly the built-in ``list()``
    conversion of a string.
    """
    return list(string)
38
def main(sm):
    """Predict the acceptor property for one SMILES string with the pretrained RF model.

    Featurises *sm* as a 2048-bit radius-3 Morgan fingerprint and runs it
    through the pickled random forest. Returns the model's prediction array
    (empty input array if the SMILES cannot be parsed).

    Fixes vs. the original: the pickle file handle is closed via ``with``
    (it was leaked by ``pickle.load(open(...))``), and the dead
    ``RandomForestRegressor`` instance and unused locals were removed.
    """
    inchis = list([sm])

    features = []
    for inc in inchis:
        mol = Chem.MolFromSmiles(inc)
        if mol is None:
            continue
        # 2048-bit Morgan fingerprint (radius 3) as a '0'/'1' character vector,
        # matching the featurisation the pickled model expects.
        bit_string = AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048).ToBitString()
        features.append(np.array(list(bit_string)))

    X_test = np.asarray(features)

    # SECURITY NOTE: unpickling executes arbitrary code — only load trusted files.
    with open(r"tool\deepacceptor\deepacceptor.pkl", "rb") as model_file:
        load_model = pickle.load(model_file)

    Y_predict = load_model.predict(X_test)
    return Y_predict
tool/deepacceptor/deepacceptor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11753f5d925de9fdf0ff0afc05204bcb54ad26753f13e02e63696fa3c65e8029
3
+ size 28084161
tool/deepacceptor/dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ [" ", "C", "1", "=", "(", "2", "F", ")", "3", "4", "5", "#", "N", "S", "/", "\\", "O", "6", "7", "8", "9", "%", "0", "[", "Se", "]", "Cl", "Br", "B", ".", "P", "I", "@", "H"]
tool/deepdonor/pm.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:155cb847cef95d069044c425c409ca8daff368bd2f3310f43965b6c65a2914e2
3
+ size 8220594
tool/deepdonor/sm.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b27366195bfd2fcb74dbe20ccc1243e6297ee1f5c272947613f875141fdceb2
3
+ size 31982999
tool/graphconverter.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Nov 7 15:38:35 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+ from DECIMER import predict_SMILES
9
+ from langchain.tools import BaseTool
10
+
11
class graphconverter(BaseTool):
    """LangChain tool: OCSR — convert an image of a molecule to SMILES via DECIMER."""

    name: str = "graphconverter"
    description: str = (
        "Input graph path , returns SMILES."
        "It was used to convert graph/figure/image containing molecule to SMILES"
    )

    def __init__(self):
        super().__init__()

    def _run(self, paths: str) -> str:
        """Run DECIMER on the image at *paths*; return the SMILES or an error hint."""
        try:
            SMILES = predict_SMILES(paths)
        # Fixed: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # catch Exception so genuine interrupts still propagate.
        except Exception:
            return 'Please recheck the graph path'
        return SMILES

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
31
+
32
+
33
+
tool/orbital.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Oct 30 09:14:55 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+
9
+ from sklearn.metrics import confusion_matrix
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ from rdkit.Chem import AllChem
13
+ from sklearn.datasets import make_blobs
14
+ import json
15
+ import numpy as np
16
+ import math
17
+
18
+ from scipy import sparse
19
+ from sklearn.metrics import median_absolute_error,r2_score, mean_absolute_error,mean_squared_error
20
+ from langchain.tools import BaseTool
21
+ import pandas as pd
22
+ import matplotlib.pyplot as plt
23
+ from rdkit import Chem
24
+ import pickle
25
+ from sklearn.ensemble import RandomForestRegressor
26
+
27
+
28
def split_string(string):
    """Return the characters of *string* as a list.

    Idiom fix: the manual append loop is exactly the built-in ``list()``
    conversion of a string.
    """
    return list(string)
37
+
38
def main(sm):
    """Predict (HOMO, LUMO) in eV for one SMILES with the pretrained RF models.

    Featurises *sm* as a Morgan fingerprint bit string and runs it through the
    pickled ``homo``/``lumo`` random forests.

    Fixes vs. the original: both pickle handles are closed via ``with`` (they
    were leaked), the dead ``RandomForestRegressor`` instance and unused
    locals were removed, and ``float()`` is taken from the first element
    instead of the whole size-1 array (deprecated in recent NumPy).
    """
    inchis = list([sm])

    features = []
    for inc in inchis:
        mol = Chem.MolFromSmiles(inc)
        if mol is None:
            continue
        # FIXME(review): GetMorganFingerprintAsBitVect(mol, 1024) passes 1024
        # as the *radius* (nBits stays at the 2048 default). This is almost
        # certainly not what was intended (e.g. radius 2 with nBits=1024), but
        # the call is kept as-is pending confirmation of the featurisation the
        # pickled models were trained on.
        bit_string = AllChem.GetMorganFingerprintAsBitVect(mol, 1024).ToBitString()
        features.append(np.array(list(bit_string)))

    X_test = np.asarray(features)

    # SECURITY NOTE: unpickling executes arbitrary code — only load trusted files.
    with open(r"tool/orbital/homo.pkl", 'rb') as homo_file:
        load_homo = pickle.load(homo_file)
    with open(r"tool/orbital/lumo.pkl", 'rb') as lumo_file:
        load_lumo = pickle.load(lumo_file)

    Y_homo = load_homo.predict(X_test)
    Y_lumo = load_lumo.predict(X_test)
    homo = float(Y_homo[0])
    lumo = float(Y_lumo[0])
    return homo, lumo
75
+
76
class homolumo_predictor(BaseTool):
    # LangChain tool: predict frontier-orbital energies (HOMO/LUMO, in eV)
    # for a SMILES string via this module's main() random-forest pipeline.
    name: str = "homolumo_predictor"
    description: str = (
        "Input SMILES , returns the HOMO/LUMO (Highest Occupied Molecular Orbital (HOMO) \
        and Lowest Unoccupied Molecular Orbital)."
    )
    def __init__(self):
        super().__init__()
    def _run(self, smiles: str) -> str:
        # Validate the SMILES before handing it to the (slow) model pipeline.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES string"
        Y_homo, Y_lumo = main( str(smiles) )
        return f"The HOMO is predicted to be {'{:.2f}'.format(Y_homo)} eV , the LUMO is predicted to be {'{:.2f}'.format(Y_lumo)} eV"

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
tool/pdfreader.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Dec 30 22:20:13 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+ from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain
8
+
9
+ from langchain import PromptTemplate
10
+
11
+ from langchain.tools import BaseTool
12
+
13
+ from langchain_core.messages import HumanMessage, SystemMessage
14
+ from langchain.base_language import BaseLanguageModel
15
+ from langchain.text_splitter import CharacterTextSplitter
16
+
17
+
18
+ from langchain_community.document_loaders import PyPDFLoader
19
+ from langchain_community.vectorstores import FAISS
20
+ from langchain_openai import ChatOpenAI
21
+ from langchain_openai import OpenAIEmbeddings
22
+
23
+ template = """
24
+
25
+ You are an expert chemist and your task is to respond to the question or
26
+ solve the problem to the best of your ability. You need to answer in as much detail as possible.
27
+ You can only respond with a single "Final Answer" format.
28
+ Use the following pieces of context to answer the question at the end.
29
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
30
+ <context>
31
+ {context}
32
+ </context>
33
+
34
+ Question: {question}
35
+ Answer:
36
+
37
+ """
38
+
39
class pdfreader(BaseTool):
    # LangChain tool: retrieval-augmented Q&A over a single PDF. Each call
    # rebuilds the FAISS index from self.path, retrieves the top-2 chunks and
    # answers via an OpenAI-compatible chat model.
    name: str = "pdfreader"
    description: str = (

        "Used to read papers, summarize papers, Q&A based on papers, literature or publication"
        "Input query , return the response"
    )

    # llm: chat model used for answering; path: PDF file to index.
    llm: BaseLanguageModel = None
    path : str = None
    return_direct: bool = True
    def __init__(self, path: str = None):
        super().__init__( )
        # SECURITY(review): API key is hardcoded and checked into source —
        # it should be loaded from an environment variable and this key revoked.
        self.llm = ChatOpenAI(model="gpt-4o-2024-11-20",api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
                              base_url="https://www.dmxapi.com/v1")
        self.path = path
        # api keys

    def _run(self, query ) -> str:
        # Load and chunk the PDF; chunks are large (6k chars) with 1k overlap.
        loader = PyPDFLoader(self.path)
        documents = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=6000, chunk_overlap=1000)
        docs = text_splitter.split_documents(documents)
        # SECURITY(review): same hardcoded API key as in __init__ — see note above.
        embeddings = OpenAIEmbeddings(api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
                                      base_url="https://www.dmxapi.com/v1")


        # NOTE(review): the FAISS index is rebuilt on every query — caching it
        # per path would avoid re-embedding the whole document each call.
        vectorstore = FAISS.from_documents(docs, embeddings)
        prompt = PromptTemplate(template=template, input_variables=[ "question"])
        qa_chain = RetrievalQA.from_chain_type(
            llm= self.llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt},
        )

        result = qa_chain.invoke(query)
        return result['result']


    async def _arun(self, query) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("this tool does not support async")
85
+
86
+
tool/property.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Sep 5 21:42:51 2024
4
+
5
+ @author: BM109X32G-10GPU-02
6
+ """
7
+
8
+ from langchain.tools import BaseTool
9
+ from rdkit import Chem
10
+ from rdkit.Chem import rdMolDescriptors
11
+ from rdkit.Chem import Descriptors
12
+ from utils import *
13
+ from rdkit.Chem import RDConfig
14
+ from rdkit.ML.Descriptors import MoleculeDescriptors
15
+
16
+ from rdkit.Contrib.SA_Score import sascorer
17
+
18
+
19
class MolSimilarity(BaseTool):
    """LangChain tool: Tanimoto similarity between two molecules given as 'SMILES.SMILES'."""

    name: str = "MolSimilarity"
    description: str = (
        "Input two molecule SMILES (separated by '.'), returns Tanimoto similarity."
    )

    def __init__(self):
        super().__init__()

    def _run(self, smiles_pair: str) -> str:
        # Guard clauses instead of nested if/else.
        parts = smiles_pair.split(".")
        if len(parts) != 2:
            return "Input error, please input two smiles strings separated by '.'"
        smiles1, smiles2 = parts

        similarity = tanimoto(smiles1, smiles2)

        # tanimoto() reports its own errors as strings; pass them straight through.
        if isinstance(similarity, str):
            return similarity

        if similarity == 1:
            return "Error: Input Molecules Are Identical"

        return f"The Tanimoto similarity between {smiles1} and {smiles2} is {round(similarity, 4)}"

    async def _arun(self, smiles_pair: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
51
+
52
class SMILES2Weight(BaseTool):
    """LangChain tool: exact molecular weight from a SMILES string."""

    name: str = "SMILES2Weight"
    description: str = "Input SMILES, returns molecular weight."

    def __init__(self):
        super().__init__()

    def _run(self, smiles: str) -> str:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES string"
        return rdMolDescriptors.CalcExactMolWt(mol)

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
72
class SMILES2LogP(BaseTool):
    """LangChain tool: Crippen LogP from a SMILES string."""

    name: str = "SMILES2LogP"
    description: str = "Input SMILES, returns molecular LogP."

    def __init__(self):
        super().__init__()

    def _run(self, smiles: str) -> str:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES string"
        return Descriptors.MolLogP(mol)

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
92
class SMILES2SAScore(BaseTool):
    """LangChain tool: synthetic accessibility (SA) score from a SMILES string."""

    name: str = "SMILES2SAScore"
    description: str = "Input SMILES, returns synthetic accessibility score to evaluate the difficulty of molecular synthesis."

    def __init__(self):
        super().__init__()

    def _run(self, smiles: str) -> str:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES string"
        score = sascorer.calculateScore(mol)
        return f"This SAScore of the molecule is {score}."

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
112
class SMILES2Properties(BaseTool):
    # LangChain tool: one-shot summary of basic physico-chemical descriptors
    # (SA score plus a fixed RDKit descriptor list) for a SMILES string.
    name: str = "SMILES2Properties"
    description: str = "Input SMILES, returns basic physical and chemical properties."

    def __init__(
        self,
    ):
        super().__init__()

    def _run(self, smiles: str) -> str:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES string"
        SAScore = sascorer.calculateScore(mol)
        # Descriptor names must match rdkit.Chem.Descriptors exactly; order
        # here determines the indices used in the summary string below.
        des_list = ['MolWt','NOCount', 'NumHAcceptors', 'NumHDonors', 'MolLogP', 'NumRotatableBonds','RingCount','NumAromaticRings','TPSA']
        calculator = MoleculeDescriptors.MolecularDescriptorCalculator(des_list)
        results = calculator.CalcDescriptors(mol)


        return f"SAScore: {'{:.2f}'.format(SAScore)}; molecular weight: {'{:.2f}'.format(results[0])}; number of Nitrogens and Oxygens: {results[1]}; number of Hydrogen Bond Acceptors: {results[2]}; number of Hydrogen Bond Donors:{results[3]}; LogP:{'{:.2f}'.format(results[4])}; number of Rotatable Bonds: {results[5]}; Ring count: {results[6]}; number of aromatic rings: {results[7]}; TPSA: {'{:.2f}'.format(results[8])}."

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+
137
class FuncGroups(BaseTool):
    # LangChain tool: list the functional groups present in a molecule by
    # matching a fixed table of SMARTS patterns.
    name: str = "FunctionalGroups"
    description: str = "Input SMILES, return list of functional groups in the molecule."
    # Mapping of human-readable group name -> SMARTS pattern.
    dict_fgs: dict = None

    def __init__(
        self,
    ):
        super().__init__()

        # List obtained from https://github.com/rdkit/rdkit/blob/master/Data/FunctionalGroups.txt
        # NOTE(review): "ketones" appears twice in this literal; the second
        # definition ("*=[O;D1]") silently overrides the first
        # (" [#6][CX3](=O)[#6]") — probably unintended, needs a distinct name.
        self.dict_fgs = {
            "furan": "o1cccc1",
            "aldehydes": " [CX3H1](=O)[#6]",
            "esters": " [#6][CX3](=O)[OX2H0][#6]",
            "ketones": " [#6][CX3](=O)[#6]",
            "amides": " C(=O)-N",
            "thiol groups": " [SH]",
            "alcohol groups": " [OH]",
            "methylamide": "*-[N;D2]-[C;D3](=O)-[C;D1;H3]",
            "carboxylic acids": "*-C(=O)[O;D1]",
            "carbonyl methylester": "*-C(=O)[O;D2]-[C;D1;H3]",
            "terminal aldehyde": "*-C(=O)-[C;D1]",
            "amide": "*-C(=O)-[N;D1]",
            "carbonyl methyl": "*-C(=O)-[C;D1;H3]",
            "isocyanate": "*-[N;D2]=[C;D2]=[O;D1]",
            "isothiocyanate": "*-[N;D2]=[C;D2]=[S;D1]",
            "nitro": "*-[N;D3](=[O;D1])[O;D1]",
            "nitroso": "*-[N;R0]=[O;D1]",
            "oximes": "*=[N;R0]-[O;D1]",
            "Imines": "*-[N;R0]=[C;D1;H2]",
            "terminal azo": "*-[N;D2]=[N;D2]-[C;D1;H3]",
            "hydrazines": "*-[N;D2]=[N;D1]",
            "diazo": "*-[N;D2]#[N;D1]",
            "cyano": "*-[C;D2]#[N;D1]",
            "primary sulfonamide": "*-[S;D4](=[O;D1])(=[O;D1])-[N;D1]",
            "methyl sulfonamide": "*-[N;D2]-[S;D4](=[O;D1])(=[O;D1])-[C;D1;H3]",
            "sulfonic acid": "*-[S;D4](=O)(=O)-[O;D1]",
            "methyl ester sulfonyl": "*-[S;D4](=O)(=O)-[O;D2]-[C;D1;H3]",
            "methyl sulfonyl": "*-[S;D4](=O)(=O)-[C;D1;H3]",
            "sulfonyl chloride": "*-[S;D4](=O)(=O)-[Cl]",
            "methyl sulfinyl": "*-[S;D3](=O)-[C;D1]",
            "methyl thio": "*-[S;D2]-[C;D1;H3]",
            "thiols": "*-[S;D1]",
            "thio carbonyls": "*=[S;D1]",
            "halogens": "*-[#9,#17,#35,#53]",
            "t-butyl": "*-[C;D4]([C;D1])([C;D1])-[C;D1]",
            "tri fluoromethyl": "*-[C;D4](F)(F)F",
            "acetylenes": "*-[C;D2]#[C;D1;H]",
            "cyclopropyl": "*-[C;D3]1-[C;D2]-[C;D2]1",
            "ethoxy": "*-[O;D2]-[C;D2]-[C;D1;H3]",
            "methoxy": "*-[O;D2]-[C;D1;H3]",
            "side-chain hydroxyls": "*-[O;D1]",
            "ketones": "*=[O;D1]",
            "primary amines": "*-[N;D1]",
            "nitriles": "*#[N;D1]",
        }

    def _is_fg_in_mol(self, mol, fg):
        # mol is a SMILES string here despite the name; it is parsed fresh
        # for every pattern checked.
        fgmol = Chem.MolFromSmarts(fg)
        mol = Chem.MolFromSmiles(mol.strip())
        return len(Chem.Mol.GetSubstructMatches(mol, fgmol, uniquify=True)) > 0

    def _run(self, smiles: str) -> str:
        """
        Input a molecule SMILES or name.
        Returns a list of functional groups identified by their common name (in natural language).
        """
        # NOTE(review): a molecule matching zero groups raises IndexError on
        # fgs_in_molec[0], which the broad except converts into the generic
        # "Wrong argument" message.
        try:
            fgs_in_molec = [
                name
                for name, fg in self.dict_fgs.items()
                if self._is_fg_in_mol(smiles, fg)
            ]
            if len(fgs_in_molec) > 1:
                return f"This molecule contains {', '.join(fgs_in_molec[:-1])}, and {fgs_in_molec[-1]}."
            else:
                return f"This molecule contains {fgs_in_molec[0]}."
        except:
            return "Wrong argument. Please input a valid molecular SMILES."

    async def _arun(self, smiles: str) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError()
+ raise NotImplementedError()
tool/rag.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 2 20:31:22 2025

@author: BM109X32G-10GPU-02
"""


from langchain.tools import BaseTool

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

from langchain import PromptTemplate
from langchain import HuggingFacePipeline

from langchain.base_language import BaseLanguageModel



from langchain.chains import RetrievalQA

from langchain_community.document_loaders import PyPDFLoader

from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from torch import cuda, bfloat16

# Pick the current CUDA device when a GPU is available, otherwise fall back to CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
from langchain_openai import OpenAIEmbeddings

# SECURITY NOTE(review): the OpenAI-compatible API key is hardcoded in source —
# it should be rotated and loaded from the environment/configuration instead.
embeddings = OpenAIEmbeddings(api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
base_url="https://www.dmxapi.com/v1")

# Module-level FAISS index shared by the `rag` tool below. The path is an
# absolute Windows path — presumably the author's machine; TODO make configurable.
# NOTE(review): allow_dangerous_deserialization=True unpickles index.pkl — only
# safe because the index files ship with this repository; confirm their origin.
vectorstore=FAISS.load_local(r"J:\libray\osc\tool\rag", embeddings,allow_dangerous_deserialization =True)


# Prompt used by the RetrievalQA "stuff" chain: {context} is filled with the
# retrieved documents, {question} with the user query.
template = """

You are an expert chemist and your task is to respond to the question or
solve the problem to the best of your ability.You can only respond with a single "Final Answer" format.
You need to list the key points and explain them in detail and accurately
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

Question: {question}
Answer:

"""
53
+
54
+
55
class rag(BaseTool):
    """Retrieval-augmented Q&A tool backed by the module-level FAISS vectorstore.

    Retrieves the top-5 most similar chunks and answers with a "stuff" chain
    using the module-level `template` prompt.
    """

    name: str = "RAG"
    # Bug fix: the first description sentence was truncated ("...require technical ").
    description: str = (
        "Useful to answer questions that require technical knowledge. "
        "Provide specialized knowledge information for solving Q&A questions"
        "Input query , return the response"
    )

    llm: BaseLanguageModel = None  # chat model, created in __init__
    path: str = None               # optional working path kept for callers

    def __init__(self, path: str = None):
        super().__init__()
        # SECURITY NOTE(review): hardcoded API key — rotate it and load from
        # the environment/configuration instead of source control.
        self.llm = ChatOpenAI(model="gpt-4o-2024-11-20",api_key='sk-itPrztYm9F6XZZpsBMJB9O7Vq0pYUABVVBSoThuBxEGTnDik',
                              base_url="https://www.dmxapi.com/v1")
        self.path = path
        # api keys

    def _run(self, query) -> str:
        """Answer `query` using retrieval-augmented generation; returns the answer text."""
        # Bug fix: the prompt template references both {context} and {question},
        # but only "question" was declared, leaving the declaration inconsistent
        # with what RetrievalQA actually supplies at format time.
        prompt = PromptTemplate(template=template, input_variables=["context", "question"])
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            # k=5: stuff the five most similar chunks into the prompt context.
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=False,
            chain_type_kwargs={"prompt": prompt},
        )
        # (removed an unused `chat_history = []` local)
        result = qa_chain.invoke(query)
        return result['result']

    async def _arun(self, query) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("this tool does not support async")
96
+
97
+
98
+
99
+
100
+
101
+
tool/rag/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50ab8fd7a0c8d9dd62ebb1592b542b2d2fb2730ce21b761d2b36e8b5087743cf
3
+ size 6942765
tool/rag/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908a85200a6d6e140c7d112f5c1b6b73376b42b2f85c3b3786f2050d6f1070d9
3
+ size 5545448
tool/search.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import langchain
5
+
6
+ import paperqa
7
+ import paperscraper
8
+ from langchain_community.utilities import SerpAPIWrapper
9
+ from langchain.base_language import BaseLanguageModel
10
+ from langchain.tools import BaseTool
11
+ from langchain_openai import OpenAIEmbeddings
12
+ from pypdf.errors import PdfReadError
13
+ from rdkit import Chem, DataStructs
14
+ from rdkit.Chem import AllChem
15
+
16
def is_smiles(text):
    """Return True if `text` parses as a SMILES string (no sanitization pass)."""
    try:
        # MolFromSmiles returns None for unparseable input rather than raising.
        return Chem.MolFromSmiles(text, sanitize=False) is not None
    except Exception:
        # Narrowed from a bare except (which also swallowed KeyboardInterrupt);
        # rdkit can still raise on non-string input, which we treat as "not SMILES".
        return False
24
+
25
+
26
+
27
def is_multiple_smiles(text):
    """True when `text` is valid SMILES made of several '.'-separated fragments."""
    return is_smiles(text) and "." in text
31
+
32
+
33
def split_smiles(text):
    """Split a multi-fragment SMILES string on the '.' fragment separator."""
    fragment_separator = "."
    return text.split(fragment_separator)
35
+
36
def paper_scrap(search: str, pdir: str = "query", semantic_scholar_api_key: str = None) -> dict:
    """Search paperscraper for papers matching `search`.

    Returns the {path: metadata} mapping from paperscraper, or an empty dict
    when the scrape fails with a KeyError.
    """
    try:
        results = paperscraper.search_papers(
            search,
            pdir=pdir,
            semantic_scholar_api_key=semantic_scholar_api_key,
        )
    except KeyError:
        return {}
    return results
45
+
46
+
47
def paper_search(llm, query, semantic_scholar_api_key=None):
    """Turn `query` into a short search string via `llm`, then scrape papers for it.

    Returns the {path: metadata} dict produced by `paper_scrap` (may be empty).
    """
    prompt = langchain.prompts.PromptTemplate(
        input_variables=["question"],
        template="""
        I would like to find scholarly papers to answer
        this question: {question}. Your response must be at
        most 10 words long.
        'A search query that would bring up papers that can answer
        this question would be: '""",
    )

    query_chain = langchain.chains.llm.LLMChain(llm=llm, prompt=prompt)
    # Bug fix: the original check-then-mkdir ("./query" vs "query/") was racy
    # and crashed if the directory appeared between the check and the mkdir.
    os.makedirs("query", exist_ok=True)  # todo: move to ckpt
    search = query_chain.run(query)
    print("\nSearch:", search)
    papers = paper_scrap(
        search,
        pdir=f"query/{re.sub(' ', '', search)}",
        semantic_scholar_api_key=semantic_scholar_api_key,
    )
    return papers
65
+
66
+
67
def scholar2result_llm(llm, query, k=5, max_sources=2, openai_api_key=None, semantic_scholar_api_key=None):
    """Useful to answer questions that require
    technical knowledge. Ask a specific question.

    Searches for papers with `paper_search`, loads them into a paperqa.Docs
    corpus, and returns the formatted answer (or a message if nothing was found).
    """
    papers = paper_search(llm, query, semantic_scholar_api_key=semantic_scholar_api_key)
    if not papers:
        return "Not enough papers found"
    docs = paperqa.Docs(
        llm=llm,
        summary_llm=llm,
        embeddings=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )
    not_loaded = 0
    for path, data in papers.items():
        try:
            docs.add(path, data["citation"])
        except (ValueError, FileNotFoundError, PdfReadError):
            # Skip papers that are missing, unparseable, or rejected by paperqa.
            not_loaded += 1

    # len(papers) replaces the needless len(papers.items()) view construction.
    if not_loaded > 0:
        print(f"\nFound {len(papers)} papers but couldn't load {not_loaded}.")
    else:
        print(f"\nFound {len(papers)} papers and loaded all of them.")

    answer = docs.query(query, k=k, max_sources=max_sources).formatted_answer
    return answer
92
+
93
+
94
# LangChain tool that answers technical questions by scraping and reading papers.
class Scholar2ResultLLM(BaseTool):
    name : str = "LiteratureSearch"
    description: str = (
        "Useful to answer questions that require technical "
        "knowledge. Ask a specific question."
    )
    # Dependencies and credentials, populated in __init__.
    llm: BaseLanguageModel = None
    openai_api_key: str = None
    semantic_scholar_api_key: str = None


    def __init__(self, llm, openai_api_key, semantic_scholar_api_key):
        super().__init__()
        self.llm = llm
        # api keys
        self.openai_api_key = openai_api_key
        self.semantic_scholar_api_key = semantic_scholar_api_key

    def _run(self, query) -> str:
        # Delegates to the module-level pipeline:
        # LLM -> search string -> paper scrape -> paperqa answer.
        return scholar2result_llm(
            self.llm,
            query,
            openai_api_key=self.openai_api_key,
            semantic_scholar_api_key=self.semantic_scholar_api_key
        )

    async def _arun(self, query) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("this tool does not support async")
123
+
124
+
125
def web_search(keywords, search_engine="google",
               serp_api_key='3795acda6a74ea15033d34b54eac82982b26f559147d9cf04aca4bfca91c3e9d'):
    """Run a SerpAPI web search for `keywords` and return the result text.

    `serp_api_key` is new and defaults to the previously hardcoded key, so
    existing callers are unaffected while new callers can supply their own.
    SECURITY NOTE(review): the default key is committed to source control —
    rotate it and supply the key via configuration.
    """
    try:
        return SerpAPIWrapper(
            serpapi_api_key=serp_api_key, search_engine=search_engine
        ).run(keywords)
    except Exception:
        # Narrowed from a bare except; network/quota/key failures degrade to a hint.
        return "No results, try another search"
132
+
133
+
134
# LangChain tool wrapper around the module-level `web_search` helper.
class WebSearch(BaseTool):
    name: str = "WebSearch"
    description: str = (
        "Input a specific question, returns an answer from web search. "
        "Give more detailed information and use more general features to formulate your questions."
    )
    # Caller-supplied SerpAPI key; see NOTE in _run about how it is used.
    serp_api_key: str = None

    def __init__(self, serp_api_key: str = None):
        super().__init__()
        self.serp_api_key = serp_api_key

    def _run(self, query: str) -> str:
        # The key acts only as an availability gate here.
        if not self.serp_api_key:
            return (
                "No SerpAPI key found. This tool may not be used without a SerpAPI key."
            )
        # NOTE(review): self.serp_api_key is never forwarded — web_search uses
        # its own embedded key; confirm whether it should be passed through.
        return web_search(query)

    async def _arun(self, query: str) -> str:
        raise NotImplementedError("Async not implemented")
155
+
156
+