ynuozhang commited on
Commit ·
e66c5e2
1
Parent(s): 78e29df
add light install
Browse files- README.md +14 -0
- basic_models.txt +10 -0
- download_light.py +145 -0
- environment.yml +0 -434
- inference.py +1 -1
- tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py +0 -398
- tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
README.md
CHANGED
|
@@ -27,7 +27,21 @@ This is the repository for [PeptiVerse: A Unified Platform for Therapeutic Pepti
|
|
| 27 |
- [Citation](#citation)
|
| 28 |
|
| 29 |
## Quick Start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
```bash
|
| 32 |
# Clone repository
|
| 33 |
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
|
|
|
| 27 |
- [Citation](#citation)
|
| 28 |
|
| 29 |
## Quick Start
|
| 30 |
+
- Lightweight start (basic models, no cuML; read below for details)
|
| 31 |
+
```bash
|
| 32 |
+
# Ignore all LFS files
|
| 33 |
+
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
| 34 |
+
|
| 35 |
+
# Install basic pkgs
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
|
| 38 |
+
# Download the basic model weights listed in basic_models.txt. Adjust the config as needed.
|
| 39 |
+
python download_light.py
|
| 40 |
+
|
| 41 |
+
# Test in inference
|
| 42 |
+
python inference.py
|
| 43 |
+
```
|
| 44 |
+
- Full model clone (will clone all model weights)
|
| 45 |
```bash
|
| 46 |
# Clone repository
|
| 47 |
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
basic_models.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
| 2 |
+
Hemolysis, XGB, Transformer, Classifier, 0.2521, 0.4343,
|
| 3 |
+
Non-Fouling, MLP, XGB, Classifier, 0.57, 0.6969,
|
| 4 |
+
Solubility, CNN, -, Classifier, 0.377, -,
|
| 5 |
+
Permeability (Penetrance), XGB, -, Classifier, 0.5493, -,
|
| 6 |
+
Toxicity, -, Transformer, Classifier, -, 0.3401,
|
| 7 |
+
Binding_affinity, unpooled, unpooled, Regression, -, -,
|
| 8 |
+
Permeability_PAMPA, -, CNN, Regression, -, -,
|
| 9 |
+
Permeability_CACO2, -, SVR, Regression, -, -,
|
| 10 |
+
Halflife, Transformer, XGB, Regression, -, -,
|
download_light.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from huggingface_hub import snapshot_download
|
| 9 |
+
from inference import (
|
| 10 |
+
PeptiVersePredictor,
|
| 11 |
+
read_best_manifest_csv,
|
| 12 |
+
canon_model,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# -----------------------------
# Config
# -----------------------------
# Hugging Face repository that hosts the PeptiVerse model weights.
MODEL_REPO = "ChatterjeeLab/PeptiVerse"
DEFAULT_ASSETS_DIR = Path("./")  # where downloaded models live
# CSV manifest listing the best model per property (shipped as basic_models.txt).
DEFAULT_MANIFEST = Path("./basic_models.txt")

# Model families excluded from the light download; build_allow_patterns_from_manifest
# substitutes "xgb" for any of these.
# NOTE(review): presumably excluded because they require cuML/GPU wheels — confirm.
BANNED_MODELS = {"svm", "enet", "svm_gpu", "enet_gpu"}
|
| 25 |
+
def _norm_prop_disk(prop_key: str) -> str:
|
| 26 |
+
return "half_life" if prop_key == "halflife" else prop_key
|
| 27 |
+
|
| 28 |
+
def _resolve_expected_model_dir(prop_key: str, model_name: str, mode: str) -> str:
    """Return the repo-relative folder holding a property's model artifacts.

    ``prop_key`` is the manifest property key, ``model_name`` the canonical
    model label, and ``mode`` either "wt" or "smiles". Most properties use
    the ``training_classifiers/<prop>/<model>_<mode>`` layout; binding
    affinity and half-life have bespoke folder names handled below.
    """
    base = f"training_classifiers/{_norm_prop_disk(prop_key)}"

    # Binding affinity is special: its manifest label is "pooled"/"unpooled"
    # and the folder is named wt_<mode>_<pooled|unpooled>.
    if prop_key == "binding_affinity":
        return f"{base}/wt_{mode}_{model_name}"

    # Half-life folders do not follow the <model>_<mode> convention.
    if prop_key == "halflife":
        if model_name in ("xgb_wt_log", "xgb_smiles"):
            return f"{base}/{model_name}"
        if model_name == "transformer" and mode == "wt":
            return f"{base}/transformer_wt_log"
        if model_name == "xgb":
            folder = "xgb_wt_log" if mode == "wt" else "xgb_smiles"
            return f"{base}/{folder}"

    # Default layout shared by the remaining properties.
    return f"{base}/{model_name}_{mode}"
| 49 |
+
def build_allow_patterns_from_manifest(manifest_path: Path) -> List[str]:
    """Translate the best-model manifest into hf-hub ``allow_patterns``.

    For every property in the manifest, the best wt and smiles models are
    resolved to their repo folders, and only the "basic" artifact files in
    each folder are requested (not the whole folder contents). Duplicates
    are removed while preserving first-seen order.
    """
    manifest = read_best_manifest_csv(manifest_path)

    patterns: List[str] = []
    for prop_key, row in manifest.items():
        for mode, label in (("wt", row.best_wt), ("smiles", row.best_smiles)):
            model = canon_model(label)
            if model is None:
                continue

            # Models not shipped in the light install fall back to XGB
            # (presumably the banned families need cuML/GPU — confirm).
            if model in BANNED_MODELS:
                model = "xgb"

            folder = _resolve_expected_model_dir(prop_key, model, mode)

            # Only the lightweight artifacts, not everything in the folder.
            patterns.extend(
                f"{folder}/{artifact}"
                for artifact in (
                    "best_model.json",
                    "best_model.pt",
                    "best_model*.joblib",
                    "best_model*.json",
                )
            )

    # dict.fromkeys deduplicates while keeping insertion order.
    return list(dict.fromkeys(patterns))
| 82 |
+
|
| 83 |
+
def download_assets(
    repo_id: str,
    manifest_path: Path,
    out_dir: Path,
) -> Path:
    """Mirror the manifest-selected model artifacts from the Hub.

    Resolves and creates ``out_dir``, then snapshots only the files whose
    paths match the manifest-derived allow patterns. Returns the resolved
    output directory containing the downloaded weights.
    """
    target = out_dir.resolve()
    target.mkdir(parents=True, exist_ok=True)

    # Restrict the snapshot to exactly the artifacts the manifest asks for.
    patterns = build_allow_patterns_from_manifest(manifest_path)

    snapshot_download(
        repo_id=repo_id,
        local_dir=str(target),
        # Materialize real files instead of symlinks into the HF cache.
        local_dir_use_symlinks=False,
        allow_patterns=patterns,
    )
    return target
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# -----------------------------
|
| 103 |
+
# Main
|
| 104 |
+
# -----------------------------
|
| 105 |
+
def main():
    """CLI entry point: download the basic model weights listed in the manifest.

    Parses the command line, validates that the manifest file exists, and
    mirrors the selected artifacts from the Hub into the assets directory.
    The prediction-related flags (--property, --mode, --input, --target_seq,
    --binder, --device) are accepted so the optional smoke-test snippet at
    the bottom can be re-enabled without changing the CLI.

    Raises:
        FileNotFoundError: if the manifest path does not exist.
    """
    import argparse

    ap = argparse.ArgumentParser(description="Lightweight PeptiVerse inference with on-demand model download.")
    ap.add_argument("--repo", default=MODEL_REPO, help="HF repo id containing weights/assets.")
    # Fix: help text previously said "best_models.txt", but the shipped
    # manifest (and DEFAULT_MANIFEST) is basic_models.txt.
    ap.add_argument("--manifest", default=str(DEFAULT_MANIFEST), help="Path to basic_models.txt")
    ap.add_argument("--assets", default=str(DEFAULT_ASSETS_DIR), help="Where to store downloaded assets")
    ap.add_argument("--device", default=None, help="cuda / cpu / cuda:0, etc")

    ap.add_argument("--property", default="hemolysis", help="Property key (e.g. hemolysis, solubility, ...)")
    ap.add_argument("--mode", default="wt", choices=["wt", "smiles"], help="Input type: wt=AA sequence, smiles=SMILES")
    ap.add_argument("--input", default="GIGAVLKVLTTGLPALISWIKRKRQQ", help="Sequence or SMILES string")
    ap.add_argument("--target_seq", default=None, help="Target WT sequence for binding_affinity")
    ap.add_argument("--binder", default=None, help="Binder string (AA or SMILES) for binding_affinity")
    args = ap.parse_args()

    manifest_path = Path(args.manifest)
    if not manifest_path.exists():
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")

    assets_dir = download_assets(args.repo, manifest_path=manifest_path, out_dir=Path(args.assets))

    # OPTIONAL TEST CODE — uncomment to run a prediction after downloading.
    # (Was a bare triple-quoted string; as comments it is no longer built
    # and discarded at runtime.)
    # predictor = PeptiVersePredictor(
    #     manifest_path="basic_models.txt",  # use the downloaded copy to be consistent
    #     classifier_weight_root=str(assets_dir),
    #     device=args.device,
    # )
    # if args.property == "binding_affinity":
    #     if not args.target_seq or not args.binder:
    #         raise ValueError("For binding_affinity, provide --target_seq and --binder.")
    #     out = predictor.predict_binding_affinity(args.mode, target_seq=args.target_seq, binder_str=args.binder)
    # else:
    #     out = predictor.predict_property(args.property, args.mode, args.input)
    # print(out)


if __name__ == "__main__":
    main()
|
environment.yml
DELETED
|
@@ -1,434 +0,0 @@
|
|
| 1 |
-
name: metal
|
| 2 |
-
channels:
|
| 3 |
-
- conda-forge
|
| 4 |
-
- omnia
|
| 5 |
-
- defaults
|
| 6 |
-
dependencies:
|
| 7 |
-
- _libgcc_mutex=0.1=main
|
| 8 |
-
- _openmp_mutex=5.1=1_gnu
|
| 9 |
-
- bzip2=1.0.8=h5eee18b_6
|
| 10 |
-
- ca-certificates=2025.11.12=hbd8a1cb_0
|
| 11 |
-
- certifi=2025.11.12=pyhd8ed1ab_0
|
| 12 |
-
- expat=2.7.1=h6a678d5_0
|
| 13 |
-
- git-lfs=3.7.1=h6138981_0
|
| 14 |
-
- ld_impl_linux-64=2.40=h12ee557_0
|
| 15 |
-
- libffi=3.4.4=h6a678d5_1
|
| 16 |
-
- libgcc-ng=11.2.0=h1234567_1
|
| 17 |
-
- libgomp=11.2.0=h1234567_1
|
| 18 |
-
- libstdcxx-ng=11.2.0=h1234567_1
|
| 19 |
-
- libuuid=1.41.5=h5eee18b_0
|
| 20 |
-
- libxcb=1.17.0=h9b100fa_0
|
| 21 |
-
- ncurses=6.5=h7934f7d_0
|
| 22 |
-
- openssl=3.0.17=h5eee18b_0
|
| 23 |
-
- pip=25.1=pyhc872135_2
|
| 24 |
-
- pthread-stubs=0.3=h0ce48e5_1
|
| 25 |
-
- python=3.10.18=h1a3bd86_0
|
| 26 |
-
- readline=8.3=hc2a1206_0
|
| 27 |
-
- ripgrep=13.0.0=h2f28480_2
|
| 28 |
-
- sqlite=3.50.2=hb25bd0a_1
|
| 29 |
-
- tk=8.6.15=h54e0aa7_0
|
| 30 |
-
- wheel=0.45.1=py310h06a4308_0
|
| 31 |
-
- xorg-libx11=1.8.12=h9b100fa_1
|
| 32 |
-
- xorg-libxau=1.0.12=h9b100fa_0
|
| 33 |
-
- xorg-libxdmcp=1.1.5=h9b100fa_0
|
| 34 |
-
- xorg-xorgproto=2024.1=h5eee18b_1
|
| 35 |
-
- xz=5.6.4=h5eee18b_1
|
| 36 |
-
- zlib=1.2.13=h5eee18b_1
|
| 37 |
-
- pip:
|
| 38 |
-
- about-time==4.2.1
|
| 39 |
-
- absl-py==2.3.1
|
| 40 |
-
- accelerate==1.10.0
|
| 41 |
-
- aiofiles==23.2.1
|
| 42 |
-
- aiohappyeyeballs==2.6.1
|
| 43 |
-
- aiohttp==3.12.15
|
| 44 |
-
- aiosignal==1.4.0
|
| 45 |
-
- alembic==1.17.2
|
| 46 |
-
- alive-progress==3.3.0
|
| 47 |
-
- anndata==0.11.4
|
| 48 |
-
- annotated-doc==0.0.3
|
| 49 |
-
- annotated-types==0.7.0
|
| 50 |
-
- antlr4-python3-runtime==4.9.3
|
| 51 |
-
- anyio==4.10.0
|
| 52 |
-
- argon2-cffi==25.1.0
|
| 53 |
-
- argon2-cffi-bindings==25.1.0
|
| 54 |
-
- array-api-compat==1.12.0
|
| 55 |
-
- arrow==1.3.0
|
| 56 |
-
- ase==3.26.0
|
| 57 |
-
- astroid==3.3.11
|
| 58 |
-
- asttokens==3.0.0
|
| 59 |
-
- astunparse==1.6.3
|
| 60 |
-
- async-lru==2.0.5
|
| 61 |
-
- async-timeout==5.0.1
|
| 62 |
-
- attrs==25.3.0
|
| 63 |
-
- autograd==1.8.0
|
| 64 |
-
- autopep8==2.0.4
|
| 65 |
-
- babel==2.17.0
|
| 66 |
-
- beautifulsoup4==4.13.4
|
| 67 |
-
- biopython==1.85
|
| 68 |
-
- biotite==0.41.2
|
| 69 |
-
- biotraj==1.2.2
|
| 70 |
-
- black==25.1.0
|
| 71 |
-
- bleach==6.2.0
|
| 72 |
-
- blosum==2.0.3
|
| 73 |
-
- bokeh==3.6.3
|
| 74 |
-
- brotli==1.2.0
|
| 75 |
-
- cachetools==6.2.3
|
| 76 |
-
- cffi==1.17.1
|
| 77 |
-
- cftime==1.6.4.post1
|
| 78 |
-
- charset-normalizer==3.4.3
|
| 79 |
-
- click==8.2.1
|
| 80 |
-
- cloudpathlib==0.23.0
|
| 81 |
-
- cloudpickle==3.1.2
|
| 82 |
-
- cma==4.4.0
|
| 83 |
-
- colorama==0.4.6
|
| 84 |
-
- colorcet==3.1.0
|
| 85 |
-
- colorlog==6.10.1
|
| 86 |
-
- comm==0.2.3
|
| 87 |
-
- contourpy==1.3.2
|
| 88 |
-
- cucim-cu12==25.12.0
|
| 89 |
-
- cuda-bindings==12.9.4
|
| 90 |
-
- cuda-core==0.3.2
|
| 91 |
-
- cuda-pathfinder==1.3.3
|
| 92 |
-
- cuda-python==12.9.4
|
| 93 |
-
- cuda-toolkit==12.9.1
|
| 94 |
-
- cudf-cu12==25.12.0
|
| 95 |
-
- cugraph-cu12==25.12.2
|
| 96 |
-
- cuml-cu12==25.12.0
|
| 97 |
-
- cupy-cuda12x==13.6.0
|
| 98 |
-
- cuvs-cu12==25.12.0
|
| 99 |
-
- cuxfilter-cu12==25.12.0
|
| 100 |
-
- cycler==0.12.1
|
| 101 |
-
- dask==2025.9.1
|
| 102 |
-
- dask-cuda==25.12.0
|
| 103 |
-
- dask-cudf-cu12==25.12.0
|
| 104 |
-
- datasets==4.0.0
|
| 105 |
-
- datashader==0.18.2
|
| 106 |
-
- debugpy==1.8.16
|
| 107 |
-
- decorator==5.2.1
|
| 108 |
-
- deeptime==0.4.5
|
| 109 |
-
- defusedxml==0.7.1
|
| 110 |
-
- deprecated==1.2.18
|
| 111 |
-
- dill==0.3.8
|
| 112 |
-
- distributed==2025.9.1
|
| 113 |
-
- distributed-ucxx-cu12==0.47.0
|
| 114 |
-
- docstring-to-markdown==0.17
|
| 115 |
-
- einops==0.8.1
|
| 116 |
-
- exceptiongroup==1.3.0
|
| 117 |
-
- executing==2.2.0
|
| 118 |
-
- fair-esm==2.0.0
|
| 119 |
-
- fastapi==0.121.0
|
| 120 |
-
- fastjsonschema==2.21.2
|
| 121 |
-
- fastprogress==1.0.3
|
| 122 |
-
- fastrlock==0.8.3
|
| 123 |
-
- ffmpy==0.6.4
|
| 124 |
-
- filelock==3.19.1
|
| 125 |
-
- flake8==7.1.2
|
| 126 |
-
- flatbuffers==25.9.23
|
| 127 |
-
- fonttools==4.60.0
|
| 128 |
-
- fqdn==1.5.1
|
| 129 |
-
- frozenlist==1.7.0
|
| 130 |
-
- fsspec==2025.3.0
|
| 131 |
-
- gast==0.6.0
|
| 132 |
-
- gensim==4.4.0
|
| 133 |
-
- geopandas==1.1.1
|
| 134 |
-
- gitdb==4.0.12
|
| 135 |
-
- gitpython==3.1.45
|
| 136 |
-
- google-pasta==0.2.0
|
| 137 |
-
- gradio==5.20.1
|
| 138 |
-
- gradio-client==1.7.2
|
| 139 |
-
- graphemeu==0.7.2
|
| 140 |
-
- greenlet==3.3.0
|
| 141 |
-
- groovy==0.1.2
|
| 142 |
-
- grpcio==1.76.0
|
| 143 |
-
- h11==0.16.0
|
| 144 |
-
- h5py==3.14.0
|
| 145 |
-
- hf-xet==1.1.8
|
| 146 |
-
- holoviews==1.20.2
|
| 147 |
-
- httpcore==1.0.9
|
| 148 |
-
- httpx==0.28.1
|
| 149 |
-
- huggingface-hub==0.35.3
|
| 150 |
-
- humanfriendly==10.0
|
| 151 |
-
- hydra-core==1.3.2
|
| 152 |
-
- idna==3.10
|
| 153 |
-
- igraph==1.0.0
|
| 154 |
-
- imageio==2.37.0
|
| 155 |
-
- importlib-metadata==8.7.0
|
| 156 |
-
- inquirerpy==0.3.4
|
| 157 |
-
- ipykernel==6.30.1
|
| 158 |
-
- ipython==8.37.0
|
| 159 |
-
- isoduration==20.11.0
|
| 160 |
-
- isort==6.0.1
|
| 161 |
-
- iterative-stratification==0.1.9
|
| 162 |
-
- jedi==0.19.2
|
| 163 |
-
- jinja2==3.1.6
|
| 164 |
-
- joblib==1.5.1
|
| 165 |
-
- json5==0.12.1
|
| 166 |
-
- jsonpointer==3.0.0
|
| 167 |
-
- jsonschema==4.25.1
|
| 168 |
-
- jsonschema-specifications==2025.4.1
|
| 169 |
-
- jupyter-client==8.6.3
|
| 170 |
-
- jupyter-core==5.8.1
|
| 171 |
-
- jupyter-events==0.12.0
|
| 172 |
-
- jupyter-lsp==2.2.6
|
| 173 |
-
- jupyter-server==2.16.0
|
| 174 |
-
- jupyter-server-mathjax==0.2.6
|
| 175 |
-
- jupyter-server-proxy==4.4.0
|
| 176 |
-
- jupyter-server-terminals==0.5.3
|
| 177 |
-
- jupyterlab==4.4.6
|
| 178 |
-
- jupyterlab-code-formatter==3.0.2
|
| 179 |
-
- jupyterlab-git==0.51.2
|
| 180 |
-
- jupyterlab-lsp==5.2.0
|
| 181 |
-
- jupyterlab-pygments==0.3.0
|
| 182 |
-
- jupyterlab-search-replace==1.1.0
|
| 183 |
-
- jupyterlab-server==2.27.3
|
| 184 |
-
- jupyterlab-spreadsheet-editor==0.7.2
|
| 185 |
-
- keras==3.12.0
|
| 186 |
-
- kiwisolver==1.4.9
|
| 187 |
-
- lark==1.2.2
|
| 188 |
-
- lazy-loader==0.4
|
| 189 |
-
- lckr-jupyterlab-variableinspector==3.2.4
|
| 190 |
-
- legacy-api-wrap==1.5
|
| 191 |
-
- leidenalg==0.11.0
|
| 192 |
-
- libclang==18.1.1
|
| 193 |
-
- libcudf-cu12==25.12.0
|
| 194 |
-
- libcugraph-cu12==25.12.2
|
| 195 |
-
- libcuml-cu12==25.12.0
|
| 196 |
-
- libcuvs-cu12==25.12.0
|
| 197 |
-
- libkvikio-cu12==25.12.0
|
| 198 |
-
- libraft-cu12==25.12.0
|
| 199 |
-
- librmm-cu12==25.12.0
|
| 200 |
-
- libucx-cu12==1.19.0
|
| 201 |
-
- libucxx-cu12==0.47.0
|
| 202 |
-
- lightning==2.5.5
|
| 203 |
-
- lightning-utilities==0.15.2
|
| 204 |
-
- linearboost==0.1.4
|
| 205 |
-
- linkify-it-py==2.0.3
|
| 206 |
-
- llvmlite==0.44.0
|
| 207 |
-
- locket==1.0.0
|
| 208 |
-
- loguru==0.7.3
|
| 209 |
-
- mako==1.3.10
|
| 210 |
-
- markdown==3.9
|
| 211 |
-
- markdown-it-py==4.0.0
|
| 212 |
-
- markupsafe==2.1.5
|
| 213 |
-
- matplotlib==3.10.6
|
| 214 |
-
- matplotlib-inline==0.1.7
|
| 215 |
-
- mccabe==0.7.0
|
| 216 |
-
- mdit-py-plugins==0.5.0
|
| 217 |
-
- mdshare==0.4.2
|
| 218 |
-
- mdtraj==1.10.3
|
| 219 |
-
- mdurl==0.1.2
|
| 220 |
-
- mistune==3.1.3
|
| 221 |
-
- ml-dtypes==0.5.3
|
| 222 |
-
- mpmath==1.3.0
|
| 223 |
-
- msgpack==1.1.2
|
| 224 |
-
- msgpack-numpy==0.4.8
|
| 225 |
-
- multidict==6.6.4
|
| 226 |
-
- multipledispatch==1.0.0
|
| 227 |
-
- multiprocess==0.70.16
|
| 228 |
-
- mypy-extensions==1.1.0
|
| 229 |
-
- namex==0.1.0
|
| 230 |
-
- natsort==8.4.0
|
| 231 |
-
- nbclient==0.10.2
|
| 232 |
-
- nbconvert==7.16.6
|
| 233 |
-
- nbdime==4.0.2
|
| 234 |
-
- nbformat==5.10.4
|
| 235 |
-
- nest-asyncio==1.6.0
|
| 236 |
-
- netcdf4==1.7.2
|
| 237 |
-
- networkx==3.4.2
|
| 238 |
-
- notebook==7.4.5
|
| 239 |
-
- notebook-shim==0.2.4
|
| 240 |
-
- numba==0.61.2
|
| 241 |
-
- numba-cuda==0.19.1
|
| 242 |
-
- numpy==1.26.4
|
| 243 |
-
- nvidia-cublas-cu12==12.8.4.1
|
| 244 |
-
- nvidia-cuda-cccl-cu12==12.9.27
|
| 245 |
-
- nvidia-cuda-cupti-cu12==12.8.90
|
| 246 |
-
- nvidia-cuda-nvcc-cu12==12.9.86
|
| 247 |
-
- nvidia-cuda-nvrtc-cu12==12.8.93
|
| 248 |
-
- nvidia-cuda-runtime-cu12==12.8.90
|
| 249 |
-
- nvidia-cudnn-cu12==9.10.2.21
|
| 250 |
-
- nvidia-cufft-cu12==11.3.3.83
|
| 251 |
-
- nvidia-cufile-cu12==1.13.1.3
|
| 252 |
-
- nvidia-curand-cu12==10.3.9.90
|
| 253 |
-
- nvidia-cusolver-cu12==11.7.3.90
|
| 254 |
-
- nvidia-cusparse-cu12==12.5.8.93
|
| 255 |
-
- nvidia-cusparselt-cu12==0.7.1
|
| 256 |
-
- nvidia-ml-py==13.590.44
|
| 257 |
-
- nvidia-nccl-cu12==2.27.3
|
| 258 |
-
- nvidia-nvimgcodec-cu12==0.6.1.37
|
| 259 |
-
- nvidia-nvjitlink-cu12==12.8.93
|
| 260 |
-
- nvidia-nvtx-cu12==12.8.90
|
| 261 |
-
- nvtx==0.2.14
|
| 262 |
-
- nx-cugraph-cu12==25.12.0
|
| 263 |
-
- omegaconf==2.3.0
|
| 264 |
-
- opt-einsum==3.4.0
|
| 265 |
-
- optree==0.17.0
|
| 266 |
-
- optuna==4.6.0
|
| 267 |
-
- orjson==3.11.4
|
| 268 |
-
- overrides==7.7.0
|
| 269 |
-
- p2smi==1.1.1
|
| 270 |
-
- packaging==25.0
|
| 271 |
-
- pandas==2.3.2
|
| 272 |
-
- pandocfilters==1.5.1
|
| 273 |
-
- panel==1.7.5
|
| 274 |
-
- param==2.3.1
|
| 275 |
-
- paretoflow==0.1.5
|
| 276 |
-
- parso==0.8.4
|
| 277 |
-
- partd==1.4.2
|
| 278 |
-
- pathos==0.3.2
|
| 279 |
-
- pathspec==0.12.1
|
| 280 |
-
- patsy==1.0.2
|
| 281 |
-
- peft==0.17.1
|
| 282 |
-
- pexpect==4.9.0
|
| 283 |
-
- pfzy==0.3.4
|
| 284 |
-
- pillow==11.3.0
|
| 285 |
-
- platformdirs==4.3.8
|
| 286 |
-
- pluggy==1.6.0
|
| 287 |
-
- pox==0.3.6
|
| 288 |
-
- ppft==1.7.7
|
| 289 |
-
- prdc==0.2
|
| 290 |
-
- prometheus-client==0.22.1
|
| 291 |
-
- prompt-toolkit==3.0.51
|
| 292 |
-
- propcache==0.3.2
|
| 293 |
-
- protobuf==6.32.0
|
| 294 |
-
- psutil==7.0.0
|
| 295 |
-
- ptyprocess==0.7.0
|
| 296 |
-
- pure-eval==0.2.3
|
| 297 |
-
- pyarrow==21.0.0
|
| 298 |
-
- pycodestyle==2.12.1
|
| 299 |
-
- pycparser==2.22
|
| 300 |
-
- pyct==0.6.0
|
| 301 |
-
- pydantic==2.11.9
|
| 302 |
-
- pydantic-core==2.33.2
|
| 303 |
-
- pydocstyle==6.3.0
|
| 304 |
-
- pydub==0.25.1
|
| 305 |
-
- pyemma==2.5.12
|
| 306 |
-
- pyflakes==3.2.0
|
| 307 |
-
- pygments==2.19.2
|
| 308 |
-
- pylibcudf-cu12==25.12.0
|
| 309 |
-
- pylibcugraph-cu12==25.12.2
|
| 310 |
-
- pylibraft-cu12==25.12.0
|
| 311 |
-
- pylint==3.3.8
|
| 312 |
-
- pymoo==0.6.1.5
|
| 313 |
-
- pynndescent==0.5.13
|
| 314 |
-
- pyogrio==0.12.1
|
| 315 |
-
- pyparsing==3.2.5
|
| 316 |
-
- pyproj==3.7.1
|
| 317 |
-
- python-dateutil==2.9.0.post0
|
| 318 |
-
- python-json-logger==3.3.0
|
| 319 |
-
- python-lsp-jsonrpc==1.1.2
|
| 320 |
-
- python-lsp-server==1.13.0
|
| 321 |
-
- python-multipart==0.0.20
|
| 322 |
-
- pytoolconfig==1.3.1
|
| 323 |
-
- pytorch-lightning==2.5.5
|
| 324 |
-
- pytorch-lightning-bolts==0.3.2.post1
|
| 325 |
-
- pytorch-metric-learning==2.9.0
|
| 326 |
-
- pytz==2025.2
|
| 327 |
-
- pyviz-comms==3.0.6
|
| 328 |
-
- pyyaml==6.0.2
|
| 329 |
-
- pyzmq==27.0.1
|
| 330 |
-
- raft-dask-cu12==25.12.0
|
| 331 |
-
- rapids-dask-dependency==25.12.0
|
| 332 |
-
- rapids-logger==0.2.3
|
| 333 |
-
- rdkit==2025.9.1
|
| 334 |
-
- referencing==0.36.2
|
| 335 |
-
- regex==2025.7.34
|
| 336 |
-
- requests==2.32.5
|
| 337 |
-
- requests-toolbelt==1.0.0
|
| 338 |
-
- rfc3339-validator==0.1.4
|
| 339 |
-
- rfc3986-validator==0.1.1
|
| 340 |
-
- rfc3987-syntax==1.1.0
|
| 341 |
-
- rich==14.1.0
|
| 342 |
-
- rmm-cu12==25.12.0
|
| 343 |
-
- rope==1.14.0
|
| 344 |
-
- rpds-py==0.27.0
|
| 345 |
-
- ruff==0.14.3
|
| 346 |
-
- safehttpx==0.1.7
|
| 347 |
-
- safetensors==0.6.2
|
| 348 |
-
- scanpy==1.11.5
|
| 349 |
-
- schedulefree==1.4.1
|
| 350 |
-
- scikit-image==0.25.2
|
| 351 |
-
- scikit-learn==1.7.1
|
| 352 |
-
- scipy==1.15.3
|
| 353 |
-
- seaborn==0.13.2
|
| 354 |
-
- semantic-version==2.10.0
|
| 355 |
-
- send2trash==1.8.3
|
| 356 |
-
- sentry-sdk==2.35.0
|
| 357 |
-
- session-info2==0.2.3
|
| 358 |
-
- setuptools==80.9.0
|
| 359 |
-
- shapely==2.0.7
|
| 360 |
-
- shellingham==1.5.4
|
| 361 |
-
- simpervisor==1.0.0
|
| 362 |
-
- six==1.17.0
|
| 363 |
-
- smart-open==7.4.3
|
| 364 |
-
- smilespe==0.0.3
|
| 365 |
-
- smmap==5.0.2
|
| 366 |
-
- sniffio==1.3.1
|
| 367 |
-
- snowballstemmer==3.0.1
|
| 368 |
-
- sortedcontainers==2.4.0
|
| 369 |
-
- soupsieve==2.7
|
| 370 |
-
- sqlalchemy==2.0.45
|
| 371 |
-
- stack-data==0.6.3
|
| 372 |
-
- starlette==0.49.3
|
| 373 |
-
- statsmodels==0.14.5
|
| 374 |
-
- sympy==1.14.0
|
| 375 |
-
- tblib==3.2.2
|
| 376 |
-
- tenacity==9.1.2
|
| 377 |
-
- tensorboard==2.20.0
|
| 378 |
-
- tensorboard-data-server==0.7.2
|
| 379 |
-
- tensorflow==2.20.0
|
| 380 |
-
- termcolor==3.2.0
|
| 381 |
-
- terminado==0.18.1
|
| 382 |
-
- texttable==1.7.0
|
| 383 |
-
- threadpoolctl==3.6.0
|
| 384 |
-
- tifffile==2025.5.10
|
| 385 |
-
- timm==1.0.22
|
| 386 |
-
- tinycss2==1.4.0
|
| 387 |
-
- tokenizers==0.20.3
|
| 388 |
-
- tomli==2.2.1
|
| 389 |
-
- tomlkit==0.13.3
|
| 390 |
-
- toolz==1.1.0
|
| 391 |
-
- torch==2.8.0
|
| 392 |
-
- torch-geometric==2.6.1
|
| 393 |
-
- torchaudio==2.8.0+cu128
|
| 394 |
-
- torchmetrics==1.8.2
|
| 395 |
-
- torchtext==0.18.0
|
| 396 |
-
- torchvision==0.23.0+cu128
|
| 397 |
-
- tornado==6.5.2
|
| 398 |
-
- tqdm==4.67.1
|
| 399 |
-
- traitlets==5.14.3
|
| 400 |
-
- transformers==4.46.0
|
| 401 |
-
- treelite==4.6.1
|
| 402 |
-
- triton==3.4.0
|
| 403 |
-
- typer==0.20.0
|
| 404 |
-
- types-python-dateutil==2.9.0.20250809
|
| 405 |
-
- typing-extensions==4.15.0
|
| 406 |
-
- typing-inspection==0.4.1
|
| 407 |
-
- tzdata==2025.2
|
| 408 |
-
- uc-micro-py==1.0.3
|
| 409 |
-
- ucxx-cu12==0.47.0
|
| 410 |
-
- ujson==5.11.0
|
| 411 |
-
- umap-learn==0.5.9.post2
|
| 412 |
-
- uri-template==1.3.0
|
| 413 |
-
- urllib3==2.5.0
|
| 414 |
-
- uvicorn==0.38.0
|
| 415 |
-
- vampnet==0.1.4.dev16+gc88ed3f0f.d20251028
|
| 416 |
-
- wandb==0.21.1
|
| 417 |
-
- wcwidth==0.2.13
|
| 418 |
-
- webcolors==24.11.1
|
| 419 |
-
- webencodings==0.5.1
|
| 420 |
-
- websocket-client==1.8.0
|
| 421 |
-
- websockets==15.0.1
|
| 422 |
-
- werkzeug==3.1.3
|
| 423 |
-
- whatthepatch==1.0.7
|
| 424 |
-
- wrapt==1.17.3
|
| 425 |
-
- xarray==2025.6.1
|
| 426 |
-
- xgboost==3.0.4
|
| 427 |
-
- xxhash==3.5.0
|
| 428 |
-
- xyzservices==2025.11.0
|
| 429 |
-
- yapf==0.43.0
|
| 430 |
-
- yarl==1.20.1
|
| 431 |
-
- zict==3.0.0
|
| 432 |
-
- zipp==3.23.0
|
| 433 |
-
- zstd==1.5.7.2
|
| 434 |
-
prefix: /vast/projects/pranam/lab/yz927/envs/metal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
CHANGED
|
@@ -965,7 +965,7 @@ class PeptiVersePredictor:
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
predictor = PeptiVersePredictor(
|
| 968 |
-
manifest_path="
|
| 969 |
classifier_weight_root="./"
|
| 970 |
)
|
| 971 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
|
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
predictor = PeptiVersePredictor(
|
| 968 |
+
manifest_path="basic_models.txt",
|
| 969 |
classifier_weight_root="./"
|
| 970 |
)
|
| 971 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py
DELETED
|
@@ -1,398 +0,0 @@
|
|
| 1 |
-
import collections
|
| 2 |
-
import logging
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import codecs
|
| 6 |
-
import unicodedata
|
| 7 |
-
from typing import List, Optional
|
| 8 |
-
from transformers import PreTrainedTokenizer
|
| 9 |
-
from SmilesPE.tokenizer import SPE_Tokenizer
|
| 10 |
-
|
| 11 |
-
def load_vocab(vocab_file):
|
| 12 |
-
"""Loads a vocabulary file into a dictionary."""
|
| 13 |
-
vocab = collections.OrderedDict()
|
| 14 |
-
with open(vocab_file, "r", encoding="utf-8") as reader:
|
| 15 |
-
tokens = reader.readlines()
|
| 16 |
-
for index, token in enumerate(tokens):
|
| 17 |
-
token = token.rstrip("\n")
|
| 18 |
-
vocab[token] = index
|
| 19 |
-
return vocab
|
| 20 |
-
|
| 21 |
-
class Atomwise_Tokenizer(object):
|
| 22 |
-
"""Run atom-level SMILES tokenization"""
|
| 23 |
-
|
| 24 |
-
def __init__(self):
|
| 25 |
-
""" Constructs a atom-level Tokenizer.
|
| 26 |
-
"""
|
| 27 |
-
# self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
|
| 28 |
-
self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
|
| 29 |
-
|
| 30 |
-
self.regex = re.compile(self.regex_pattern)
|
| 31 |
-
|
| 32 |
-
def tokenize(self, text):
|
| 33 |
-
""" Basic Tokenization of a SMILES.
|
| 34 |
-
"""
|
| 35 |
-
tokens = [token for token in self.regex.findall(text)]
|
| 36 |
-
return tokens
|
| 37 |
-
|
| 38 |
-
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
|
| 39 |
-
r"""
|
| 40 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 41 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 42 |
-
should refer to the superclass for more information regarding methods.
|
| 43 |
-
Args:
|
| 44 |
-
vocab_file (:obj:`string`):
|
| 45 |
-
File containing the vocabulary.
|
| 46 |
-
spe_file (:obj:`string`):
|
| 47 |
-
File containing the trained SMILES Pair Encoding vocabulary.
|
| 48 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 49 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 50 |
-
token instead.
|
| 51 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 52 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 53 |
-
for sequence classification or for a text and a question for question answering.
|
| 54 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 55 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 56 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 57 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 58 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 59 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 60 |
-
special tokens.
|
| 61 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 62 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 63 |
-
modeling. This is the token which the model will try to predict.
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
def __init__(self, vocab_file, spe_file,
|
| 67 |
-
unk_token="[UNK]",
|
| 68 |
-
sep_token="[SEP]",
|
| 69 |
-
pad_token="[PAD]",
|
| 70 |
-
cls_token="[CLS]",
|
| 71 |
-
mask_token="[MASK]",
|
| 72 |
-
**kwargs):
|
| 73 |
-
if not os.path.isfile(vocab_file):
|
| 74 |
-
raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
|
| 75 |
-
if not os.path.isfile(spe_file):
|
| 76 |
-
raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
|
| 77 |
-
|
| 78 |
-
self.vocab = load_vocab(vocab_file)
|
| 79 |
-
self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
|
| 80 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 81 |
-
self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
|
| 82 |
-
|
| 83 |
-
super().__init__(
|
| 84 |
-
unk_token=unk_token,
|
| 85 |
-
sep_token=sep_token,
|
| 86 |
-
pad_token=pad_token,
|
| 87 |
-
cls_token=cls_token,
|
| 88 |
-
mask_token=mask_token,
|
| 89 |
-
**kwargs)
|
| 90 |
-
|
| 91 |
-
@property
|
| 92 |
-
def vocab_size(self):
|
| 93 |
-
return len(self.vocab)
|
| 94 |
-
|
| 95 |
-
def get_vocab(self):
|
| 96 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 97 |
-
|
| 98 |
-
def _tokenize(self, text):
|
| 99 |
-
return self.spe_tokenizer.tokenize(text).split(' ')
|
| 100 |
-
|
| 101 |
-
def _convert_token_to_id(self, token):
|
| 102 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 103 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 104 |
-
|
| 105 |
-
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
|
| 106 |
-
text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
| 107 |
-
return self.convert_tokens_to_string(text)
|
| 108 |
-
|
| 109 |
-
def _convert_id_to_token(self, index):
|
| 110 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 111 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 112 |
-
|
| 113 |
-
def convert_tokens_to_string(self, tokens):
|
| 114 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 115 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 116 |
-
return out_string
|
| 117 |
-
|
| 118 |
-
def build_inputs_with_special_tokens(
|
| 119 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 120 |
-
) -> List[int]:
|
| 121 |
-
"""
|
| 122 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 123 |
-
by concatenating and adding special tokens.
|
| 124 |
-
A BERT sequence has the following format:
|
| 125 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 126 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 127 |
-
Args:
|
| 128 |
-
token_ids_0 (:obj:`List[int]`):
|
| 129 |
-
List of IDs to which the special tokens will be added
|
| 130 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 131 |
-
Optional second list of IDs for sequence pairs.
|
| 132 |
-
Returns:
|
| 133 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 134 |
-
"""
|
| 135 |
-
if token_ids_1 is None:
|
| 136 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 137 |
-
cls = [self.cls_token_id]
|
| 138 |
-
sep = [self.sep_token_id]
|
| 139 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 140 |
-
|
| 141 |
-
def get_special_tokens_mask(
|
| 142 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 143 |
-
) -> List[int]:
|
| 144 |
-
"""
|
| 145 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 146 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 147 |
-
Args:
|
| 148 |
-
token_ids_0 (:obj:`List[int]`):
|
| 149 |
-
List of ids.
|
| 150 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 151 |
-
Optional second list of IDs for sequence pairs.
|
| 152 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 153 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 154 |
-
Returns:
|
| 155 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 156 |
-
"""
|
| 157 |
-
|
| 158 |
-
if already_has_special_tokens:
|
| 159 |
-
if token_ids_1 is not None:
|
| 160 |
-
raise ValueError(
|
| 161 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 162 |
-
"ids is already formated with special tokens for the model."
|
| 163 |
-
)
|
| 164 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 165 |
-
|
| 166 |
-
if token_ids_1 is not None:
|
| 167 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 168 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 169 |
-
|
| 170 |
-
def create_token_type_ids_from_sequences(
|
| 171 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 172 |
-
) -> List[int]:
|
| 173 |
-
"""
|
| 174 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 175 |
-
A BERT sequence pair mask has the following format:
|
| 176 |
-
::
|
| 177 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 178 |
-
| first sequence | second sequence |
|
| 179 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 180 |
-
Args:
|
| 181 |
-
token_ids_0 (:obj:`List[int]`):
|
| 182 |
-
List of ids.
|
| 183 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 184 |
-
Optional second list of IDs for sequence pairs.
|
| 185 |
-
Returns:
|
| 186 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 187 |
-
sequence(s).
|
| 188 |
-
"""
|
| 189 |
-
sep = [self.sep_token_id]
|
| 190 |
-
cls = [self.cls_token_id]
|
| 191 |
-
if token_ids_1 is None:
|
| 192 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 193 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 194 |
-
|
| 195 |
-
def save_vocabulary(self, vocab_path):
|
| 196 |
-
"""
|
| 197 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 198 |
-
Args:
|
| 199 |
-
vocab_path (:obj:`str`):
|
| 200 |
-
The directory in which to save the vocabulary.
|
| 201 |
-
Returns:
|
| 202 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 203 |
-
"""
|
| 204 |
-
index = 0
|
| 205 |
-
if os.path.isdir(vocab_path):
|
| 206 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 207 |
-
else:
|
| 208 |
-
vocab_file = vocab_path
|
| 209 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 210 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 211 |
-
if index != token_index:
|
| 212 |
-
logger.warning(
|
| 213 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 214 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 215 |
-
)
|
| 216 |
-
index = token_index
|
| 217 |
-
writer.write(token + "\n")
|
| 218 |
-
index += 1
|
| 219 |
-
return (vocab_file,)
|
| 220 |
-
|
| 221 |
-
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
|
| 222 |
-
r"""
|
| 223 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 224 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 225 |
-
should refer to the superclass for more information regarding methods.
|
| 226 |
-
Args:
|
| 227 |
-
vocab_file (:obj:`string`):
|
| 228 |
-
File containing the vocabulary.
|
| 229 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 230 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 231 |
-
token instead.
|
| 232 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 233 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 234 |
-
for sequence classification or for a text and a question for question answering.
|
| 235 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 236 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 237 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 238 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 239 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 240 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 241 |
-
special tokens.
|
| 242 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 243 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 244 |
-
modeling. This is the token which the model will try to predict.
|
| 245 |
-
"""
|
| 246 |
-
|
| 247 |
-
def __init__(
|
| 248 |
-
self,
|
| 249 |
-
vocab_file,
|
| 250 |
-
unk_token="[UNK]",
|
| 251 |
-
sep_token="[SEP]",
|
| 252 |
-
pad_token="[PAD]",
|
| 253 |
-
cls_token="[CLS]",
|
| 254 |
-
mask_token="[MASK]",
|
| 255 |
-
**kwargs
|
| 256 |
-
):
|
| 257 |
-
super().__init__(
|
| 258 |
-
unk_token=unk_token,
|
| 259 |
-
sep_token=sep_token,
|
| 260 |
-
pad_token=pad_token,
|
| 261 |
-
cls_token=cls_token,
|
| 262 |
-
mask_token=mask_token,
|
| 263 |
-
**kwargs,
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
if not os.path.isfile(vocab_file):
|
| 267 |
-
raise ValueError(
|
| 268 |
-
"Can't find a vocabulary file at path '{}'.".format(vocab_file)
|
| 269 |
-
)
|
| 270 |
-
self.vocab = load_vocab(vocab_file)
|
| 271 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 272 |
-
self.tokenizer = Atomwise_Tokenizer()
|
| 273 |
-
|
| 274 |
-
@property
|
| 275 |
-
def vocab_size(self):
|
| 276 |
-
return len(self.vocab)
|
| 277 |
-
|
| 278 |
-
def get_vocab(self):
|
| 279 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 280 |
-
|
| 281 |
-
def _tokenize(self, text):
|
| 282 |
-
return self.tokenizer.tokenize(text)
|
| 283 |
-
|
| 284 |
-
def _convert_token_to_id(self, token):
|
| 285 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 286 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 287 |
-
|
| 288 |
-
def _convert_id_to_token(self, index):
|
| 289 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 290 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 291 |
-
|
| 292 |
-
def convert_tokens_to_string(self, tokens):
|
| 293 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 294 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 295 |
-
return out_string
|
| 296 |
-
|
| 297 |
-
def build_inputs_with_special_tokens(
|
| 298 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 299 |
-
) -> List[int]:
|
| 300 |
-
"""
|
| 301 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 302 |
-
by concatenating and adding special tokens.
|
| 303 |
-
A BERT sequence has the following format:
|
| 304 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 305 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 306 |
-
Args:
|
| 307 |
-
token_ids_0 (:obj:`List[int]`):
|
| 308 |
-
List of IDs to which the special tokens will be added
|
| 309 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 310 |
-
Optional second list of IDs for sequence pairs.
|
| 311 |
-
Returns:
|
| 312 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 313 |
-
"""
|
| 314 |
-
if token_ids_1 is None:
|
| 315 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 316 |
-
cls = [self.cls_token_id]
|
| 317 |
-
sep = [self.sep_token_id]
|
| 318 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 319 |
-
|
| 320 |
-
def get_special_tokens_mask(
|
| 321 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 322 |
-
) -> List[int]:
|
| 323 |
-
"""
|
| 324 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 325 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 326 |
-
Args:
|
| 327 |
-
token_ids_0 (:obj:`List[int]`):
|
| 328 |
-
List of ids.
|
| 329 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 330 |
-
Optional second list of IDs for sequence pairs.
|
| 331 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 332 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 333 |
-
Returns:
|
| 334 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 335 |
-
"""
|
| 336 |
-
|
| 337 |
-
if already_has_special_tokens:
|
| 338 |
-
if token_ids_1 is not None:
|
| 339 |
-
raise ValueError(
|
| 340 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 341 |
-
"ids is already formated with special tokens for the model."
|
| 342 |
-
)
|
| 343 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 344 |
-
|
| 345 |
-
if token_ids_1 is not None:
|
| 346 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 347 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 348 |
-
|
| 349 |
-
def create_token_type_ids_from_sequences(
|
| 350 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 351 |
-
) -> List[int]:
|
| 352 |
-
"""
|
| 353 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 354 |
-
A BERT sequence pair mask has the following format:
|
| 355 |
-
::
|
| 356 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 357 |
-
| first sequence | second sequence |
|
| 358 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 359 |
-
Args:
|
| 360 |
-
token_ids_0 (:obj:`List[int]`):
|
| 361 |
-
List of ids.
|
| 362 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 363 |
-
Optional second list of IDs for sequence pairs.
|
| 364 |
-
Returns:
|
| 365 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 366 |
-
sequence(s).
|
| 367 |
-
"""
|
| 368 |
-
sep = [self.sep_token_id]
|
| 369 |
-
cls = [self.cls_token_id]
|
| 370 |
-
if token_ids_1 is None:
|
| 371 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 372 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 373 |
-
|
| 374 |
-
def save_vocabulary(self, vocab_path):
|
| 375 |
-
"""
|
| 376 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 377 |
-
Args:
|
| 378 |
-
vocab_path (:obj:`str`):
|
| 379 |
-
The directory in which to save the vocabulary.
|
| 380 |
-
Returns:
|
| 381 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 382 |
-
"""
|
| 383 |
-
index = 0
|
| 384 |
-
if os.path.isdir(vocab_path):
|
| 385 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 386 |
-
else:
|
| 387 |
-
vocab_file = vocab_path
|
| 388 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 389 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 390 |
-
if index != token_index:
|
| 391 |
-
logger.warning(
|
| 392 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 393 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 394 |
-
)
|
| 395 |
-
index = token_index
|
| 396 |
-
writer.write(token + "\n")
|
| 397 |
-
index += 1
|
| 398 |
-
return (vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
DELETED
|
Binary file (15.5 kB)
|
|
|