ynuozhang commited on
Commit ·
e66c5e2
1
Parent(s): 78e29df
add light install
Browse files- README.md +14 -0
- basic_models.txt +10 -0
- download_light.py +145 -0
- environment.yml +0 -434
- inference.py +1 -1
- tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py +0 -398
- tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
README.md
CHANGED
|
@@ -27,7 +27,21 @@ This is the repository for [PeptiVerse: A Unified Platform for Therapeutic Pepti
|
|
| 27 |
- [Citation](#citation)
|
| 28 |
|
| 29 |
## Quick Start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
```bash
|
| 32 |
# Clone repository
|
| 33 |
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
|
|
|
| 27 |
- [Citation](#citation)
|
| 28 |
|
| 29 |
## Quick Start
|
| 30 |
+
- Lightweight start (basic models, no cuML; read below for details)
|
| 31 |
+
```bash
|
| 32 |
+
# Ignore all LFS files
|
| 33 |
+
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
| 34 |
+
|
| 35 |
+
# Install basic pkgs
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
|
| 38 |
+
# Download the basic model weights listed in basic_models.txt. Adjust the config as needed.
|
| 39 |
+
python download_light.py
|
| 40 |
+
|
| 41 |
+
# Test in inference
|
| 42 |
+
python inference.py
|
| 43 |
+
```
|
| 44 |
+
- Full model clone (will clone all model weights)
|
| 45 |
```bash
|
| 46 |
# Clone repository
|
| 47 |
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
basic_models.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
| 2 |
+
Hemolysis, XGB, Transformer, Classifier, 0.2521, 0.4343,
|
| 3 |
+
Non-Fouling, MLP, XGB, Classifier, 0.57, 0.6969,
|
| 4 |
+
Solubility, CNN, -, Classifier, 0.377, -,
|
| 5 |
+
Permeability (Penetrance), XGB, -, Classifier, 0.5493, -,
|
| 6 |
+
Toxicity, -, Transformer, Classifier, -, 0.3401,
|
| 7 |
+
Binding_affinity, unpooled, unpooled, Regression, -, -,
|
| 8 |
+
Permeability_PAMPA, -, CNN, Regression, -, -,
|
| 9 |
+
Permeability_CACO2, -, SVR, Regression, -, -,
|
| 10 |
+
Halflife, Transformer, XGB, Regression, -, -,
|
download_light.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from huggingface_hub import snapshot_download
|
| 9 |
+
from inference import (
|
| 10 |
+
PeptiVersePredictor,
|
| 11 |
+
read_best_manifest_csv,
|
| 12 |
+
canon_model,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# -----------------------------
# Config
# -----------------------------
# Hugging Face repository that hosts the PeptiVerse model weights.
MODEL_REPO = "ChatterjeeLab/PeptiVerse"
DEFAULT_ASSETS_DIR = Path("./")  # where downloaded models live
# CSV manifest listing the best model per property (shipped as basic_models.txt).
DEFAULT_MANIFEST = Path("./basic_models.txt")

# Model families excluded from the light download; build_allow_patterns_from_manifest
# substitutes "xgb" for any of these.
# NOTE(review): presumably excluded because they require cuML/GPU wheels — confirm.
BANNED_MODELS = {"svm", "enet", "svm_gpu", "enet_gpu"}
|
| 25 |
+
def _norm_prop_disk(prop_key: str) -> str:
|
| 26 |
+
return "half_life" if prop_key == "halflife" else prop_key
|
| 27 |
+
|
| 28 |
+
def _resolve_expected_model_dir(prop_key: str, model_name: str, mode: str) -> str:
    """Return the repo-relative folder holding a property's model artifacts.

    ``prop_key`` is the manifest property key, ``model_name`` the canonical
    model label, and ``mode`` either "wt" or "smiles". Most properties use
    the ``training_classifiers/<prop>/<model>_<mode>`` layout; binding
    affinity and half-life have bespoke folder names handled below.
    """
    base = f"training_classifiers/{_norm_prop_disk(prop_key)}"

    # Binding affinity is special: its manifest label is "pooled"/"unpooled"
    # and the folder is named wt_<mode>_<pooled|unpooled>.
    if prop_key == "binding_affinity":
        return f"{base}/wt_{mode}_{model_name}"

    # Half-life folders do not follow the <model>_<mode> convention.
    if prop_key == "halflife":
        if model_name in ("xgb_wt_log", "xgb_smiles"):
            return f"{base}/{model_name}"
        if model_name == "transformer" and mode == "wt":
            return f"{base}/transformer_wt_log"
        if model_name == "xgb":
            folder = "xgb_wt_log" if mode == "wt" else "xgb_smiles"
            return f"{base}/{folder}"

    # Default layout shared by the remaining properties.
    return f"{base}/{model_name}_{mode}"
| 49 |
+
def build_allow_patterns_from_manifest(manifest_path: Path) -> List[str]:
    """Translate the best-model manifest into hf-hub ``allow_patterns``.

    For every property in the manifest, the best wt and smiles models are
    resolved to their repo folders, and only the "basic" artifact files in
    each folder are requested (not the whole folder contents). Duplicates
    are removed while preserving first-seen order.
    """
    manifest = read_best_manifest_csv(manifest_path)

    patterns: List[str] = []
    for prop_key, row in manifest.items():
        for mode, label in (("wt", row.best_wt), ("smiles", row.best_smiles)):
            model = canon_model(label)
            if model is None:
                continue

            # Models not shipped in the light install fall back to XGB
            # (presumably the banned families need cuML/GPU — confirm).
            if model in BANNED_MODELS:
                model = "xgb"

            folder = _resolve_expected_model_dir(prop_key, model, mode)

            # Only the lightweight artifacts, not everything in the folder.
            patterns.extend(
                f"{folder}/{artifact}"
                for artifact in (
                    "best_model.json",
                    "best_model.pt",
                    "best_model*.joblib",
                    "best_model*.json",
                )
            )

    # dict.fromkeys deduplicates while keeping insertion order.
    return list(dict.fromkeys(patterns))
| 82 |
+
|
| 83 |
+
def download_assets(
    repo_id: str,
    manifest_path: Path,
    out_dir: Path,
) -> Path:
    """Mirror the manifest-selected model artifacts from the Hub.

    Resolves and creates ``out_dir``, then snapshots only the files whose
    paths match the manifest-derived allow patterns. Returns the resolved
    output directory containing the downloaded weights.
    """
    target = out_dir.resolve()
    target.mkdir(parents=True, exist_ok=True)

    # Restrict the snapshot to exactly the artifacts the manifest asks for.
    patterns = build_allow_patterns_from_manifest(manifest_path)

    snapshot_download(
        repo_id=repo_id,
        local_dir=str(target),
        # Materialize real files instead of symlinks into the HF cache.
        local_dir_use_symlinks=False,
        allow_patterns=patterns,
    )
    return target
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# -----------------------------
|
| 103 |
+
# Main
|
| 104 |
+
# -----------------------------
|
| 105 |
+
def main():
    """CLI entry point: download the basic model weights listed in the manifest.

    Parses the command line, validates that the manifest file exists, and
    mirrors the selected artifacts from the Hub into the assets directory.
    The prediction-related flags (--property, --mode, --input, --target_seq,
    --binder, --device) are accepted so the optional smoke-test snippet at
    the bottom can be re-enabled without changing the CLI.

    Raises:
        FileNotFoundError: if the manifest path does not exist.
    """
    import argparse

    ap = argparse.ArgumentParser(description="Lightweight PeptiVerse inference with on-demand model download.")
    ap.add_argument("--repo", default=MODEL_REPO, help="HF repo id containing weights/assets.")
    # Fix: help text previously said "best_models.txt", but the shipped
    # manifest (and DEFAULT_MANIFEST) is basic_models.txt.
    ap.add_argument("--manifest", default=str(DEFAULT_MANIFEST), help="Path to basic_models.txt")
    ap.add_argument("--assets", default=str(DEFAULT_ASSETS_DIR), help="Where to store downloaded assets")
    ap.add_argument("--device", default=None, help="cuda / cpu / cuda:0, etc")

    ap.add_argument("--property", default="hemolysis", help="Property key (e.g. hemolysis, solubility, ...)")
    ap.add_argument("--mode", default="wt", choices=["wt", "smiles"], help="Input type: wt=AA sequence, smiles=SMILES")
    ap.add_argument("--input", default="GIGAVLKVLTTGLPALISWIKRKRQQ", help="Sequence or SMILES string")
    ap.add_argument("--target_seq", default=None, help="Target WT sequence for binding_affinity")
    ap.add_argument("--binder", default=None, help="Binder string (AA or SMILES) for binding_affinity")
    args = ap.parse_args()

    manifest_path = Path(args.manifest)
    if not manifest_path.exists():
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")

    assets_dir = download_assets(args.repo, manifest_path=manifest_path, out_dir=Path(args.assets))

    # OPTIONAL TEST CODE — uncomment to run a prediction after downloading.
    # (Was a bare triple-quoted string; as comments it is no longer built
    # and discarded at runtime.)
    # predictor = PeptiVersePredictor(
    #     manifest_path="basic_models.txt",  # use the downloaded copy to be consistent
    #     classifier_weight_root=str(assets_dir),
    #     device=args.device,
    # )
    # if args.property == "binding_affinity":
    #     if not args.target_seq or not args.binder:
    #         raise ValueError("For binding_affinity, provide --target_seq and --binder.")
    #     out = predictor.predict_binding_affinity(args.mode, target_seq=args.target_seq, binder_str=args.binder)
    # else:
    #     out = predictor.predict_property(args.property, args.mode, args.input)
    # print(out)


if __name__ == "__main__":
    main()
|
environment.yml
DELETED
|
@@ -1,434 +0,0 @@
|
|
| 1 |
-
name: metal
|
| 2 |
-
channels:
|
| 3 |
-
- conda-forge
|
| 4 |
-
- omnia
|
| 5 |
-
- defaults
|
| 6 |
-
dependencies:
|
| 7 |
-
- _libgcc_mutex=0.1=main
|
| 8 |
-
- _openmp_mutex=5.1=1_gnu
|
| 9 |
-
- bzip2=1.0.8=h5eee18b_6
|
| 10 |
-
- ca-certificates=2025.11.12=hbd8a1cb_0
|
| 11 |
-
- certifi=2025.11.12=pyhd8ed1ab_0
|
| 12 |
-
- expat=2.7.1=h6a678d5_0
|
| 13 |
-
- git-lfs=3.7.1=h6138981_0
|
| 14 |
-
- ld_impl_linux-64=2.40=h12ee557_0
|
| 15 |
-
- libffi=3.4.4=h6a678d5_1
|
| 16 |
-
- libgcc-ng=11.2.0=h1234567_1
|
| 17 |
-
- libgomp=11.2.0=h1234567_1
|
| 18 |
-
- libstdcxx-ng=11.2.0=h1234567_1
|
| 19 |
-
- libuuid=1.41.5=h5eee18b_0
|
| 20 |
-
- libxcb=1.17.0=h9b100fa_0
|
| 21 |
-
- ncurses=6.5=h7934f7d_0
|
| 22 |
-
- openssl=3.0.17=h5eee18b_0
|
| 23 |
-
- pip=25.1=pyhc872135_2
|
| 24 |
-
- pthread-stubs=0.3=h0ce48e5_1
|
| 25 |
-
- python=3.10.18=h1a3bd86_0
|
| 26 |
-
- readline=8.3=hc2a1206_0
|
| 27 |
-
- ripgrep=13.0.0=h2f28480_2
|
| 28 |
-
- sqlite=3.50.2=hb25bd0a_1
|
| 29 |
-
- tk=8.6.15=h54e0aa7_0
|
| 30 |
-
- wheel=0.45.1=py310h06a4308_0
|
| 31 |
-
- xorg-libx11=1.8.12=h9b100fa_1
|
| 32 |
-
- xorg-libxau=1.0.12=h9b100fa_0
|
| 33 |
-
- xorg-libxdmcp=1.1.5=h9b100fa_0
|
| 34 |
-
- xorg-xorgproto=2024.1=h5eee18b_1
|
| 35 |
-
- xz=5.6.4=h5eee18b_1
|
| 36 |
-
- zlib=1.2.13=h5eee18b_1
|
| 37 |
-
- pip:
|
| 38 |
-
- about-time==4.2.1
|
| 39 |
-
- absl-py==2.3.1
|
| 40 |
-
- accelerate==1.10.0
|
| 41 |
-
- aiofiles==23.2.1
|
| 42 |
-
- aiohappyeyeballs==2.6.1
|
| 43 |
-
- aiohttp==3.12.15
|
| 44 |
-
- aiosignal==1.4.0
|
| 45 |
-
- alembic==1.17.2
|
| 46 |
-
- alive-progress==3.3.0
|
| 47 |
-
- anndata==0.11.4
|
| 48 |
-
- annotated-doc==0.0.3
|
| 49 |
-
- annotated-types==0.7.0
|
| 50 |
-
- antlr4-python3-runtime==4.9.3
|
| 51 |
-
- anyio==4.10.0
|
| 52 |
-
- argon2-cffi==25.1.0
|
| 53 |
-
- argon2-cffi-bindings==25.1.0
|
| 54 |
-
- array-api-compat==1.12.0
|
| 55 |
-
- arrow==1.3.0
|
| 56 |
-
- ase==3.26.0
|
| 57 |
-
- astroid==3.3.11
|
| 58 |
-
- asttokens==3.0.0
|
| 59 |
-
- astunparse==1.6.3
|
| 60 |
-
- async-lru==2.0.5
|
| 61 |
-
- async-timeout==5.0.1
|
| 62 |
-
- attrs==25.3.0
|
| 63 |
-
- autograd==1.8.0
|
| 64 |
-
- autopep8==2.0.4
|
| 65 |
-
- babel==2.17.0
|
| 66 |
-
- beautifulsoup4==4.13.4
|
| 67 |
-
- biopython==1.85
|
| 68 |
-
- biotite==0.41.2
|
| 69 |
-
- biotraj==1.2.2
|
| 70 |
-
- black==25.1.0
|
| 71 |
-
- bleach==6.2.0
|
| 72 |
-
- blosum==2.0.3
|
| 73 |
-
- bokeh==3.6.3
|
| 74 |
-
- brotli==1.2.0
|
| 75 |
-
- cachetools==6.2.3
|
| 76 |
-
- cffi==1.17.1
|
| 77 |
-
- cftime==1.6.4.post1
|
| 78 |
-
- charset-normalizer==3.4.3
|
| 79 |
-
- click==8.2.1
|
| 80 |
-
- cloudpathlib==0.23.0
|
| 81 |
-
- cloudpickle==3.1.2
|
| 82 |
-
- cma==4.4.0
|
| 83 |
-
- colorama==0.4.6
|
| 84 |
-
- colorcet==3.1.0
|
| 85 |
-
- colorlog==6.10.1
|
| 86 |
-
- comm==0.2.3
|
| 87 |
-
- contourpy==1.3.2
|
| 88 |
-
- cucim-cu12==25.12.0
|
| 89 |
-
- cuda-bindings==12.9.4
|
| 90 |
-
- cuda-core==0.3.2
|
| 91 |
-
- cuda-pathfinder==1.3.3
|
| 92 |
-
- cuda-python==12.9.4
|
| 93 |
-
- cuda-toolkit==12.9.1
|
| 94 |
-
- cudf-cu12==25.12.0
|
| 95 |
-
- cugraph-cu12==25.12.2
|
| 96 |
-
- cuml-cu12==25.12.0
|
| 97 |
-
- cupy-cuda12x==13.6.0
|
| 98 |
-
- cuvs-cu12==25.12.0
|
| 99 |
-
- cuxfilter-cu12==25.12.0
|
| 100 |
-
- cycler==0.12.1
|
| 101 |
-
- dask==2025.9.1
|
| 102 |
-
- dask-cuda==25.12.0
|
| 103 |
-
- dask-cudf-cu12==25.12.0
|
| 104 |
-
- datasets==4.0.0
|
| 105 |
-
- datashader==0.18.2
|
| 106 |
-
- debugpy==1.8.16
|
| 107 |
-
- decorator==5.2.1
|
| 108 |
-
- deeptime==0.4.5
|
| 109 |
-
- defusedxml==0.7.1
|
| 110 |
-
- deprecated==1.2.18
|
| 111 |
-
- dill==0.3.8
|
| 112 |
-
- distributed==2025.9.1
|
| 113 |
-
- distributed-ucxx-cu12==0.47.0
|
| 114 |
-
- docstring-to-markdown==0.17
|
| 115 |
-
- einops==0.8.1
|
| 116 |
-
- exceptiongroup==1.3.0
|
| 117 |
-
- executing==2.2.0
|
| 118 |
-
- fair-esm==2.0.0
|
| 119 |
-
- fastapi==0.121.0
|
| 120 |
-
- fastjsonschema==2.21.2
|
| 121 |
-
- fastprogress==1.0.3
|
| 122 |
-
- fastrlock==0.8.3
|
| 123 |
-
- ffmpy==0.6.4
|
| 124 |
-
- filelock==3.19.1
|
| 125 |
-
- flake8==7.1.2
|
| 126 |
-
- flatbuffers==25.9.23
|
| 127 |
-
- fonttools==4.60.0
|
| 128 |
-
- fqdn==1.5.1
|
| 129 |
-
- frozenlist==1.7.0
|
| 130 |
-
- fsspec==2025.3.0
|
| 131 |
-
- gast==0.6.0
|
| 132 |
-
- gensim==4.4.0
|
| 133 |
-
- geopandas==1.1.1
|
| 134 |
-
- gitdb==4.0.12
|
| 135 |
-
- gitpython==3.1.45
|
| 136 |
-
- google-pasta==0.2.0
|
| 137 |
-
- gradio==5.20.1
|
| 138 |
-
- gradio-client==1.7.2
|
| 139 |
-
- graphemeu==0.7.2
|
| 140 |
-
- greenlet==3.3.0
|
| 141 |
-
- groovy==0.1.2
|
| 142 |
-
- grpcio==1.76.0
|
| 143 |
-
- h11==0.16.0
|
| 144 |
-
- h5py==3.14.0
|
| 145 |
-
- hf-xet==1.1.8
|
| 146 |
-
- holoviews==1.20.2
|
| 147 |
-
- httpcore==1.0.9
|
| 148 |
-
- httpx==0.28.1
|
| 149 |
-
- huggingface-hub==0.35.3
|
| 150 |
-
- humanfriendly==10.0
|
| 151 |
-
- hydra-core==1.3.2
|
| 152 |
-
- idna==3.10
|
| 153 |
-
- igraph==1.0.0
|
| 154 |
-
- imageio==2.37.0
|
| 155 |
-
- importlib-metadata==8.7.0
|
| 156 |
-
- inquirerpy==0.3.4
|
| 157 |
-
- ipykernel==6.30.1
|
| 158 |
-
- ipython==8.37.0
|
| 159 |
-
- isoduration==20.11.0
|
| 160 |
-
- isort==6.0.1
|
| 161 |
-
- iterative-stratification==0.1.9
|
| 162 |
-
- jedi==0.19.2
|
| 163 |
-
- jinja2==3.1.6
|
| 164 |
-
- joblib==1.5.1
|
| 165 |
-
- json5==0.12.1
|
| 166 |
-
- jsonpointer==3.0.0
|
| 167 |
-
- jsonschema==4.25.1
|
| 168 |
-
- jsonschema-specifications==2025.4.1
|
| 169 |
-
- jupyter-client==8.6.3
|
| 170 |
-
- jupyter-core==5.8.1
|
| 171 |
-
- jupyter-events==0.12.0
|
| 172 |
-
- jupyter-lsp==2.2.6
|
| 173 |
-
- jupyter-server==2.16.0
|
| 174 |
-
- jupyter-server-mathjax==0.2.6
|
| 175 |
-
- jupyter-server-proxy==4.4.0
|
| 176 |
-
- jupyter-server-terminals==0.5.3
|
| 177 |
-
- jupyterlab==4.4.6
|
| 178 |
-
- jupyterlab-code-formatter==3.0.2
|
| 179 |
-
- jupyterlab-git==0.51.2
|
| 180 |
-
- jupyterlab-lsp==5.2.0
|
| 181 |
-
- jupyterlab-pygments==0.3.0
|
| 182 |
-
- jupyterlab-search-replace==1.1.0
|
| 183 |
-
- jupyterlab-server==2.27.3
|
| 184 |
-
- jupyterlab-spreadsheet-editor==0.7.2
|
| 185 |
-
- keras==3.12.0
|
| 186 |
-
- kiwisolver==1.4.9
|
| 187 |
-
- lark==1.2.2
|
| 188 |
-
- lazy-loader==0.4
|
| 189 |
-
- lckr-jupyterlab-variableinspector==3.2.4
|
| 190 |
-
- legacy-api-wrap==1.5
|
| 191 |
-
- leidenalg==0.11.0
|
| 192 |
-
- libclang==18.1.1
|
| 193 |
-
- libcudf-cu12==25.12.0
|
| 194 |
-
- libcugraph-cu12==25.12.2
|
| 195 |
-
- libcuml-cu12==25.12.0
|
| 196 |
-
- libcuvs-cu12==25.12.0
|
| 197 |
-
- libkvikio-cu12==25.12.0
|
| 198 |
-
- libraft-cu12==25.12.0
|
| 199 |
-
- librmm-cu12==25.12.0
|
| 200 |
-
- libucx-cu12==1.19.0
|
| 201 |
-
- libucxx-cu12==0.47.0
|
| 202 |
-
- lightning==2.5.5
|
| 203 |
-
- lightning-utilities==0.15.2
|
| 204 |
-
- linearboost==0.1.4
|
| 205 |
-
- linkify-it-py==2.0.3
|
| 206 |
-
- llvmlite==0.44.0
|
| 207 |
-
- locket==1.0.0
|
| 208 |
-
- loguru==0.7.3
|
| 209 |
-
- mako==1.3.10
|
| 210 |
-
- markdown==3.9
|
| 211 |
-
- markdown-it-py==4.0.0
|
| 212 |
-
- markupsafe==2.1.5
|
| 213 |
-
- matplotlib==3.10.6
|
| 214 |
-
- matplotlib-inline==0.1.7
|
| 215 |
-
- mccabe==0.7.0
|
| 216 |
-
- mdit-py-plugins==0.5.0
|
| 217 |
-
- mdshare==0.4.2
|
| 218 |
-
- mdtraj==1.10.3
|
| 219 |
-
- mdurl==0.1.2
|
| 220 |
-
- mistune==3.1.3
|
| 221 |
-
- ml-dtypes==0.5.3
|
| 222 |
-
- mpmath==1.3.0
|
| 223 |
-
- msgpack==1.1.2
|
| 224 |
-
- msgpack-numpy==0.4.8
|
| 225 |
-
- multidict==6.6.4
|
| 226 |
-
- multipledispatch==1.0.0
|
| 227 |
-
- multiprocess==0.70.16
|
| 228 |
-
- mypy-extensions==1.1.0
|
| 229 |
-
- namex==0.1.0
|
| 230 |
-
- natsort==8.4.0
|
| 231 |
-
- nbclient==0.10.2
|
| 232 |
-
- nbconvert==7.16.6
|
| 233 |
-
- nbdime==4.0.2
|
| 234 |
-
- nbformat==5.10.4
|
| 235 |
-
- nest-asyncio==1.6.0
|
| 236 |
-
- netcdf4==1.7.2
|
| 237 |
-
- networkx==3.4.2
|
| 238 |
-
- notebook==7.4.5
|
| 239 |
-
- notebook-shim==0.2.4
|
| 240 |
-
- numba==0.61.2
|
| 241 |
-
- numba-cuda==0.19.1
|
| 242 |
-
- numpy==1.26.4
|
| 243 |
-
- nvidia-cublas-cu12==12.8.4.1
|
| 244 |
-
- nvidia-cuda-cccl-cu12==12.9.27
|
| 245 |
-
- nvidia-cuda-cupti-cu12==12.8.90
|
| 246 |
-
- nvidia-cuda-nvcc-cu12==12.9.86
|
| 247 |
-
- nvidia-cuda-nvrtc-cu12==12.8.93
|
| 248 |
-
- nvidia-cuda-runtime-cu12==12.8.90
|
| 249 |
-
- nvidia-cudnn-cu12==9.10.2.21
|
| 250 |
-
- nvidia-cufft-cu12==11.3.3.83
|
| 251 |
-
- nvidia-cufile-cu12==1.13.1.3
|
| 252 |
-
- nvidia-curand-cu12==10.3.9.90
|
| 253 |
-
- nvidia-cusolver-cu12==11.7.3.90
|
| 254 |
-
- nvidia-cusparse-cu12==12.5.8.93
|
| 255 |
-
- nvidia-cusparselt-cu12==0.7.1
|
| 256 |
-
- nvidia-ml-py==13.590.44
|
| 257 |
-
- nvidia-nccl-cu12==2.27.3
|
| 258 |
-
- nvidia-nvimgcodec-cu12==0.6.1.37
|
| 259 |
-
- nvidia-nvjitlink-cu12==12.8.93
|
| 260 |
-
- nvidia-nvtx-cu12==12.8.90
|
| 261 |
-
- nvtx==0.2.14
|
| 262 |
-
- nx-cugraph-cu12==25.12.0
|
| 263 |
-
- omegaconf==2.3.0
|
| 264 |
-
- opt-einsum==3.4.0
|
| 265 |
-
- optree==0.17.0
|
| 266 |
-
- optuna==4.6.0
|
| 267 |
-
- orjson==3.11.4
|
| 268 |
-
- overrides==7.7.0
|
| 269 |
-
- p2smi==1.1.1
|
| 270 |
-
- packaging==25.0
|
| 271 |
-
- pandas==2.3.2
|
| 272 |
-
- pandocfilters==1.5.1
|
| 273 |
-
- panel==1.7.5
|
| 274 |
-
- param==2.3.1
|
| 275 |
-
- paretoflow==0.1.5
|
| 276 |
-
- parso==0.8.4
|
| 277 |
-
- partd==1.4.2
|
| 278 |
-
- pathos==0.3.2
|
| 279 |
-
- pathspec==0.12.1
|
| 280 |
-
- patsy==1.0.2
|
| 281 |
-
- peft==0.17.1
|
| 282 |
-
- pexpect==4.9.0
|
| 283 |
-
- pfzy==0.3.4
|
| 284 |
-
- pillow==11.3.0
|
| 285 |
-
- platformdirs==4.3.8
|
| 286 |
-
- pluggy==1.6.0
|
| 287 |
-
- pox==0.3.6
|
| 288 |
-
- ppft==1.7.7
|
| 289 |
-
- prdc==0.2
|
| 290 |
-
- prometheus-client==0.22.1
|
| 291 |
-
- prompt-toolkit==3.0.51
|
| 292 |
-
- propcache==0.3.2
|
| 293 |
-
- protobuf==6.32.0
|
| 294 |
-
- psutil==7.0.0
|
| 295 |
-
- ptyprocess==0.7.0
|
| 296 |
-
- pure-eval==0.2.3
|
| 297 |
-
- pyarrow==21.0.0
|
| 298 |
-
- pycodestyle==2.12.1
|
| 299 |
-
- pycparser==2.22
|
| 300 |
-
- pyct==0.6.0
|
| 301 |
-
- pydantic==2.11.9
|
| 302 |
-
- pydantic-core==2.33.2
|
| 303 |
-
- pydocstyle==6.3.0
|
| 304 |
-
- pydub==0.25.1
|
| 305 |
-
- pyemma==2.5.12
|
| 306 |
-
- pyflakes==3.2.0
|
| 307 |
-
- pygments==2.19.2
|
| 308 |
-
- pylibcudf-cu12==25.12.0
|
| 309 |
-
- pylibcugraph-cu12==25.12.2
|
| 310 |
-
- pylibraft-cu12==25.12.0
|
| 311 |
-
- pylint==3.3.8
|
| 312 |
-
- pymoo==0.6.1.5
|
| 313 |
-
- pynndescent==0.5.13
|
| 314 |
-
- pyogrio==0.12.1
|
| 315 |
-
- pyparsing==3.2.5
|
| 316 |
-
- pyproj==3.7.1
|
| 317 |
-
- python-dateutil==2.9.0.post0
|
| 318 |
-
- python-json-logger==3.3.0
|
| 319 |
-
- python-lsp-jsonrpc==1.1.2
|
| 320 |
-
- python-lsp-server==1.13.0
|
| 321 |
-
- python-multipart==0.0.20
|
| 322 |
-
- pytoolconfig==1.3.1
|
| 323 |
-
- pytorch-lightning==2.5.5
|
| 324 |
-
- pytorch-lightning-bolts==0.3.2.post1
|
| 325 |
-
- pytorch-metric-learning==2.9.0
|
| 326 |
-
- pytz==2025.2
|
| 327 |
-
- pyviz-comms==3.0.6
|
| 328 |
-
- pyyaml==6.0.2
|
| 329 |
-
- pyzmq==27.0.1
|
| 330 |
-
- raft-dask-cu12==25.12.0
|
| 331 |
-
- rapids-dask-dependency==25.12.0
|
| 332 |
-
- rapids-logger==0.2.3
|
| 333 |
-
- rdkit==2025.9.1
|
| 334 |
-
- referencing==0.36.2
|
| 335 |
-
- regex==2025.7.34
|
| 336 |
-
- requests==2.32.5
|
| 337 |
-
- requests-toolbelt==1.0.0
|
| 338 |
-
- rfc3339-validator==0.1.4
|
| 339 |
-
- rfc3986-validator==0.1.1
|
| 340 |
-
- rfc3987-syntax==1.1.0
|
| 341 |
-
- rich==14.1.0
|
| 342 |
-
- rmm-cu12==25.12.0
|
| 343 |
-
- rope==1.14.0
|
| 344 |
-
- rpds-py==0.27.0
|
| 345 |
-
- ruff==0.14.3
|
| 346 |
-
- safehttpx==0.1.7
|
| 347 |
-
- safetensors==0.6.2
|
| 348 |
-
- scanpy==1.11.5
|
| 349 |
-
- schedulefree==1.4.1
|
| 350 |
-
- scikit-image==0.25.2
|
| 351 |
-
- scikit-learn==1.7.1
|
| 352 |
-
- scipy==1.15.3
|
| 353 |
-
- seaborn==0.13.2
|
| 354 |
-
- semantic-version==2.10.0
|
| 355 |
-
- send2trash==1.8.3
|
| 356 |
-
- sentry-sdk==2.35.0
|
| 357 |
-
- session-info2==0.2.3
|
| 358 |
-
- setuptools==80.9.0
|
| 359 |
-
- shapely==2.0.7
|
| 360 |
-
- shellingham==1.5.4
|
| 361 |
-
- simpervisor==1.0.0
|
| 362 |
-
- six==1.17.0
|
| 363 |
-
- smart-open==7.4.3
|
| 364 |
-
- smilespe==0.0.3
|
| 365 |
-
- smmap==5.0.2
|
| 366 |
-
- sniffio==1.3.1
|
| 367 |
-
- snowballstemmer==3.0.1
|
| 368 |
-
- sortedcontainers==2.4.0
|
| 369 |
-
- soupsieve==2.7
|
| 370 |
-
- sqlalchemy==2.0.45
|
| 371 |
-
- stack-data==0.6.3
|
| 372 |
-
- starlette==0.49.3
|
| 373 |
-
- statsmodels==0.14.5
|
| 374 |
-
- sympy==1.14.0
|
| 375 |
-
- tblib==3.2.2
|
| 376 |
-
- tenacity==9.1.2
|
| 377 |
-
- tensorboard==2.20.0
|
| 378 |
-
- tensorboard-data-server==0.7.2
|
| 379 |
-
- tensorflow==2.20.0
|
| 380 |
-
- termcolor==3.2.0
|
| 381 |
-
- terminado==0.18.1
|
| 382 |
-
- texttable==1.7.0
|
| 383 |
-
- threadpoolctl==3.6.0
|
| 384 |
-
- tifffile==2025.5.10
|
| 385 |
-
- timm==1.0.22
|
| 386 |
-
- tinycss2==1.4.0
|
| 387 |
-
- tokenizers==0.20.3
|
| 388 |
-
- tomli==2.2.1
|
| 389 |
-
- tomlkit==0.13.3
|
| 390 |
-
- toolz==1.1.0
|
| 391 |
-
- torch==2.8.0
|
| 392 |
-
- torch-geometric==2.6.1
|
| 393 |
-
- torchaudio==2.8.0+cu128
|
| 394 |
-
- torchmetrics==1.8.2
|
| 395 |
-
- torchtext==0.18.0
|
| 396 |
-
- torchvision==0.23.0+cu128
|
| 397 |
-
- tornado==6.5.2
|
| 398 |
-
- tqdm==4.67.1
|
| 399 |
-
- traitlets==5.14.3
|
| 400 |
-
- transformers==4.46.0
|
| 401 |
-
- treelite==4.6.1
|
| 402 |
-
- triton==3.4.0
|
| 403 |
-
- typer==0.20.0
|
| 404 |
-
- types-python-dateutil==2.9.0.20250809
|
| 405 |
-
- typing-extensions==4.15.0
|
| 406 |
-
- typing-inspection==0.4.1
|
| 407 |
-
- tzdata==2025.2
|
| 408 |
-
- uc-micro-py==1.0.3
|
| 409 |
-
- ucxx-cu12==0.47.0
|
| 410 |
-
- ujson==5.11.0
|
| 411 |
-
- umap-learn==0.5.9.post2
|
| 412 |
-
- uri-template==1.3.0
|
| 413 |
-
- urllib3==2.5.0
|
| 414 |
-
- uvicorn==0.38.0
|
| 415 |
-
- vampnet==0.1.4.dev16+gc88ed3f0f.d20251028
|
| 416 |
-
- wandb==0.21.1
|
| 417 |
-
- wcwidth==0.2.13
|
| 418 |
-
- webcolors==24.11.1
|
| 419 |
-
- webencodings==0.5.1
|
| 420 |
-
- websocket-client==1.8.0
|
| 421 |
-
- websockets==15.0.1
|
| 422 |
-
- werkzeug==3.1.3
|
| 423 |
-
- whatthepatch==1.0.7
|
| 424 |
-
- wrapt==1.17.3
|
| 425 |
-
- xarray==2025.6.1
|
| 426 |
-
- xgboost==3.0.4
|
| 427 |
-
- xxhash==3.5.0
|
| 428 |
-
- xyzservices==2025.11.0
|
| 429 |
-
- yapf==0.43.0
|
| 430 |
-
- yarl==1.20.1
|
| 431 |
-
- zict==3.0.0
|
| 432 |
-
- zipp==3.23.0
|
| 433 |
-
- zstd==1.5.7.2
|
| 434 |
-
prefix: /vast/projects/pranam/lab/yz927/envs/metal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
CHANGED
|
@@ -965,7 +965,7 @@ class PeptiVersePredictor:
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
predictor = PeptiVersePredictor(
|
| 968 |
-
manifest_path="
|
| 969 |
classifier_weight_root="./"
|
| 970 |
)
|
| 971 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
|
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
predictor = PeptiVersePredictor(
|
| 968 |
+
manifest_path="basic_models.txt",
|
| 969 |
classifier_weight_root="./"
|
| 970 |
)
|
| 971 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py
DELETED
|
@@ -1,398 +0,0 @@
|
|
| 1 |
-
import collections
|
| 2 |
-
import logging
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import codecs
|
| 6 |
-
import unicodedata
|
| 7 |
-
from typing import List, Optional
|
| 8 |
-
from transformers import PreTrainedTokenizer
|
| 9 |
-
from SmilesPE.tokenizer import SPE_Tokenizer
|
| 10 |
-
|
| 11 |
-
def load_vocab(vocab_file):
|
| 12 |
-
"""Loads a vocabulary file into a dictionary."""
|
| 13 |
-
vocab = collections.OrderedDict()
|
| 14 |
-
with open(vocab_file, "r", encoding="utf-8") as reader:
|
| 15 |
-
tokens = reader.readlines()
|
| 16 |
-
for index, token in enumerate(tokens):
|
| 17 |
-
token = token.rstrip("\n")
|
| 18 |
-
vocab[token] = index
|
| 19 |
-
return vocab
|
| 20 |
-
|
| 21 |
-
class Atomwise_Tokenizer(object):
|
| 22 |
-
"""Run atom-level SMILES tokenization"""
|
| 23 |
-
|
| 24 |
-
def __init__(self):
|
| 25 |
-
""" Constructs a atom-level Tokenizer.
|
| 26 |
-
"""
|
| 27 |
-
# self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
|
| 28 |
-
self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
|
| 29 |
-
|
| 30 |
-
self.regex = re.compile(self.regex_pattern)
|
| 31 |
-
|
| 32 |
-
def tokenize(self, text):
|
| 33 |
-
""" Basic Tokenization of a SMILES.
|
| 34 |
-
"""
|
| 35 |
-
tokens = [token for token in self.regex.findall(text)]
|
| 36 |
-
return tokens
|
| 37 |
-
|
| 38 |
-
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
|
| 39 |
-
r"""
|
| 40 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 41 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 42 |
-
should refer to the superclass for more information regarding methods.
|
| 43 |
-
Args:
|
| 44 |
-
vocab_file (:obj:`string`):
|
| 45 |
-
File containing the vocabulary.
|
| 46 |
-
spe_file (:obj:`string`):
|
| 47 |
-
File containing the trained SMILES Pair Encoding vocabulary.
|
| 48 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 49 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 50 |
-
token instead.
|
| 51 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 52 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 53 |
-
for sequence classification or for a text and a question for question answering.
|
| 54 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 55 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 56 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 57 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 58 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 59 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 60 |
-
special tokens.
|
| 61 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 62 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 63 |
-
modeling. This is the token which the model will try to predict.
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
def __init__(self, vocab_file, spe_file,
|
| 67 |
-
unk_token="[UNK]",
|
| 68 |
-
sep_token="[SEP]",
|
| 69 |
-
pad_token="[PAD]",
|
| 70 |
-
cls_token="[CLS]",
|
| 71 |
-
mask_token="[MASK]",
|
| 72 |
-
**kwargs):
|
| 73 |
-
if not os.path.isfile(vocab_file):
|
| 74 |
-
raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
|
| 75 |
-
if not os.path.isfile(spe_file):
|
| 76 |
-
raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
|
| 77 |
-
|
| 78 |
-
self.vocab = load_vocab(vocab_file)
|
| 79 |
-
self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
|
| 80 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 81 |
-
self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
|
| 82 |
-
|
| 83 |
-
super().__init__(
|
| 84 |
-
unk_token=unk_token,
|
| 85 |
-
sep_token=sep_token,
|
| 86 |
-
pad_token=pad_token,
|
| 87 |
-
cls_token=cls_token,
|
| 88 |
-
mask_token=mask_token,
|
| 89 |
-
**kwargs)
|
| 90 |
-
|
| 91 |
-
@property
|
| 92 |
-
def vocab_size(self):
|
| 93 |
-
return len(self.vocab)
|
| 94 |
-
|
| 95 |
-
def get_vocab(self):
|
| 96 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 97 |
-
|
| 98 |
-
def _tokenize(self, text):
|
| 99 |
-
return self.spe_tokenizer.tokenize(text).split(' ')
|
| 100 |
-
|
| 101 |
-
def _convert_token_to_id(self, token):
|
| 102 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 103 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 104 |
-
|
| 105 |
-
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
|
| 106 |
-
text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
| 107 |
-
return self.convert_tokens_to_string(text)
|
| 108 |
-
|
| 109 |
-
def _convert_id_to_token(self, index):
|
| 110 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 111 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 112 |
-
|
| 113 |
-
def convert_tokens_to_string(self, tokens):
|
| 114 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 115 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 116 |
-
return out_string
|
| 117 |
-
|
| 118 |
-
def build_inputs_with_special_tokens(
|
| 119 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 120 |
-
) -> List[int]:
|
| 121 |
-
"""
|
| 122 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 123 |
-
by concatenating and adding special tokens.
|
| 124 |
-
A BERT sequence has the following format:
|
| 125 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 126 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 127 |
-
Args:
|
| 128 |
-
token_ids_0 (:obj:`List[int]`):
|
| 129 |
-
List of IDs to which the special tokens will be added
|
| 130 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 131 |
-
Optional second list of IDs for sequence pairs.
|
| 132 |
-
Returns:
|
| 133 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 134 |
-
"""
|
| 135 |
-
if token_ids_1 is None:
|
| 136 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 137 |
-
cls = [self.cls_token_id]
|
| 138 |
-
sep = [self.sep_token_id]
|
| 139 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 140 |
-
|
| 141 |
-
def get_special_tokens_mask(
|
| 142 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 143 |
-
) -> List[int]:
|
| 144 |
-
"""
|
| 145 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 146 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 147 |
-
Args:
|
| 148 |
-
token_ids_0 (:obj:`List[int]`):
|
| 149 |
-
List of ids.
|
| 150 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 151 |
-
Optional second list of IDs for sequence pairs.
|
| 152 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 153 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 154 |
-
Returns:
|
| 155 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 156 |
-
"""
|
| 157 |
-
|
| 158 |
-
if already_has_special_tokens:
|
| 159 |
-
if token_ids_1 is not None:
|
| 160 |
-
raise ValueError(
|
| 161 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 162 |
-
"ids is already formated with special tokens for the model."
|
| 163 |
-
)
|
| 164 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 165 |
-
|
| 166 |
-
if token_ids_1 is not None:
|
| 167 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 168 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 169 |
-
|
| 170 |
-
def create_token_type_ids_from_sequences(
|
| 171 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 172 |
-
) -> List[int]:
|
| 173 |
-
"""
|
| 174 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 175 |
-
A BERT sequence pair mask has the following format:
|
| 176 |
-
::
|
| 177 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 178 |
-
| first sequence | second sequence |
|
| 179 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 180 |
-
Args:
|
| 181 |
-
token_ids_0 (:obj:`List[int]`):
|
| 182 |
-
List of ids.
|
| 183 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 184 |
-
Optional second list of IDs for sequence pairs.
|
| 185 |
-
Returns:
|
| 186 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 187 |
-
sequence(s).
|
| 188 |
-
"""
|
| 189 |
-
sep = [self.sep_token_id]
|
| 190 |
-
cls = [self.cls_token_id]
|
| 191 |
-
if token_ids_1 is None:
|
| 192 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 193 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 194 |
-
|
| 195 |
-
def save_vocabulary(self, vocab_path):
|
| 196 |
-
"""
|
| 197 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 198 |
-
Args:
|
| 199 |
-
vocab_path (:obj:`str`):
|
| 200 |
-
The directory in which to save the vocabulary.
|
| 201 |
-
Returns:
|
| 202 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 203 |
-
"""
|
| 204 |
-
index = 0
|
| 205 |
-
if os.path.isdir(vocab_path):
|
| 206 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 207 |
-
else:
|
| 208 |
-
vocab_file = vocab_path
|
| 209 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 210 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 211 |
-
if index != token_index:
|
| 212 |
-
logger.warning(
|
| 213 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 214 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 215 |
-
)
|
| 216 |
-
index = token_index
|
| 217 |
-
writer.write(token + "\n")
|
| 218 |
-
index += 1
|
| 219 |
-
return (vocab_file,)
|
| 220 |
-
|
| 221 |
-
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
|
| 222 |
-
r"""
|
| 223 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 224 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 225 |
-
should refer to the superclass for more information regarding methods.
|
| 226 |
-
Args:
|
| 227 |
-
vocab_file (:obj:`string`):
|
| 228 |
-
File containing the vocabulary.
|
| 229 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 230 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 231 |
-
token instead.
|
| 232 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 233 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 234 |
-
for sequence classification or for a text and a question for question answering.
|
| 235 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 236 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 237 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 238 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 239 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 240 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 241 |
-
special tokens.
|
| 242 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 243 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 244 |
-
modeling. This is the token which the model will try to predict.
|
| 245 |
-
"""
|
| 246 |
-
|
| 247 |
-
def __init__(
|
| 248 |
-
self,
|
| 249 |
-
vocab_file,
|
| 250 |
-
unk_token="[UNK]",
|
| 251 |
-
sep_token="[SEP]",
|
| 252 |
-
pad_token="[PAD]",
|
| 253 |
-
cls_token="[CLS]",
|
| 254 |
-
mask_token="[MASK]",
|
| 255 |
-
**kwargs
|
| 256 |
-
):
|
| 257 |
-
super().__init__(
|
| 258 |
-
unk_token=unk_token,
|
| 259 |
-
sep_token=sep_token,
|
| 260 |
-
pad_token=pad_token,
|
| 261 |
-
cls_token=cls_token,
|
| 262 |
-
mask_token=mask_token,
|
| 263 |
-
**kwargs,
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
if not os.path.isfile(vocab_file):
|
| 267 |
-
raise ValueError(
|
| 268 |
-
"Can't find a vocabulary file at path '{}'.".format(vocab_file)
|
| 269 |
-
)
|
| 270 |
-
self.vocab = load_vocab(vocab_file)
|
| 271 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 272 |
-
self.tokenizer = Atomwise_Tokenizer()
|
| 273 |
-
|
| 274 |
-
@property
|
| 275 |
-
def vocab_size(self):
|
| 276 |
-
return len(self.vocab)
|
| 277 |
-
|
| 278 |
-
def get_vocab(self):
|
| 279 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 280 |
-
|
| 281 |
-
def _tokenize(self, text):
|
| 282 |
-
return self.tokenizer.tokenize(text)
|
| 283 |
-
|
| 284 |
-
def _convert_token_to_id(self, token):
|
| 285 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 286 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 287 |
-
|
| 288 |
-
def _convert_id_to_token(self, index):
|
| 289 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 290 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 291 |
-
|
| 292 |
-
def convert_tokens_to_string(self, tokens):
|
| 293 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 294 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 295 |
-
return out_string
|
| 296 |
-
|
| 297 |
-
def build_inputs_with_special_tokens(
|
| 298 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 299 |
-
) -> List[int]:
|
| 300 |
-
"""
|
| 301 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 302 |
-
by concatenating and adding special tokens.
|
| 303 |
-
A BERT sequence has the following format:
|
| 304 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 305 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 306 |
-
Args:
|
| 307 |
-
token_ids_0 (:obj:`List[int]`):
|
| 308 |
-
List of IDs to which the special tokens will be added
|
| 309 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 310 |
-
Optional second list of IDs for sequence pairs.
|
| 311 |
-
Returns:
|
| 312 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 313 |
-
"""
|
| 314 |
-
if token_ids_1 is None:
|
| 315 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 316 |
-
cls = [self.cls_token_id]
|
| 317 |
-
sep = [self.sep_token_id]
|
| 318 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 319 |
-
|
| 320 |
-
def get_special_tokens_mask(
|
| 321 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 322 |
-
) -> List[int]:
|
| 323 |
-
"""
|
| 324 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 325 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 326 |
-
Args:
|
| 327 |
-
token_ids_0 (:obj:`List[int]`):
|
| 328 |
-
List of ids.
|
| 329 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 330 |
-
Optional second list of IDs for sequence pairs.
|
| 331 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 332 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 333 |
-
Returns:
|
| 334 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 335 |
-
"""
|
| 336 |
-
|
| 337 |
-
if already_has_special_tokens:
|
| 338 |
-
if token_ids_1 is not None:
|
| 339 |
-
raise ValueError(
|
| 340 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 341 |
-
"ids is already formated with special tokens for the model."
|
| 342 |
-
)
|
| 343 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 344 |
-
|
| 345 |
-
if token_ids_1 is not None:
|
| 346 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 347 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 348 |
-
|
| 349 |
-
def create_token_type_ids_from_sequences(
|
| 350 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 351 |
-
) -> List[int]:
|
| 352 |
-
"""
|
| 353 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 354 |
-
A BERT sequence pair mask has the following format:
|
| 355 |
-
::
|
| 356 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 357 |
-
| first sequence | second sequence |
|
| 358 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 359 |
-
Args:
|
| 360 |
-
token_ids_0 (:obj:`List[int]`):
|
| 361 |
-
List of ids.
|
| 362 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 363 |
-
Optional second list of IDs for sequence pairs.
|
| 364 |
-
Returns:
|
| 365 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 366 |
-
sequence(s).
|
| 367 |
-
"""
|
| 368 |
-
sep = [self.sep_token_id]
|
| 369 |
-
cls = [self.cls_token_id]
|
| 370 |
-
if token_ids_1 is None:
|
| 371 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 372 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 373 |
-
|
| 374 |
-
def save_vocabulary(self, vocab_path):
|
| 375 |
-
"""
|
| 376 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 377 |
-
Args:
|
| 378 |
-
vocab_path (:obj:`str`):
|
| 379 |
-
The directory in which to save the vocabulary.
|
| 380 |
-
Returns:
|
| 381 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 382 |
-
"""
|
| 383 |
-
index = 0
|
| 384 |
-
if os.path.isdir(vocab_path):
|
| 385 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 386 |
-
else:
|
| 387 |
-
vocab_file = vocab_path
|
| 388 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 389 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 390 |
-
if index != token_index:
|
| 391 |
-
logger.warning(
|
| 392 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 393 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 394 |
-
)
|
| 395 |
-
index = token_index
|
| 396 |
-
writer.write(token + "\n")
|
| 397 |
-
index += 1
|
| 398 |
-
return (vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
DELETED
|
Binary file (15.5 kB)
|
|
|