Daniel Probst committed

Commit 444d15c · 1 Parent(s): 9678908

Initial commit
Dockerfile ADDED
@@ -0,0 +1,16 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.11

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,407 @@
Attribution-NonCommercial 4.0 International

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.

     Considerations for licensors: Our public licenses are
     intended for use by those authorized to give the public
     permission to use material in ways otherwise restricted by
     copyright and certain other rights. Our licenses are
     irrevocable. Licensors should read and understand the terms
     and conditions of the license they choose before applying it.
     Licensors should also secure all rights necessary before
     applying our licenses so that the public can reuse the
     material as expected. Licensors should clearly mark any
     material not subject to the license. This includes other CC-
     licensed material, or material used under an exception or
     limitation to copyright. More considerations for licensors:
     wiki.creativecommons.org/Considerations_for_licensors

     Considerations for the public: By using one of our public
     licenses, a licensor grants the public permission to use the
     licensed material under specified terms and conditions. If
     the licensor's permission is not necessary for any reason--for
     example, because of any applicable exception or limitation to
     copyright--then that use is not regulated by the license. Our
     licenses grant only permissions under copyright and certain
     other rights that a licensor has authority to grant. Use of
     the licensed material may still be restricted for other
     reasons, including because others have copyright or other
     rights in the material. A licensor may make special requests,
     such as asking that all changes be marked or described.
     Although not required by our licenses, you are encouraged to
     respect those requests where reasonable. More considerations
     for the public:
     wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution-NonCommercial 4.0 International Public
License

By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution-NonCommercial 4.0 International Public License ("Public
License"). To the extent this Public License may be interpreted as a
contract, You are granted the Licensed Rights in consideration of Your
acceptance of these terms and conditions, and the Licensor grants You
such rights in consideration of benefits the Licensor receives from
making the Licensed Material available under these terms and
conditions.


Section 1 -- Definitions.

  a. Adapted Material means material subject to Copyright and Similar
     Rights that is derived from or based upon the Licensed Material
     and in which the Licensed Material is translated, altered,
     arranged, transformed, or otherwise modified in a manner requiring
     permission under the Copyright and Similar Rights held by the
     Licensor. For purposes of this Public License, where the Licensed
     Material is a musical work, performance, or sound recording,
     Adapted Material is always produced where the Licensed Material is
     synched in timed relation with a moving image.

  b. Adapter's License means the license You apply to Your Copyright
     and Similar Rights in Your contributions to Adapted Material in
     accordance with the terms and conditions of this Public License.

  c. Copyright and Similar Rights means copyright and/or similar rights
     closely related to copyright including, without limitation,
     performance, broadcast, sound recording, and Sui Generis Database
     Rights, without regard to how the rights are labeled or
     categorized. For purposes of this Public License, the rights
     specified in Section 2(b)(1)-(2) are not Copyright and Similar
     Rights.

  d. Effective Technological Measures means those measures that, in the
     absence of proper authority, may not be circumvented under laws
     fulfilling obligations under Article 11 of the WIPO Copyright
     Treaty adopted on December 20, 1996, and/or similar international
     agreements.

  e. Exceptions and Limitations means fair use, fair dealing, and/or
     any other exception or limitation to Copyright and Similar Rights
     that applies to Your use of the Licensed Material.

  f. Licensed Material means the artistic or literary work, database,
     or other material to which the Licensor applied this Public
     License.

  g. Licensed Rights means the rights granted to You subject to the
     terms and conditions of this Public License, which are limited to
     all Copyright and Similar Rights that apply to Your use of the
     Licensed Material and that the Licensor has authority to license.

  h. Licensor means the individual(s) or entity(ies) granting rights
     under this Public License.

  i. NonCommercial means not primarily intended for or directed towards
     commercial advantage or monetary compensation. For purposes of
     this Public License, the exchange of the Licensed Material for
     other material subject to Copyright and Similar Rights by digital
     file-sharing or similar means is NonCommercial provided there is
     no payment of monetary compensation in connection with the
     exchange.

  j. Share means to provide material to the public by any means or
     process that requires permission under the Licensed Rights, such
     as reproduction, public display, public performance, distribution,
     dissemination, communication, or importation, and to make material
     available to the public including in ways that members of the
     public may access the material from a place and at a time
     individually chosen by them.

  k. Sui Generis Database Rights means rights other than copyright
     resulting from Directive 96/9/EC of the European Parliament and of
     the Council of 11 March 1996 on the legal protection of databases,
     as amended and/or succeeded, as well as other essentially
     equivalent rights anywhere in the world.

  l. You means the individual or entity exercising the Licensed Rights
     under this Public License. Your has a corresponding meaning.


Section 2 -- Scope.

  a. License grant.

       1. Subject to the terms and conditions of this Public License,
          the Licensor hereby grants You a worldwide, royalty-free,
          non-sublicensable, non-exclusive, irrevocable license to
          exercise the Licensed Rights in the Licensed Material to:

            a. reproduce and Share the Licensed Material, in whole or
               in part, for NonCommercial purposes only; and

            b. produce, reproduce, and Share Adapted Material for
               NonCommercial purposes only.

       2. Exceptions and Limitations. For the avoidance of doubt, where
          Exceptions and Limitations apply to Your use, this Public
          License does not apply, and You do not need to comply with
          its terms and conditions.

       3. Term. The term of this Public License is specified in Section
          6(a).

       4. Media and formats; technical modifications allowed. The
          Licensor authorizes You to exercise the Licensed Rights in
          all media and formats whether now known or hereafter created,
          and to make technical modifications necessary to do so. The
          Licensor waives and/or agrees not to assert any right or
          authority to forbid You from making technical modifications
          necessary to exercise the Licensed Rights, including
          technical modifications necessary to circumvent Effective
          Technological Measures. For purposes of this Public License,
          simply making modifications authorized by this Section 2(a)
          (4) never produces Adapted Material.

       5. Downstream recipients.

            a. Offer from the Licensor -- Licensed Material. Every
               recipient of the Licensed Material automatically
               receives an offer from the Licensor to exercise the
               Licensed Rights under the terms and conditions of this
               Public License.

            b. No downstream restrictions. You may not offer or impose
               any additional or different terms or conditions on, or
               apply any Effective Technological Measures to, the
               Licensed Material if doing so restricts exercise of the
               Licensed Rights by any recipient of the Licensed
               Material.

       6. No endorsement. Nothing in this Public License constitutes or
          may be construed as permission to assert or imply that You
          are, or that Your use of the Licensed Material is, connected
          with, or sponsored, endorsed, or granted official status by,
          the Licensor or others designated to receive attribution as
          provided in Section 3(a)(1)(A)(i).

  b. Other rights.

       1. Moral rights, such as the right of integrity, are not
          licensed under this Public License, nor are publicity,
          privacy, and/or other similar personality rights; however, to
          the extent possible, the Licensor waives and/or agrees not to
          assert any such rights held by the Licensor to the limited
          extent necessary to allow You to exercise the Licensed
          Rights, but not otherwise.

       2. Patent and trademark rights are not licensed under this
          Public License.

       3. To the extent possible, the Licensor waives any right to
          collect royalties from You for the exercise of the Licensed
          Rights, whether directly or through a collecting society
          under any voluntary or waivable statutory or compulsory
          licensing scheme. In all other cases the Licensor expressly
          reserves any right to collect such royalties, including when
          the Licensed Material is used other than for NonCommercial
          purposes.


Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the
following conditions.

  a. Attribution.

       1. If You Share the Licensed Material (including in modified
          form), You must:

            a. retain the following if it is supplied by the Licensor
               with the Licensed Material:

                 i. identification of the creator(s) of the Licensed
                    Material and any others designated to receive
                    attribution, in any reasonable manner requested by
                    the Licensor (including by pseudonym if
                    designated);

                ii. a copyright notice;

               iii. a notice that refers to this Public License;

                iv. a notice that refers to the disclaimer of
                    warranties;

                 v. a URI or hyperlink to the Licensed Material to the
                    extent reasonably practicable;

            b. indicate if You modified the Licensed Material and
               retain an indication of any previous modifications; and

            c. indicate the Licensed Material is licensed under this
               Public License, and include the text of, or the URI or
               hyperlink to, this Public License.

       2. You may satisfy the conditions in Section 3(a)(1) in any
          reasonable manner based on the medium, means, and context in
          which You Share the Licensed Material. For example, it may be
          reasonable to satisfy the conditions by providing a URI or
          hyperlink to a resource that includes the required
          information.

       3. If requested by the Licensor, You must remove any of the
          information required by Section 3(a)(1)(A) to the extent
          reasonably practicable.

       4. If You Share Adapted Material You produce, the Adapter's
          License You apply must not prevent recipients of the Adapted
          Material from complying with this Public License.


Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:

  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
     to extract, reuse, reproduce, and Share all or a substantial
     portion of the contents of the database for NonCommercial purposes
     only;

  b. if You include all or a substantial portion of the database
     contents in a database in which You have Sui Generis Database
     Rights, then the database in which You have Sui Generis Database
     Rights (but not its individual contents) is Adapted Material; and

  c. You must comply with the conditions in Section 3(a) if You Share
     all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.


Section 5 -- Disclaimer of Warranties and Limitation of Liability.

  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

  c. The disclaimer of warranties and limitation of liability provided
     above shall be interpreted in a manner that, to the extent
     possible, most closely approximates an absolute disclaimer and
     waiver of all liability.


Section 6 -- Term and Termination.

  a. This Public License applies for the term of the Copyright and
     Similar Rights licensed here. However, if You fail to comply with
     this Public License, then Your rights under this Public License
     terminate automatically.

  b. Where Your right to use the Licensed Material has terminated under
     Section 6(a), it reinstates:

       1. automatically as of the date the violation is cured, provided
          it is cured within 30 days of Your discovery of the
          violation; or

       2. upon express reinstatement by the Licensor.

     For the avoidance of doubt, this Section 6(b) does not affect any
     right the Licensor may have to seek remedies for Your violations
     of this Public License.

  c. For the avoidance of doubt, the Licensor may also offer the
     Licensed Material under separate terms or conditions or stop
     distributing the Licensed Material at any time; however, doing so
     will not terminate this Public License.

  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
     License.


Section 7 -- Other Terms and Conditions.

  a. The Licensor shall not be bound by any additional or different
     terms or conditions communicated by You unless expressly agreed.

  b. Any arrangements, understandings, or agreements regarding the
     Licensed Material not stated herein are separate from and
     independent of the terms and conditions of this Public License.


Section 8 -- Interpretation.

  a. For the avoidance of doubt, this Public License does not, and
     shall not be interpreted to, reduce, limit, restrict, or impose
     conditions on any use of the Licensed Material that could lawfully
     be made without permission under this Public License.

  b. To the extent possible, if any provision of this Public License is
     deemed unenforceable, it shall be automatically reformed to the
     minimum extent necessary to make it enforceable. If the provision
     cannot be reformed, it shall be severed from this Public License
     without affecting the enforceability of the remaining terms and
     conditions.

  c. No term or condition of this Public License will be waived and no
     failure to comply consented to unless expressly agreed to by the
     Licensor.

  d. Nothing in this Public License constitutes or may be interpreted
     as a limitation upon, or waiver of, any privileges and immunities
     that apply to the Licensor or You, including from the legal
     processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the "Licensor." The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.

Creative Commons may be contacted at creativecommons.org.
MODEL_CARD.md ADDED
@@ -0,0 +1,31 @@
# Model card - tox21_rf_classifier

### Model details
- Model name: Random Forest Tox21 Baseline
- Developer: JKU Linz
- Paper URL: https://link.springer.com/article/10.1023/A:1010933404324
- Model type / architecture:
  - Random Forest implemented using sklearn.ensemble.RandomForestClassifier.
  - Hyperparameters: [link to config](https://huggingface.co/spaces/ml-jku/tox21_rf_classifier/blob/main/config/config.json)
  - A separate single-task RF is trained for each Tox21 target.
  - Inference: Access via FastAPI. Upon a Tox21 prediction request, a target-specific RF
    model is called separately for each target; outputs are collected across all single-task
    models and returned.
- Model version: v0
- Model date: 14.10.2025
- Reproducibility: Code for full training is available and enables retraining from
  scratch.

### Intended use
This model serves as a baseline for evaluating and comparing toxicity prediction methods
across the 12 Tox21 pathway assays. It is not intended for clinical decision-making without
experimental validation.

### Metric
Each Tox21 task is evaluated using the area under the receiver operating characteristic curve
(AUC). Overall performance is reported as the mean AUC across all tasks.
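As a sketch, the mean-AUC metric can be computed like this (assuming `numpy` and `scikit-learn`; the per-task masking of missing labels reflects the sparse Tox21 label matrix, and the helper name is illustrative, not the repo's actual code):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def mean_auc(y_true, y_score):
    """Per-task ROC AUC (ignoring NaN labels), averaged over all tasks."""
    aucs = []
    for j in range(y_true.shape[1]):
        mask = ~np.isnan(y_true[:, j])  # Tox21 labels are sparse per task
        aucs.append(roc_auc_score(y_true[mask, j], y_score[mask, j]))
    return float(np.mean(aucs))
```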
### Training data
Tox21 training and validation sets.

### Evaluation data
Tox21 test set.
README.md CHANGED
@@ -1,11 +1,117 @@
1
  ---
2
- title: Tox21 Mhfp
3
- emoji: 😻
4
- colorFrom: green
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
- license: mit
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Tox21 Random Forest Classifier
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ license: cc-by-nc-4.0
9
+ short_description: Random Forest Baseline for Tox21
10
  ---
11
 
# Tox21 Random Forest Classifier

This repository hosts a Hugging Face Space that provides an API for submitting models to the [Tox21 Leaderboard](https://huggingface.co/spaces/ml-jku/tox21_leaderboard).

Here, **Random Forest (RF)** models are trained on the Tox21 dataset, and the trained models are provided for inference. A separate RF model is trained for each of the twelve toxic effects. The input to the model is a **SMILES** string of a small molecule, and the output is a vector of 12 numeric values, one for each toxic effect of the Tox21 dataset.

**Important:** For leaderboard submission, your Space needs to include training code. The file `train.py` should train the model using the config specified inside the `config/` folder and save the final model parameters into a file inside the `checkpoints/` folder. The model should be trained on the [Tox21 dataset](https://huggingface.co/datasets/ml-jku/tox21) provided on Hugging Face. The datasets can be loaded like this:
```python
from datasets import load_dataset

ds = load_dataset("ml-jku/tox21", token=token)  # token: your Hugging Face access token
train_df = ds["train"].to_pandas()
val_df = ds["validation"].to_pandas()
```

Additionally, the Space needs to implement inference in the `predict()` function inside `predict.py`. The `predict()` function must keep the provided skeleton: it takes a list of SMILES strings as input and returns a nested prediction dictionary, with SMILES as keys and dictionaries of target-name/prediction pairs as values. Consequently, any preprocessing of SMILES strings must be executed on the fly during inference.
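The required contract can be sketched as follows (a minimal placeholder: the constant 0.0 predictions stand in for real checkpoint-backed inference, and the `TARGETS` list mirrors the assay names listed in `app.py`):

```python
from typing import Dict, List

TARGETS = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD",
    "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53",
]

def predict(smiles_list: List[str]) -> Dict[str, Dict[str, float]]:
    """Skeleton matching the leaderboard contract: list of SMILES in,
    nested {smiles: {target: prediction}} dictionary out."""
    results: Dict[str, Dict[str, float]] = {}
    for smi in smiles_list:
        # A real implementation featurizes `smi` on the fly and queries the
        # per-target RF models loaded from `checkpoints/`.
        results[smi] = {target: 0.0 for target in TARGETS}
    return results
```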
# Repository Structure
- `predict.py` - defines the `predict()` function required by the leaderboard (entry point for inference).
- `app.py` - FastAPI application wrapper (can be used as-is).
- `preprocess.py` - preprocesses SMILES strings to generate feature descriptors and saves the results as NPZ files in `data/`.
- `train.py` - trains and saves a model using the config in the `config/` folder.
- `config/` - the config file used by `train.py`.
- `logs/` - all logs of `train.py`, the saved model, and predictions on the validation set.
- `data/` - the RF operates on numerical data; during preprocessing in `preprocess.py`, two NPZ files containing molecule features are created and saved here.
- `checkpoints/` - the saved model used by `predict.py`.
- `src/` - core model & preprocessing logic:
  - `preprocess.py` - SMILES preprocessing logic
  - `model.py` - RF model class with processing, saving, and loading logic
  - `utils.py` - utility functions

# Quickstart with Spaces

You can easily adapt this project for your own Hugging Face account:

- Open this Space on Hugging Face.

- Click "Duplicate this Space" (top-right corner).

- Modify `src/` for your preprocessing pipeline and model class.

- Modify `predict()` inside `predict.py` to perform model inference while keeping the function skeleton unchanged to remain compatible with the leaderboard.

- Modify `train.py` and/or `preprocess.py` according to your model and preprocessing pipeline.

- Modify the file inside `config/` to contain all hyperparameters that are set in `train.py`.

That's it: your model will be available as an API endpoint for the Tox21 Leaderboard.
# Installation
To run (and train) the random forest, clone the repository and install the dependencies:

```bash
git clone https://huggingface.co/spaces/ml-jku/tox21_rf_classifier
cd tox21_rf_classifier

conda create -n tox21_rf_cls python=3.11
conda activate tox21_rf_cls
pip install -r requirements.txt
```
# Training

To train the Random Forest model from scratch, run:

```bash
python preprocess.py
python train.py
```

These commands will:
1. Load and preprocess the Tox21 training dataset
2. Train a Random Forest classifier
3. Store the resulting model in the `checkpoints/` directory
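Internally, training amounts to fitting one single-task classifier per assay. A rough sketch of that loop (a hypothetical helper with simplified hyperparameters; the real `train.py` reads its settings from `config/config.json` and uses the repo's own preprocessing):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def train_per_task(X, y, target_names, seed=0):
    """Fit one RF per Tox21 task, skipping molecules without a label for that task."""
    models = {}
    for j, name in enumerate(target_names):
        labeled = ~np.isnan(y[:, j])  # Tox21 labels are sparse per task
        clf = RandomForestClassifier(n_estimators=100, random_state=seed)
        clf.fit(X[labeled], y[labeled, j].astype(int))
        models[name] = clf
    return models
```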
# Inference

For inference, you only need `predict.py`.

Example usage inside Python:

```python
from predict import predict

smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]
results = predict(smiles_list)

print(results)
```

The output is a nested dictionary in the format:

```python
{
    "CCO": {"target1": 0, "target2": 1, ..., "target12": 0},
    "c1ccccc1": {"target1": 1, "target2": 0, ..., "target12": 1},
    "CC(=O)O": {"target1": 0, "target2": 0, ..., "target12": 0}
}
```

# Notes

- Adapting `predict.py`, `train.py`, `config/`, and `checkpoints/` is required for leaderboard submission.

- Preprocessing must be done inside `predict.py`, not just `train.py`.
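To make that last note concrete, on-the-fly preprocessing means `predict()` must turn raw SMILES strings into feature vectors itself at request time. A toy illustration (a hashed character-n-gram fingerprint, a deliberately simple stand-in for the actual descriptor pipeline in `src/preprocess.py`):

```python
import hashlib
import numpy as np

def smiles_fingerprint(smiles: str, n_bits: int = 64, n: int = 3) -> np.ndarray:
    """Toy binary fingerprint: hash each character n-gram of the SMILES
    string into a fixed-size bit vector."""
    fp = np.zeros(n_bits, dtype=np.uint8)
    for i in range(max(len(smiles) - n + 1, 1)):
        gram = smiles[i : i + n]
        idx = int(hashlib.md5(gram.encode()).hexdigest(), 16) % n_bits
        fp[idx] = 1
    return fp
```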
app.py ADDED
@@ -0,0 +1,78 @@
"""
This is the main entry point for the FastAPI application.
The app handles the request to predict toxicity for a list of SMILES strings.
"""

# ---------------------------------------------------------------------------------------
# Dependencies and global variable definition
import os
from typing import List, Dict, Optional
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel, Field

from predict import predict as predict_func

API_KEY = os.getenv("API_KEY")  # set via Space Secrets


# ---------------------------------------------------------------------------------------
class Request(BaseModel):
    smiles: List[str] = Field(min_items=1, max_items=1000)


class Response(BaseModel):
    predictions: dict
    model_info: Dict[str, str] = {}


app = FastAPI(title="toxicity-api")


@app.get("/")
def root():
    return {
        "message": "Toxicity Prediction API",
        "endpoints": {
            "/metadata": "GET - API metadata and capabilities",
            "/healthz": "GET - Health check",
            "/predict": "POST - Predict toxicity for SMILES",
        },
        "usage": "Send POST to /predict with {'smiles': ['your_smiles_here']} and Authorization header",
    }


@app.get("/metadata")
def metadata():
    return {
        "name": "Tox21 Random Forest Classifier",
        "version": "0.1.0",
        "max_batch_size": 256,
        "tox_endpoints": [
            "NR-AR",
            "NR-AR-LBD",
            "NR-AhR",
            "NR-Aromatase",
            "NR-ER",
            "NR-ER-LBD",
            "NR-PPAR-gamma",
            "SR-ARE",
            "SR-ATAD5",
            "SR-HSE",
            "SR-MMP",
            "SR-p53",
        ],
    }


@app.get("/healthz")
def healthz():
    return {"ok": True}


@app.post("/predict", response_model=Response)
def predict(request: Request):
    predictions = predict_func(request.smiles)
    return {
        "predictions": predictions,
        "model_info": {"name": "Tox21 Random Forest Classifier", "version": "0.1.0"},
    }
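Note that `app.py` reads `API_KEY` from the environment, but the code shown never checks it, even though the root endpoint's usage string mentions an Authorization header. A minimal bearer-token check could look like this (a hypothetical `authorize` helper in framework-agnostic plain Python; inside a FastAPI handler one would raise `HTTPException(status_code=401)` on failure):

```python
def authorize(authorization_header, api_key):
    """Return True iff the Authorization header carries the expected bearer token."""
    if api_key is None:
        # No key configured -> open access
        return True
    if not authorization_header or not authorization_header.startswith("Bearer "):
        return False
    return authorization_header[len("Bearer "):] == api_key
```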
checkpoints/.gitkeep ADDED
File without changes
config/config.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+     "seed": 0,
+     "debug": "false",
+     "device": "cpu",
+
+     "log_folder": "logs/",
+
+     "data_folder": "data/",
+     "cvfold": 4,
+     "ecfp": {
+         "radius": 3,
+         "fpsize": 8192
+     },
+     "merge_train_val": "true",
+     "descriptors": ["mhfps", "tox", "maccs", "rdkit_descrs"],
+     "feature_selection": {
+         "use": "true",
+         "min_var": 0.01,
+         "max_corr": 0.95,
+         "max_features": -1,
+         "min_var__feature_keys": ["mhfp", "tox", "maccs", "rdkit_descrs"],
+         "max_corr__feature_keys": ["mhfp", "tox", "maccs", "rdkit_descrs"],
+         "min_var__independent_keys": "false",
+         "max_corr__independent_keys": "false"
+     },
+     "feature_quantilization": {
+         "use": "true",
+         "feature_keys": ["rdkit_descrs"]
+     },
+     "max_samples": -1,
+     "scaler": "standard",
+     "preprocessor_path": "checkpoints/preprocessor.joblib",
+
+     "ckpt_path": "checkpoints/rf_alltasks.joblib",
+     "model_config": {
+         "NR-AR": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 5,
+             "n_estimators": 1000
+         },
+         "NR-AR-LBD": {
+             "max_depth": 12,
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 5,
+             "n_estimators": 1000
+         },
+         "NR-AhR": {
+             "max_depth": "none",
+             "max_features": "log2",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "NR-Aromatase": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 4,
+             "min_samples_split": 12,
+             "n_estimators": 1000
+         },
+         "NR-ER": {
+             "max_depth": 10,
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "NR-ER-LBD": {
+             "max_depth": 8,
+             "max_features": "sqrt",
+             "min_samples_leaf": 2,
+             "min_samples_split": 5,
+             "n_estimators": 1000
+         },
+         "NR-PPAR-gamma": {
+             "max_depth": "none",
+             "max_features": "log2",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "SR-ARE": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 5,
+             "n_estimators": 1000
+         },
+         "SR-ATAD5": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "SR-HSE": {
+             "max_depth": 16,
+             "max_features": "log2",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "SR-MMP": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 2,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         },
+         "SR-p53": {
+             "max_depth": "none",
+             "max_features": "sqrt",
+             "min_samples_leaf": 1,
+             "min_samples_split": 2,
+             "n_estimators": 1000
+         }
+     }
+ }
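The config above encodes booleans and nulls as the string literals `"true"`, `"false"`, and `"none"`, while the training logs show the loaded config with Python `True`/`False`/`None`. A minimal sketch of what `normalize_config` (from `src.utils`) might do, assuming it simply maps those literals recursively, is:

```python
def normalize_config(cfg):
    # Recursively map the string literals "true"/"false"/"none" used in
    # config.json to Python True/False/None; leave all other values as-is.
    literals = {"true": True, "false": False, "none": None}
    if isinstance(cfg, dict):
        return {k: normalize_config(v) for k, v in cfg.items()}
    if isinstance(cfg, list):
        return [normalize_config(v) for v in cfg]
    if isinstance(cfg, str) and cfg.lower() in literals:
        return literals[cfg.lower()]
    return cfg


cfg = normalize_config({"debug": "false", "NR-AR": {"max_depth": "none"}})
print(cfg)  # {'debug': False, 'NR-AR': {'max_depth': None}}
```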
logs/train_2025-11-21_17-59-45.log ADDED
@@ -0,0 +1,15 @@
+ 2025-11-21 17:59:45,120 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['mhfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 17:59:45,120 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
logs/train_2025-11-21_21-13-33.log ADDED
@@ -0,0 +1,15 @@
+ 2025-11-21 21:13:33,779 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['mhfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 21:13:33,779 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
logs/train_2025-11-21_21-29-10.log ADDED
@@ -0,0 +1,18 @@
+ 2025-11-21 21:29:10,146 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['ecfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['ecfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['ecfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 21:29:10,146 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ 2025-11-21 21:29:10,953 [INFO] Fitted RandomForestClassifier will be saved as: checkpoints/rf_alltasks.joblib
+ 2025-11-21 21:29:12,299 [INFO] Start training.
+ 2025-11-21 21:29:12,299 [INFO] Fit task NR-AR using 9645 samples
logs/train_2025-11-21_21-38-38.log ADDED
@@ -0,0 +1,15 @@
+ 2025-11-21 21:38:38,350 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['mhfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 21:38:38,350 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
logs/train_2025-11-21_21-42-58.log ADDED
@@ -0,0 +1,30 @@
+ 2025-11-21 21:42:58,767 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['mhfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 21:42:58,767 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ 2025-11-21 21:42:59,576 [INFO] Fitted RandomForestClassifier will be saved as: checkpoints/rf_alltasks.joblib
+ 2025-11-21 21:43:00,826 [INFO] Start training.
+ 2025-11-21 21:43:00,826 [INFO] Fit task NR-AR using 9645 samples
+ 2025-11-21 21:43:07,192 [INFO] Fit task NR-AR-LBD using 8844 samples
+ 2025-11-21 21:43:12,861 [INFO] Fit task NR-AhR using 8432 samples
+ 2025-11-21 21:43:15,695 [INFO] Fit task NR-Aromatase using 7431 samples
+ 2025-11-21 21:43:20,528 [INFO] Fit task NR-ER using 7953 samples
+ 2025-11-21 21:43:26,434 [INFO] Fit task NR-ER-LBD using 9031 samples
+ 2025-11-21 21:43:31,881 [INFO] Fit task NR-PPAR-gamma using 8442 samples
+ 2025-11-21 21:43:33,798 [INFO] Fit task SR-ARE using 7395 samples
+ 2025-11-21 21:43:40,050 [INFO] Fit task SR-ATAD5 using 9354 samples
+ 2025-11-21 21:43:46,811 [INFO] Fit task SR-HSE using 8409 samples
+ 2025-11-21 21:43:49,279 [INFO] Fit task SR-MMP using 7551 samples
+ 2025-11-21 21:43:55,132 [INFO] Fit task SR-p53 using 8894 samples
+ 2025-11-21 21:44:02,332 [INFO] Finished training.
logs/train_2025-11-21_21-45-08.log ADDED
@@ -0,0 +1,32 @@
+ 2025-11-21 21:45:08,091 [INFO] Config: {'seed': 0, 'debug': False, 'device': 'cpu', 'log_folder': 'logs/', 'data_folder': 'data/', 'cvfold': 4, 'ecfp': {'radius': 3, 'fpsize': 8192}, 'merge_train_val': True, 'descriptors': ['mhfps', 'tox', 'maccs', 'rdkit_descrs'], 'feature_selection': {'use': True, 'min_var': 0.01, 'max_corr': 0.95, 'max_features': -1, 'min_var__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'max_corr__feature_keys': ['mhfp', 'tox', 'maccs', 'rdkit_descrs'], 'min_var__independent_keys': False, 'max_corr__independent_keys': False}, 'feature_quantilization': {'use': True, 'feature_keys': ['rdkit_descrs']}, 'max_samples': -1, 'scaler': 'standard', 'preprocessor_path': 'checkpoints/preprocessor.joblib', 'ckpt_path': 'checkpoints/rf_alltasks.joblib', 'model_config': {'NR-AR': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AR-LBD': {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-AhR': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-Aromatase': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}, 'NR-ER': {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'NR-ER-LBD': {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}, 'NR-PPAR-gamma': {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-ARE': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}, 'SR-ATAD5': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-HSE': {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-MMP': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}, 'SR-p53': {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}}}
+ 2025-11-21 21:45:08,092 [INFO] Model config:
+ Model config:
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 1000}
+ {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
+ {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
+ 2025-11-21 21:45:08,925 [INFO] Fitted RandomForestClassifier will be saved as: checkpoints/rf_alltasks.joblib
+ 2025-11-21 21:45:10,035 [INFO] Start training.
+ 2025-11-21 21:45:10,036 [INFO] Fit task NR-AR using 9645 samples
+ 2025-11-21 21:45:16,392 [INFO] Fit task NR-AR-LBD using 8844 samples
+ 2025-11-21 21:45:22,091 [INFO] Fit task NR-AhR using 8432 samples
+ 2025-11-21 21:45:24,931 [INFO] Fit task NR-Aromatase using 7431 samples
+ 2025-11-21 21:45:29,749 [INFO] Fit task NR-ER using 7953 samples
+ 2025-11-21 21:45:35,716 [INFO] Fit task NR-ER-LBD using 9031 samples
+ 2025-11-21 21:45:41,136 [INFO] Fit task NR-PPAR-gamma using 8442 samples
+ 2025-11-21 21:45:43,052 [INFO] Fit task SR-ARE using 7395 samples
+ 2025-11-21 21:45:49,365 [INFO] Fit task SR-ATAD5 using 9354 samples
+ 2025-11-21 21:45:56,269 [INFO] Fit task SR-HSE using 8409 samples
+ 2025-11-21 21:45:58,780 [INFO] Fit task SR-MMP using 7551 samples
+ 2025-11-21 21:46:04,725 [INFO] Fit task SR-p53 using 8894 samples
+ 2025-11-21 21:46:12,055 [INFO] Finished training.
+ 2025-11-21 21:46:13,441 [INFO] Save model as: checkpoints/rf_alltasks.joblib
+ 2025-11-21 21:46:13,459 [INFO] Save preprocessor as: checkpoints/preprocessor.joblib
predict.py ADDED
@@ -0,0 +1,86 @@
+ """
+ This file includes a predict function for the Tox21 dataset.
+ As input it takes a list of SMILES strings and it outputs a nested dictionary
+ with SMILES and target names as keys.
+ """
+
+ # ---------------------------------------------------------------------------------------
+ # Dependencies
+ import json
+ import copy
+ from collections import defaultdict
+
+ import joblib
+ import numpy as np
+ from tqdm import tqdm
+
+ from src.model import Tox21RFClassifier
+ from src.preprocess import create_descriptors, FeaturePreprocessor
+ from src.utils import TASKS, normalize_config
+
+
+ # ---------------------------------------------------------------------------------------
+ CONFIG_FILE = "./config/config.json"
+
+
+ def predict(
+     smiles_list: list[str], default_prediction: float = 0.5
+ ) -> dict[str, dict[str, float]]:
+     """Applies the classifier to a list of SMILES strings. Returns
+     ``default_prediction`` for any molecule that could not be cleaned.
+
+     Args:
+         smiles_list (list[str]): list of SMILES strings
+         default_prediction (float): prediction assigned to molecules that failed cleaning
+
+     Returns:
+         dict: nested prediction dictionary, following {'<smiles>': {'<target>': <pred>}}
+     """
+     print(f"Received {len(smiles_list)} SMILES strings")
+
+     with open(CONFIG_FILE, "r") as f:
+         config = json.load(f)
+     config = normalize_config(config)
+
+     features, is_clean = create_descriptors(
+         smiles_list, config["descriptors"], **config["ecfp"]
+     )
+     print(f"Created descriptors for {sum(is_clean)} molecules.")
+     print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning")
+
+     # set up model and preprocessor
+     model = Tox21RFClassifier()
+     preprocessor = FeaturePreprocessor(
+         feature_selection_config=config["feature_selection"],
+         feature_quantilization_config=config["feature_quantilization"],
+         descriptors=config["descriptors"],
+         max_samples=config["max_samples"],
+         scaler=config["scaler"],
+     )
+
+     model.load(config["ckpt_path"])
+     print(f"Loaded model from {config['ckpt_path']}")
+
+     state = joblib.load(config["preprocessor_path"])
+     preprocessor.set_state(state)
+     print(f"Loaded preprocessor from {config['preprocessor_path']}")
+
+     # make predictions
+     predictions = defaultdict(dict)
+
+     print("Create predictions:")
+     for target in tqdm(TASKS):
+         X = copy.deepcopy(features)
+         X = {descr: array[is_clean] for descr, array in X.items()}
+         X = preprocessor.transform(X)
+
+         preds = np.empty_like(is_clean, dtype=np.float64)
+         preds[~is_clean] = default_prediction
+         preds[is_clean] = model.predict(target, X)
+
+         for smiles, pred in zip(smiles_list, preds):
+             predictions[smiles][target] = float(pred)
+         if config["debug"]:
+             break
+
+     return predictions
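`predict()` merges per-target model outputs back into the original input order: positions where cleaning failed receive `default_prediction`, and positions where it succeeded consume the model's predictions in order. The same masking pattern in a stdlib-only sketch (with dummy prediction values standing in for `model.predict`):

```python
def fill_predictions(is_clean, clean_preds, default_prediction=0.5):
    # Merge predictions back into the original molecule order:
    # positions where is_clean is False get default_prediction,
    # positions where it is True consume clean_preds in order.
    it = iter(clean_preds)
    return [next(it) if ok else default_prediction for ok in is_clean]


# Two of three molecules survived cleaning; the middle one did not.
preds = fill_predictions([True, False, True], [0.9, 0.1])
print(preds)  # [0.9, 0.5, 0.1]
```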
preprocess.py ADDED
@@ -0,0 +1,70 @@
1
+ # pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
2
+
3
+ """
4
+ This files includes a the data processing for Tox21.
5
+ As an input it takes a list of SMILES and it outputs a nested dictionary with
6
+ SMILES and target names as keys.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import argparse
12
+
13
+ import numpy as np
14
+
15
+ from src.preprocess import create_descriptors, get_tox21_split
16
+ from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config
17
+
18
+ parser = argparse.ArgumentParser(
19
+ description="Data preprocessing script for the Tox21 dataset"
20
+ )
21
+
22
+ parser.add_argument(
23
+ "--config",
24
+ type=str,
25
+ default="config/config.json",
26
+ )
27
+
28
+
29
+ def main(config):
30
+ """Create molecule descriptors for HF Tox21 dataset"""
31
+ ds = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])
32
+
33
+ splits = ["train", "validation"]
34
+ for split in splits:
35
+
36
+ print(f"Preprocess {split} molecules")
37
+
38
+ ds_split = ds[split]
39
+ smiles = list(ds_split["smiles"])
40
+
41
+ features, clean_mol_mask = create_descriptors(
42
+ smiles, config["descriptors"], **config["ecfp"]
43
+ )
44
+
45
+ labels = []
46
+ for task in TASKS:
47
+ labels.append(ds_split[task].to_numpy())
48
+ labels = np.stack(labels, axis=1)
49
+
50
+ save_path = os.path.join(config["data_folder"], f"tox21_{split}_cv{config['cvfold']}.npz")
51
+ with open(save_path, "wb") as f:
52
+ np.savez(
53
+ f,
54
+ clean_mol_mask=clean_mol_mask,
55
+ labels=labels,
56
+ **features,
57
+ )
58
+ print(f"Saved preprocessed {split} split under {save_path}")
59
+ print("Preprocessing finished successfully")
60
+
61
+
62
+ if __name__ == "__main__":
63
+ args = parser.parse_args()
64
+
65
+ with open(args.config, "r") as f:
66
+ config = json.load(f)
67
+ config = normalize_config(config)
68
+
69
+ create_dir(config["data_folder"])
70
+ main(config)
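The script stores each split with `np.savez`, packing the boolean mask, the label matrix, and one array per descriptor into a single archive. A small round-trip sketch of that format (the arrays and the file name here are hypothetical stand-ins for the real preprocessed data):

```python
import numpy as np
import os
import tempfile

# hypothetical stand-ins for the arrays the script saves
features = {"ecfps": np.random.rand(5, 8), "maccs": np.random.rand(5, 167)}
clean_mol_mask = np.array([True, True, False, True, True])
labels = np.random.randint(0, 2, size=(5, 12)).astype(float)

path = os.path.join(tempfile.mkdtemp(), "tox21_train_cv4.npz")
with open(path, "wb") as f:
    np.savez(f, clean_mol_mask=clean_mol_mask, labels=labels, **features)

# downstream training code can recover each descriptor array by key
data = np.load(path)
print(sorted(data.files))
```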
pyproject.toml ADDED
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "tox-21-mhfp-classifier"
3
+ version = "0.1.0"
4
+ description = "Tox21 toxicity prediction with MHFP fingerprints and per-task random forest classifiers"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
8
+ "datasets>=4.4.1",
9
+ "fastapi>=0.121.3",
10
+ "joblib==1.5.2",
11
+ "mhfp==1.9.6",
12
+ "numpy==2.3.3",
13
+ "rdkit==2025.9.1",
14
+ "scikit-learn==1.7.1",
15
+ "statsmodels==0.14.5",
16
+ "tabulate>=0.9.0",
17
+ "torch==2.8.0",
18
+ "uvicorn[standard]>=0.38.0",
19
+ ]
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ fastapi
2
+ uvicorn[standard]
3
+ statsmodels==0.14.5
4
+ rdkit==2025.9.1
5
+ numpy==2.3.3
6
+ scikit-learn==1.7.1
7
+ joblib==1.5.2
8
+ tabulate
9
+ datasets
10
+ torch==2.8.0
11
+ mhfp==1.9.6
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (152 Bytes). View file
 
src/__pycache__/model.cpython-313.pyc ADDED
Binary file (3.8 kB). View file
 
src/__pycache__/preprocess.cpython-313.pyc ADDED
Binary file (33 kB). View file
 
src/__pycache__/utils.cpython-313.pyc ADDED
Binary file (15.8 kB). View file
 
src/model.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ This files includes a RF model for Tox21.
3
+ As an input it takes a list of SMILES and it outputs a nested dictionary with
4
+ SMILES and target names as keys.
5
+ """
6
+
7
+ # ---------------------------------------------------------------------------------------
8
+ # Dependencies
9
+ import joblib
10
+ import numpy as np
11
+ from sklearn.ensemble import RandomForestClassifier
12
+
13
+ from .utils import TASKS
14
+
15
+
16
+ # ---------------------------------------------------------------------------------------
17
+ class Tox21RFClassifier:
18
+ """A random forest classifier that assigns a toxicity score to a given SMILES string."""
19
+
20
+ def __init__(self, seed: int = 42, config: dict = None):
21
+ """Initialize a random forest classifier for each of the 12 Tox21 tasks.
22
+
23
+ Args:
24
+ seed (int, optional): seed for RF to ensure reproducibility. Defaults to 42.
25
+ """
26
+ self.tasks = TASKS
27
+
28
+ self.models = {
29
+ task: RandomForestClassifier(
30
+ random_state=seed,
31
+ n_jobs=8,
32
+ **({"n_estimators": 1000} if config is None else config[task]),
33
+ )
34
+ for task in self.tasks
35
+ }
36
+
37
+ def load(self, path: str) -> None:
38
+ """Load model from filepath
39
+
40
+ Args:
41
+ path (str): filepath to model checkpoint
42
+ """
43
+ self.models = joblib.load(path)
44
+
45
+ def save(self, path: str) -> None:
46
+ """Save model to filepath
47
+
48
+ Args:
49
+ path (str): filepath to model checkpoint
50
+ """
51
+ joblib.dump(self.models, path)
52
+
53
+ def fit(self, task: str, X: np.ndarray, y: np.ndarray) -> None:
54
+ """Train the random forest for a given task
55
+
56
+ Args:
57
+ task (str): task to train
58
+ X (np.ndarray): training features
59
+ y (np.ndarray): training labels
60
+ """
61
+ assert task in self.tasks, f"Unknown task: {task}"
62
+ _X, _y = X.copy(), y.copy()
63
+ self.models[task].fit(_X, _y)
64
+
65
+ def predict(self, task: str, X: np.ndarray) -> np.ndarray:
66
+ """Predicts labels for a given Tox21 target using molecule features
67
+
68
+ Args:
69
+ task (str): the Tox21 target to predict for
70
+ X (np.ndarray): molecule features used for prediction
71
+
72
+ Returns:
73
+ np.ndarray: predicted probability for positive class
74
+ """
75
+ assert task in self.tasks, f"Unknown task: {task}"
76
+ assert (
77
+ len(X.shape) == 2
78
+ ), f"Function expects 2D np.array. Current shape: {X.shape}"
79
+ _X = X.copy()
80
+ return self.models[task].predict_proba(_X)[:, 1]
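The class above keeps one independent `RandomForestClassifier` per Tox21 target and reads the positive-class probability from column 1 of `predict_proba`. A self-contained sketch of that pattern on synthetic data (the task subset, feature matrix, and labels are made up for illustration):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
tasks = ["NR-AR", "SR-p53"]  # subset of the 12 Tox21 targets

# one independent forest per task, as in Tox21RFClassifier
models = {t: RandomForestClassifier(n_estimators=25, random_state=42) for t in tasks}

X = rng.normal(size=(60, 16))
y = {t: (X[:, i] > 0).astype(int) for i, t in enumerate(tasks)}

for t in tasks:
    models[t].fit(X, y[t])

# predict_proba(...)[:, 1] gives the positive-class (toxic) probability
probs = models["NR-AR"].predict_proba(X[:5])[:, 1]
print(probs.shape)
```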
src/preprocess.py ADDED
@@ -0,0 +1,704 @@
1
+ import copy
2
+ import json
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from datasets import load_dataset
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+ from sklearn.feature_selection import VarianceThreshold
11
+ from sklearn.preprocessing import StandardScaler, FunctionTransformer
12
+ from statsmodels.distributions.empirical_distribution import ECDF
13
+
14
+ from rdkit import Chem, DataStructs
15
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
16
+ from rdkit.Chem.rdchem import Mol
17
+ from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
18
+
19
+ from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer, FeatureDictMixin
20
+
21
+
22
+ class SquashScaler(TransformerMixin, BaseEstimator):
23
+ """
24
+ Scaler that performs sequential standardization, nonlinearity (tanh), and
25
+ re-standardization. Inspired by DeepTox (Mayr et al., 2016)
26
+ """
27
+
28
+ def __init__(self):
29
+ self.scaler1 = StandardScaler()
30
+ self.scaler2 = StandardScaler()
31
+
32
+ def fit(self, X):
33
+ _X = X.copy()
34
+ _X = self.scaler1.fit_transform(_X)
35
+ _X = np.tanh(_X)
36
+ self.scaler2.fit(_X)
37
+ self.is_fitted_ = True
38
+ return self
39
+
40
+ def transform(self, X):
41
+ _X = X.copy()
42
+ _X = self.scaler1.transform(_X)
43
+ _X = np.tanh(_X)
44
+ return self.scaler2.transform(_X)
45
+
46
+
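The `SquashScaler` pipeline is standardize, `tanh` (which squashes outliers into (-1, 1)), then standardize again. A minimal numpy/sklearn sketch of the same three steps on made-up skewed data:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.lognormal(size=(200, 3))  # skewed, outlier-heavy features

# standardize -> tanh squashes extreme values -> re-standardize
s1, s2 = StandardScaler(), StandardScaler()
Z = s2.fit_transform(np.tanh(s1.fit_transform(X)))

print(np.abs(Z.mean(axis=0)).max())  # ~0 after re-standardization
```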
47
+ SCALER_REGISTRY = {
48
+ None: FunctionTransformer,
49
+ "standard": StandardScaler,
50
+ "squash": SquashScaler,
51
+ }
52
+
53
+
54
+ class SubSampler(TransformerMixin, BaseEstimator):
55
+ """
56
+ Preprocessor that randomly samples `max_samples` from data.
57
+
58
+ Args:
59
+ max_samples (int): Maximum allowed samples. If -1, all samples are retained.
60
+
61
+ Input:
62
+ np.ndarray: A 2D NumPy array of shape (n_samples, n_features).
63
+
64
+ Output:
65
+ np.ndarray: Subsampled array of shape (min(n_samples, max_samples), n_features).
66
+ """
67
+
68
+ def __init__(self, *, max_samples=-1):
69
+ self.max_samples = max_samples
70
+ self.is_fitted_ = True
71
+
72
+ def fit(self, X: np.ndarray, y: np.ndarray | None = None):
73
+ return self
74
+
75
+ def transform(
76
+ self, X: np.ndarray, y: np.ndarray | None = None
77
+ ) -> np.ndarray | tuple[np.ndarray]:
78
+
79
+ _X = X.copy()
80
+ _y = y.copy() if y is not None else None
81
+
82
+ if self.max_samples > 0 and _X.shape[0] > self.max_samples:
83
+ resample_idxs = np.random.choice(
84
+ np.arange(_X.shape[0]), size=(self.max_samples,), replace=False
85
+ )
86
+ _X = _X[resample_idxs]
87
+ _y = _y[resample_idxs] if _y is not None else None
88
+
89
+ if _y is None:
90
+ return _X
91
+ return _X, _y
92
+
93
+
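The core of `SubSampler` is a single index draw that is applied to features and labels together, so the two stay aligned. A minimal sketch with synthetic arrays:

```python
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 4))
y = rng.integers(0, 2, size=100)

max_samples = 10
if X.shape[0] > max_samples:
    # one shared index draw keeps features and labels aligned
    idx = rng.choice(X.shape[0], size=max_samples, replace=False)
    X_sub, y_sub = X[idx], y[idx]

print(X_sub.shape, y_sub.shape)
```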
94
+ class FeatureSelector(FeatureDictMixin, TransformerMixin, BaseEstimator):
95
+ """
96
+ Preprocessor that performs feature selection based on variance and correlation.
97
+
98
+ This transformer selects features that:
99
+ 1. Have variance above a specified threshold.
100
+ 2. Are below a given pairwise correlation threshold.
101
+ 3. Among the remaining features, keeps only the top `max_features` with the highest variance.
102
+
103
+ The input and output are both dictionaries mapping feature types to their corresponding
104
+ feature matrices.
105
+
106
+ Args:
107
+ min_var (float): Minimum variance required for a feature to be retained.
108
+ max_corr (float): Maximum allowed correlation between features.
109
+ Features exceeding this threshold with others are removed.
110
+ max_features (int): Maximum number of features to keep after filtering.
111
+ If -1, all remaining features are retained.
112
+ feature_keys (list[str]): Features to apply feature selection to.
113
+ independent_keys (bool): Apply filtering only within features types.
114
+
115
+ Input:
116
+ dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
117
+ and each value is a 2D NumPy array of shape (n_samples, n_features).
118
+
119
+ Output:
120
+ dict[str, np.ndarray]: A dictionary with the same keys as the input,
121
+ containing only the selected features for each feature type.
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ *,
127
+ min_var=0.0,
128
+ max_corr=1.0,
129
+ max_features=-1,
130
+ feature_keys=None,
131
+ min_var__feature_keys=None,
132
+ max_corr__feature_keys=None,
133
+ max_features__feature_keys=None,
134
+ min_var__independent_keys=False,
135
+ max_corr__independent_keys=False,
136
+ max_features__independent_keys=False,
137
+ ):
138
+ self.min_var = min_var
139
+ self.max_corr = max_corr
140
+ self.max_features = max_features
141
+
142
+ self.min_var__feature_keys = min_var__feature_keys
143
+ self.max_corr__feature_keys = max_corr__feature_keys
144
+ self.max_features__feature_keys = max_features__feature_keys
145
+
146
+ self.min_var__independent_keys = min_var__independent_keys
147
+ self.max_corr__independent_keys = max_corr__independent_keys
148
+ self.max_features__independent_keys = max_features__independent_keys
149
+
150
+ super().__init__(feature_keys=feature_keys)
151
+
152
+ def _get_min_var_mask(self, X: np.ndarray, *args) -> np.ndarray:
153
+ var_thresh = VarianceThreshold(threshold=self.min_var)
154
+ return var_thresh.fit(X).get_support() # mask
155
+
156
+ def _get_max_corr_mask(
157
+ self, X: np.ndarray, prev_feature_mask: np.ndarray
158
+ ) -> np.ndarray:
159
+ _prev_feature_mask = prev_feature_mask.copy()
160
+ corr_matrix = np.corrcoef(X[:, _prev_feature_mask], rowvar=False)
161
+ upper_tri = np.triu(corr_matrix, k=1)
162
+ to_keep = np.ones((sum(_prev_feature_mask),), dtype=bool)
163
+ for i in range(upper_tri.shape[0]):
164
+ for j in range(upper_tri.shape[1]):
165
+ if upper_tri[i, j] > self.max_corr:
166
+ to_keep[j] = False
167
+
168
+ _prev_feature_mask[_prev_feature_mask] = to_keep
169
+ return _prev_feature_mask
170
+
171
+ def _get_max_features_mask(
172
+ self, X: np.ndarray, prev_feature_mask: np.ndarray
173
+ ) -> np.ndarray:
174
+ _prev_feature_mask = prev_feature_mask.copy()
175
+ # keep the max_features features with the highest variance
176
+ feature_vars = np.nanvar(X[:, _prev_feature_mask], axis=0)
177
+ order = np.argsort(feature_vars)[: -(self.max_features + 1) : -1]
178
+ keep_feat_idx = np.arange(len(_prev_feature_mask))[_prev_feature_mask][order]
179
+ _prev_feature_mask = np.isin(
180
+ np.arange(len(_prev_feature_mask)), keep_feat_idx, assume_unique=True
181
+ )
182
+ return _prev_feature_mask
183
+
184
+ def apply_filter(self, filter, X, prev_feature_mask):
185
+ mask = prev_feature_mask.copy()
186
+ func = self.__getattribute__(f"_get_{filter}_mask")
187
+ feature_keys = self.__getattribute__(f"{filter}__feature_keys")
188
+
189
+ if self.__getattribute__(f"{filter}__independent_keys"):
190
+ for key in feature_keys:
191
+ key_mask = self._curr_keys == key
192
+ mask[key_mask] = func(X[:, key_mask], mask[key_mask])
193
+
194
+ else:
195
+ feature_key_mask = np.isin(self._curr_keys, feature_keys)
196
+ mask[feature_key_mask] = func(
197
+ X[:, feature_key_mask], mask[feature_key_mask]
198
+ )
199
+ return mask
200
+
201
+ def fit(self, X: dict[str, np.ndarray]):
202
+ _X = self.dict_to_array(X)
203
+ feature_mask = np.ones((_X.shape[1]), dtype=bool)
204
+
205
+ # select features with at least min_var variation
206
+ if self.min_var > 0.0:
207
+ if self.min_var__independent_keys:
208
+ for key in self.min_var__feature_keys:
209
+ key_mask = self._curr_keys == key
210
+ feature_mask[key_mask] = self._get_min_var_mask(_X[:, key_mask])
211
+
212
+ else:
213
+ feature_key_mask = np.isin(self._curr_keys, self.min_var__feature_keys)
214
+ feature_mask[feature_key_mask] = self._get_min_var_mask(
215
+ _X[:, feature_key_mask]
216
+ )
217
+
218
+ # drop features whose pairwise correlation exceeds max_corr
219
+ if self.max_corr < 1.0:
220
+ if self.max_corr__independent_keys:
221
+ for key in self.max_corr__feature_keys:
222
+ key_mask = self._curr_keys == key
223
+ subset = _X[:, key_mask]
224
+ feature_mask[key_mask] = self._get_max_corr_mask(
225
+ subset, feature_mask[key_mask]
226
+ )
227
+ else:
228
+ feature_key_mask = np.isin(self._curr_keys, self.max_corr__feature_keys)
229
+ feature_mask[feature_key_mask] = self._get_max_corr_mask(
230
+ _X[:, feature_key_mask], feature_mask[feature_key_mask]
231
+ )
232
+
233
+ if self.max_features == 0:
234
+ raise ValueError(
235
+ f"max_features (={self.max_features}) must be -1 or larger 0."
236
+ )
237
+ elif self.max_features > 0:
238
+ if self.max_features__independent_keys:
239
+ for key in self.max_features__feature_keys:
240
+ key_mask = self._curr_keys == key
241
+ feature_mask[key_mask] = self._get_max_features_mask(
242
+ _X[:, key_mask], feature_mask[key_mask]
243
+ )
244
+ else:
245
+ feature_key_mask = np.isin(
246
+ self._curr_keys, self.max_features__feature_keys
247
+ )
248
+ feature_mask[feature_key_mask] = self._get_max_features_mask(
249
+ _X[:, feature_key_mask], feature_mask[feature_key_mask]
250
+ )
251
+
252
+ self._feature_mask = feature_mask
253
+ self.is_fitted_ = True
254
+ return self
255
+
256
+ def transform(self, X: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
257
+ _X = self.dict_to_array(X)
258
+ _X = _X[:, self._feature_mask]
259
+ self._curr_keys = self._curr_keys[self._feature_mask]
260
+ return self.array_to_dict(_X)
261
+
262
+
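The two central filters of `FeatureSelector` are a variance threshold followed by a correlation filter over the surviving columns. A compact sketch of both steps with `VarianceThreshold` and `np.corrcoef` on synthetic data (the 0.95 cutoff is an illustrative choice, not the repo's config value):

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

rng = np.random.default_rng(0)
base = rng.normal(size=(100, 1))
# col 1 is a scaled copy of col 0; last col is constant
X = np.hstack([base, base * 1.001, rng.normal(size=(100, 2)), np.zeros((100, 1))])

# step 1: drop (near-)constant features
var_mask = VarianceThreshold(threshold=0.0).fit(X).get_support()

# step 2: among survivors, drop one member of each highly correlated pair
Xv = X[:, var_mask]
corr = np.abs(np.triu(np.corrcoef(Xv, rowvar=False), k=1))
keep = np.ones(Xv.shape[1], dtype=bool)
for i in range(corr.shape[0]):
    for j in range(corr.shape[1]):
        if corr[i, j] > 0.95:
            keep[j] = False

print(var_mask.sum(), keep.sum())
```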
263
+ class QuantileCreator(FeatureDictMixin, TransformerMixin, BaseEstimator):
264
+ """
265
+ Preprocessor that transforms features into empirical quantiles using ECDFs.
266
+
267
+ This transformer applies an Empirical Cumulative Distribution Function (ECDF)
268
+ to each feature and replaces feature values with their corresponding quantile
269
+ ranks. The transformation is applied independently to each feature type.
270
+
271
+ Both input and output are dictionaries mapping feature types to their
272
+ corresponding feature matrices.
273
+
274
+ Args:
275
+ feature_keys (list[str]): Features to apply quantile creation to.
276
+
277
+ Input:
278
+ dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
279
+ and each value is a 2D NumPy array of shape (n_samples, n_features).
280
+
281
+ Output:
282
+ dict[str, np.ndarray]: A dictionary with the same keys as the input,
283
+ where each feature value is replaced by its corresponding ECDF quantile rank.
284
+ """
285
+
286
+ def __init__(self, *, feature_keys=None):
287
+ self._ecdfs = None
288
+ super().__init__(feature_keys=feature_keys)
289
+
290
+ def fit(self, X: dict[str, np.ndarray]):
291
+ _X = self.dict_to_array(X)
292
+ ecdfs = []
293
+ for column in range(_X.shape[1]):
294
+ raw_values = _X[:, column].reshape(-1)
295
+ ecdfs.append(ECDF(raw_values))
296
+ self._ecdfs = ecdfs
297
+ self.is_fitted_ = True
298
+ return self
299
+
300
+ def transform(self, X: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
301
+ _X = self.dict_to_array(X)
302
+
303
+ quantiles = np.zeros_like(_X)
304
+ for column in range(_X.shape[1]):
305
+ raw_values = _X[:, column].reshape(-1)
306
+ ecdf = self._ecdfs[column]
307
+ q = ecdf(raw_values)
308
+ quantiles[:, column] = q
309
+
310
+ return self.array_to_dict(quantiles)
311
+
312
+
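`QuantileCreator` fits one ECDF per column on training data and maps new values to their empirical quantile rank. The same transform can be sketched without statsmodels via `np.searchsorted` (this numpy version matches statsmodels' `ECDF`, which returns the fraction of training values <= x):

```python
import numpy as np

def ecdf_transform(train_col, values):
    # fraction of training values <= each query value
    sorted_train = np.sort(train_col)
    return np.searchsorted(sorted_train, values, side="right") / len(sorted_train)

train = np.array([1.0, 2.0, 3.0, 4.0])
q = ecdf_transform(train, np.array([0.5, 2.0, 10.0]))
print(q)  # [0.  0.5 1. ]
```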
313
+ class FeaturePreprocessor(TransformerMixin, BaseEstimator):
314
+ """This class implements the feature preprocessing from a dictionary of molecule features."""
315
+
316
+ def __init__(
317
+ self,
318
+ feature_selection_config: dict[str, Any],
319
+ feature_quantilization_config: dict[str, Any],
320
+ descriptors: list[str],
321
+ max_samples: int = -1,
322
+ scaler: str = "standard",
323
+ ):
324
+ self.descriptors = descriptors
325
+
326
+ self.feature_quantilization_config = copy.deepcopy(
327
+ feature_quantilization_config
328
+ )
329
+ self.use_feat_quant = self.feature_quantilization_config.pop("use")
330
+ self.quantile_creator = QuantileCreator(**self.feature_quantilization_config)
331
+
332
+ self.feature_selection_config = copy.deepcopy(feature_selection_config)
333
+ self.use_feat_selec = self.feature_selection_config.pop("use")
334
+ self.feature_selection_config["feature_keys"] = descriptors
335
+ self.feature_selector = FeatureSelector(**self.feature_selection_config)
336
+
337
+ self.max_samples = max_samples
338
+ self.sub_sampler = SubSampler(max_samples=max_samples)
339
+
340
+ self.scaler = SCALER_REGISTRY[scaler]()
341
+
342
+ def __getstate__(self):
343
+ state = super().__getstate__()
344
+ state["quantile_creator"] = self.quantile_creator.__getstate__()
345
+ state["feature_selector"] = self.feature_selector.__getstate__()
346
+ state["sub_sampler"] = self.sub_sampler.__getstate__()
347
+ state["scaler"] = self.scaler.__getstate__()
348
+ return state
349
+
350
+ def __setstate__(self, state):
351
+ _state = copy.deepcopy(state)
352
+ self.quantile_creator.__setstate__(_state.pop("quantile_creator"))
353
+ self.feature_selector.__setstate__(_state.pop("feature_selector"))
354
+ self.sub_sampler.__setstate__(_state.pop("sub_sampler"))
355
+ self.scaler.__setstate__(_state.pop("scaler"))
356
+ super().__setstate__(_state)
357
+
358
+ def get_state(self):
359
+ return self.__getstate__()
360
+
361
+ def set_state(self, state):
362
+ return self.__setstate__(state)
363
+
364
+ def fit(self, X: dict[str, np.ndarray]):
365
+ """Fit the processor transformers"""
366
+ _X = copy.deepcopy(X)
367
+
368
+ if self.use_feat_quant:
369
+ _X = self.quantile_creator.fit_transform(_X)
370
+
371
+ if self.use_feat_selec:
372
+ _X = self.feature_selector.fit_transform(_X)
373
+
374
+ _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
375
+ self.scaler.fit(_X)
376
+ return self
377
+
378
+ def transform(
379
+ self, X: np.ndarray, y: np.ndarray | None = None
380
+ ) -> np.ndarray | tuple[np.ndarray]:
381
+
382
+ _X = X.copy()
383
+ _y = y.copy() if y is not None else None
384
+
385
+ if self.use_feat_quant:
386
+ _X = self.quantile_creator.transform(_X)
387
+ if self.use_feat_selec:
388
+ _X = self.feature_selector.transform(_X)
389
+ _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
390
+ _X = self.scaler.transform(_X)
391
+
392
+ if _y is None:
393
+ _X = self.sub_sampler.transform(_X)
394
+ return _X
395
+
396
+ _X, _y = self.sub_sampler.transform(_X, _y)
397
+ return _X, _y
398
+
399
+
400
+ def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
401
+ """This function creates cleaned RDKit mol objects from a list of SMILES.
402
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
403
+ Modification by Antonia Ebner:
404
+ - skip uncleanable molecules
405
+ - return clean molecule mask
406
+
407
+ Args:
408
+ smiles (list[str]): list of SMILES
409
+
410
+ Returns:
411
+ list[Mol]: list of cleaned molecules
412
+ np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
413
+ index `i` could not be cleaned and was removed.
414
+ """
415
+ sm = Standardizer(canon_taut=True)
416
+
417
+ clean_mol_mask = list()
418
+ mols = list()
419
+ for i, smile in enumerate(smiles):
420
+ mol = Chem.MolFromSmiles(smile)
421
+ standardized_mol, _ = sm.standardize_mol(mol)
422
+ is_cleaned = standardized_mol is not None
423
+ clean_mol_mask.append(is_cleaned)
424
+ if not is_cleaned:
425
+ continue
426
+ can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
427
+ mols.append(can_mol)
428
+
429
+ return mols, np.array(clean_mol_mask)
430
+
431
+
432
+ def create_ecfp_fps(mols: list[Mol], radius=3, fpsize=2048, **kwargs) -> np.ndarray:
433
+ """This function creates ECFP fingerprints for a list of molecules.
434
+ Inspired by https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
435
+
436
+ Args:
437
+ mols (list[Mol]): list of molecules
438
+
439
+ Returns:
440
+ np.ndarray: ECFP fingerprints of molecules
441
+ """
442
+ ecfps = list()
443
+
444
+ gen = rdFingerprintGenerator.GetMorganGenerator(
445
+ countSimulation=True, fpSize=fpsize, radius=radius
446
+ )
447
+ for mol in mols:
448
+ fp_sparse_vec = gen.GetCountFingerprint(mol)
449
+
450
+ fp = np.zeros((0,), np.int8)
451
+ DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
452
+
453
+ ecfps.append(fp)
454
+
455
+ return np.array(ecfps)
456
+
457
+
458
+ def create_mhfp_fps(
459
+ mols: list[Mol], radius=3, fpsize=2048, seed=42, **kwargs
460
+ ) -> np.ndarray:
461
+ """This function creates MHFP fingerprints for a list of molecules.
462
+ Inspired by https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
463
+
464
+ Args:
465
+ mols (list[Mol]): list of molecules
466
+
467
+ Returns:
468
+ np.ndarray: MHFP fingerprints of molecules
469
+ """
470
+ mhfps = list()
471
+
472
+ enc = MHFPEncoder(fpsize, seed)
473
+
474
+ for mol in mols:
475
+ hash_values = np.array(enc.EncodeMol(mol, radius=radius))
476
+ folded = np.zeros(fpsize, dtype=np.uint8)
477
+
478
+ if len(hash_values) > 0:
479
+ folded[hash_values % fpsize] = 1
480
+
481
+ mhfps.append(folded)
482
+
483
+ return np.array(mhfps)
484
+
485
+
486
+ def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
487
+ """This function creates MACCS keys for a list of molecules.
488
+
489
+ Args:
490
+ mols (list[Mol]): list of molecules
491
+
492
+ Returns:
493
+ np.ndarray: MACCS keys of molecules
494
+ """
495
+ maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
496
+ return np.array(maccs)
497
+
498
+
499
+ def get_tox_patterns(filepath: str):
500
+ """This retrieves the tox SMARTS patterns defined in filepath.
501
+ Args:
502
+ filepath (str): path to a JSON file containing tox SMARTS patterns
503
+ """
504
+ # load patterns
505
+ with open(filepath) as f:
506
+ smarts_list = [s[1] for s in json.load(f)]
507
+
508
+ # Code does not work for this case
509
+ assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0
510
+
511
+ # Chem.MolFromSmarts takes a long time so it pays off to parse all the smarts first
512
+ # and then use them for all molecules. This gives a huge speedup over existing code.
513
+ # a list of patterns, whether to negate the match result and how to join them to obtain one boolean value
514
+ all_patterns = []
515
+ for smarts in smarts_list:
516
+ patterns = [] # list of smarts-patterns
517
+ # value for each of the patterns above. Negates the values of the above later.
518
+ negations = []
519
+
520
+ if " AND " in smarts:
521
+ smarts = smarts.split(" AND ")
522
+ merge_any = False # If an ' AND ' is found all 'subsmarts' have to match
523
+ else:
524
+ # If there is an ' OR ' present it's enough is any of the 'subsmarts' match.
525
+ # This also accumulates smarts where neither ' OR ' nor ' AND ' occur
526
+ smarts = smarts.split(" OR ")
527
+ merge_any = True
528
+
529
+ # for all subsmarts check if they are preceded by 'NOT '
530
+ for s in smarts:
531
+ neg = s.startswith("NOT ")
532
+ if neg:
533
+ s = s[4:]
534
+ patterns.append(Chem.MolFromSmarts(s))
535
+ negations.append(neg)
536
+
537
+ all_patterns.append((patterns, negations, merge_any))
538
+ return all_patterns
539
+
540
+
541
+ def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
542
+ """Matches the tox patterns against a molecule. Returns a boolean array"""
543
+ tox_data = []
544
+ for mol in mols:
545
+ mol_features = []
546
+ for patts, negations, merge_any in patterns:
547
+ matches = [mol.HasSubstructMatch(p) for p in patts]
548
+ matches = [m != n for m, n in zip(matches, negations)]
549
+ if merge_any:
550
+ pres = any(matches)
551
+ else:
552
+ pres = all(matches)
553
+ mol_features.append(pres)
554
+
555
+ tox_data.append(np.array(mol_features))
556
+
557
+ return np.array(tox_data)
558
+
559
+
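The pattern expressions in `get_tox_patterns` are flat `AND`/`OR` combinations with optional `NOT ` prefixes (the code asserts the two connectives never co-occur in one expression). The string-parsing half of that logic, isolated without RDKit (the example expression is hypothetical):

```python
def parse_pattern_expr(smarts: str):
    # split on AND or OR; the source asserts they never co-occur
    if " AND " in smarts:
        parts, merge_any = smarts.split(" AND "), False  # all subpatterns must match
    else:
        parts, merge_any = smarts.split(" OR "), True  # any subpattern may match
    patterns, negations = [], []
    for s in parts:
        neg = s.startswith("NOT ")
        patterns.append(s[4:] if neg else s)
        negations.append(neg)
    return patterns, negations, merge_any

pats, negs, merge_any = parse_pattern_expr("NOT [OH] AND c1ccccc1")
print(pats, negs, merge_any)
```

In `create_tox_features`, each parsed pattern is matched with `HasSubstructMatch`, XOR-ed with its negation flag, and the results are merged with `any`/`all` depending on `merge_any`.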
560
+ def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
561
+ """This function creates RDKit descriptors for a list of molecules.
562
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
563
+
564
+ Args:
565
+ mols (list[Mol]): list of molecules
566
+
567
+ Returns:
568
+ np.ndarray: RDKit descriptors of molecules
569
+ """
570
+ rdkit_descriptors = list()
571
+
572
+ for mol in mols:
573
+ descrs = []
574
+ for _, descr_calc_fn in Descriptors._descList:
575
+ descrs.append(descr_calc_fn(mol))
576
+
577
+ descrs = np.array(descrs)
578
+ descrs = descrs[USED_200_DESCR]
579
+ rdkit_descriptors.append(descrs)
580
+
581
+ return np.array(rdkit_descriptors)
582
+
583
+
584
+ def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
585
+ """Create quantile values for given features using the columns
586
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
587
+
588
+ Args:
589
+ raw_features (np.ndarray): values to put into quantiles
590
+ ecdfs (list): ECDFs to use
591
+
592
+ Returns:
593
+ np.ndarray: computed quantiles
594
+ """
595
+ quantiles = np.zeros_like(raw_features)
596
+
597
+ for column in range(raw_features.shape[1]):
598
+ raw_values = raw_features[:, column].reshape(-1)
599
+ ecdf = ecdfs[column]
600
+ q = ecdf(raw_values)
601
+ quantiles[:, column] = q
602
+
603
+ return quantiles
604
+
605
+
606
+ def fill(features, mask, value=np.nan):
607
+ n_mols = len(mask)
608
+ n_features = features.shape[1]
609
+
610
+ data = np.zeros(shape=(n_mols, n_features))
611
+ data.fill(value)
612
+ data[~mask] = features
613
+ return data
614
+
615
+
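`fill` re-expands a feature matrix computed on the sanitized molecules back to the original molecule count, padding skipped rows with NaN. A small worked example (the arrays are made up; note the mask passed in marks the *skipped* rows, i.e. `~clean_mol_mask`):

```python
import numpy as np

def fill(features, mask, value=np.nan):
    # mask marks rows that were SKIPPED; features covers the remaining rows
    data = np.full((len(mask), features.shape[1]), value)
    data[~mask] = features
    return data

feats = np.array([[1.0, 2.0], [3.0, 4.0]])
skipped = np.array([False, True, False])  # middle molecule failed sanitization
out = fill(feats, skipped)
print(out)
```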
616
+ def create_descriptors(
617
+ smiles,
618
+ descriptors,
619
+ **ecfp_kwargs,
620
+ ):
621
+ """Generate molecular descriptors for multiple SMILES strings.
622
+ Inspired by https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
623
+
624
+ Each SMILES is processed and sanitized using RDKit.
625
+ SMILES that cannot be sanitized are encoded with NaNs, and a corresponding boolean mask
626
+ is returned to indicate which inputs were successfully processed.
627
+
628
+ Args:
629
+ smiles (list[str]): List of SMILES strings for which to generate descriptors.
630
+ descriptors (list[str]): List of descriptor types to compute.
631
+ Supported values include:
632
+ ['mhfps', 'ecfps', 'tox', 'maccs', 'rdkit_descrs'].
633
+
634
+ Returns:
635
+ tuple[dict[str, np.ndarray], np.ndarray]:
636
+ - A dictionary mapping descriptor names to their computed arrays.
637
+ - A boolean mask of shape (len(smiles),) indicating which SMILES
638
+ were successfully sanitized and processed.
639
+ """
640
+ # Create cleaned RDKit mol objects
641
+ mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
642
+ print(f"Cleaned molecules, {(~clean_mol_mask).sum()} could not be sanitized")
643
+
644
+ # Create fingerprints and descriptors
645
+ if "mhfps" in descriptors:
646
+ mhfps = create_mhfp_fps(mols, **ecfp_kwargs)
647
+ mhfps = fill(mhfps, ~clean_mol_mask)
648
+ print("Created MHFP fingerprints")
649
+
650
+ if "ecfps" in descriptors:
651
+ ecfps = create_ecfp_fps(mols, **ecfp_kwargs)
652
+ ecfps = fill(ecfps, ~clean_mol_mask)
653
+ print("Created ECFP fingerprints")
654
+
655
+ if "tox" in descriptors:
656
+ tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
657
+ tox = create_tox_features(mols, tox_patterns)
658
+ tox = fill(tox, ~clean_mol_mask)
659
+ print("Created Tox features")
660
+
661
+ if "maccs" in descriptors:
662
+ maccs = create_maccs_keys(mols)
663
+ maccs = fill(maccs, ~clean_mol_mask)
664
+ print("Created MACCS keys")
665
+
666
+ if "rdkit_descrs" in descriptors:
667
+ rdkit_descrs = create_rdkit_descriptors(mols)
668
+ rdkit_descrs = fill(rdkit_descrs, ~clean_mol_mask)
669
+ print("Created RDKit descriptors")
670
+
671
+ # concatenate features
672
+ features = {}
673
+ for descr in descriptors:
674
+ features[descr] = vars()[descr]
675
+
676
+ return features, clean_mol_mask
677
+
678
+
679
+ def get_tox21_split(token, cvfold=None):
680
+ """Retrieve Tox21 splits from HuggingFace with respect to given cvfold."""
681
+ ds = load_dataset("ml-jku/tox21", token=token)
682
+
683
+ train_df = ds["train"].to_pandas()
684
+ val_df = ds["validation"].to_pandas()
685
+
686
+ if cvfold is None:
687
+ return {"train": train_df, "validation": val_df}
688
+
689
+ combined_df = pd.concat([train_df, val_df], ignore_index=True)
690
+ cvfold = float(cvfold)
691
+
692
+ # create new splits
694
+ train_df = combined_df[combined_df.CVfold != cvfold]
695
+ val_df = combined_df[combined_df.CVfold == cvfold]
696
+
697
+ # exclude train mols that occur in the validation split
698
+ val_inchikeys = set(val_df["inchikey"])
699
+ train_df = train_df[~train_df["inchikey"].isin(val_inchikeys)]
700
+
701
+ return {
702
+ "train": train_df.reset_index(drop=True),
703
+ "validation": val_df.reset_index(drop=True),
704
+ }
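The fold logic of `get_tox21_split` is: pool train and validation, carve the requested `CVfold` out as the new validation set, and then drop any training molecule whose InChIKey also occurs in validation. A self-contained pandas sketch with a tiny made-up frame:

```python
import pandas as pd

# hypothetical frame standing in for the pooled HuggingFace splits
df = pd.DataFrame({
    "smiles": ["CCO", "CCN", "CCC", "CCO"],
    "inchikey": ["K1", "K2", "K3", "K1"],
    "CVfold": [0.0, 1.0, 2.0, 1.0],
})

cvfold = 1.0
train = df[df.CVfold != cvfold]
val = df[df.CVfold == cvfold]

# drop train molecules whose inchikey also appears in validation
train = train[~train["inchikey"].isin(set(val["inchikey"]))]
print(len(train), len(val))
```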
src/utils.py ADDED
@@ -0,0 +1,524 @@
+ ## These MolStandardizer classes are due to Paolo Tosco.
+ ## They were taken from the FS-Mol GitHub repository
+ ## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/
+ ## standardizer.py)
+ ## and ensure that a fixed sequence of standardization operations is applied.
+ ## https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e
+
+ import os
+ import pickle
+ from typing import Any
+
+ import numpy as np
+
+ from rdkit import Chem
+ from rdkit.Chem.MolStandardize import rdMolStandardize
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ TOX_SMARTS_PATH = "data/tox_smarts.json"
+
+ TASKS = [
+     "NR-AR",
+     "NR-AR-LBD",
+     "NR-AhR",
+     "NR-Aromatase",
+     "NR-ER",
+     "NR-ER-LBD",
+     "NR-PPAR-gamma",
+     "SR-ARE",
+     "SR-ATAD5",
+     "SR-HSE",
+     "SR-MMP",
+     "SR-p53",
+ ]
+
+ # Indices of the 200 RDKit descriptors that are used: 0-16 and 25-207.
+ USED_200_DESCR = list(range(17)) + list(range(25, 208))
+
+
+ class Standardizer:
+     """
+     Simple wrapper class around the RDKit Standardizer.
+     """
+
+     DEFAULT_CANON_TAUT = False
+     DEFAULT_METAL_DISCONNECT = False
+     MAX_TAUTOMERS = 100
+     MAX_TRANSFORMS = 100
+     MAX_RESTARTS = 200
+     PREFER_ORGANIC = True
+
+     def __init__(
+         self,
+         metal_disconnect=None,
+         canon_taut=None,
+     ):
+         """
+         Constructor. All parameters are optional.
+         :param metal_disconnect: if True, metallorganic complexes are
+             disconnected
+         :param canon_taut: if True, molecules are converted to their
+             canonical tautomer
+         """
+         super().__init__()
+         if metal_disconnect is None:
+             metal_disconnect = self.DEFAULT_METAL_DISCONNECT
+         if canon_taut is None:
+             canon_taut = self.DEFAULT_CANON_TAUT
+         self._canon_taut = canon_taut
+         self._metal_disconnect = metal_disconnect
+         self._taut_enumerator = None
+         self._uncharger = None
+         self._lfrag_chooser = None
+         self._metal_disconnector = None
+         self._normalizer = None
+         self._reionizer = None
+         self._params = None
+
+     @property
+     def params(self):
+         """Return the MolStandardize CleanupParameters."""
+         if self._params is None:
+             self._params = rdMolStandardize.CleanupParameters()
+             self._params.maxTautomers = self.MAX_TAUTOMERS
+             self._params.maxTransforms = self.MAX_TRANSFORMS
+             self._params.maxRestarts = self.MAX_RESTARTS
+             self._params.preferOrganic = self.PREFER_ORGANIC
+             self._params.tautomerRemoveSp3Stereo = False
+         return self._params
+
+     @property
+     def canon_taut(self):
+         """Return whether tautomer canonicalization will be done."""
+         return self._canon_taut
+
+     @property
+     def metal_disconnect(self):
+         """Return whether metallorganic complexes will be disconnected."""
+         return self._metal_disconnect
+
+     @property
+     def taut_enumerator(self):
+         """Return the TautomerEnumerator object."""
+         if self._taut_enumerator is None:
+             self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
+         return self._taut_enumerator
+
+     @property
+     def uncharger(self):
+         """Return the Uncharger object."""
+         if self._uncharger is None:
+             self._uncharger = rdMolStandardize.Uncharger()
+         return self._uncharger
+
+     @property
+     def lfrag_chooser(self):
+         """Return the LargestFragmentChooser object."""
+         if self._lfrag_chooser is None:
+             self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
+                 self.params.preferOrganic
+             )
+         return self._lfrag_chooser
+
+     @property
+     def metal_disconnector(self):
+         """Return the MetalDisconnector object."""
+         if self._metal_disconnector is None:
+             self._metal_disconnector = rdMolStandardize.MetalDisconnector()
+         return self._metal_disconnector
+
+     @property
+     def normalizer(self):
+         """Return the Normalizer object."""
+         if self._normalizer is None:
+             self._normalizer = rdMolStandardize.Normalizer(
+                 self.params.normalizationsFile, self.params.maxRestarts
+             )
+         return self._normalizer
+
+     @property
+     def reionizer(self):
+         """Return the Reionizer object."""
+         if self._reionizer is None:
+             self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
+         return self._reionizer
+
+     def charge_parent(self, mol_in):
+         """Sequentially apply a series of MolStandardize operations:
+         * MetalDisconnector
+         * Normalizer
+         * Reionizer
+         * LargestFragmentChooser
+         * Uncharger
+         The net result is that a desalted, normalized, neutral
+         molecule with implicit Hs is returned.
+         """
+         params = Chem.RemoveHsParameters()
+         params.removeAndTrackIsotopes = True
+         mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
+         if self._metal_disconnect:
+             mol_in = self.metal_disconnector.Disconnect(mol_in)
+         normalized = self.normalizer.normalize(mol_in)
+         Chem.SanitizeMol(normalized)
+         normalized = self.reionizer.reionize(normalized)
+         Chem.AssignStereochemistry(normalized)
+         normalized = self.lfrag_chooser.choose(normalized)
+         normalized = self.uncharger.uncharge(normalized)
+         # need this to reassess aromaticity on things like
+         # cyclopentadienyl, tropylium, azolium, etc.
+         Chem.SanitizeMol(normalized)
+         return Chem.RemoveHs(Chem.AddHs(normalized))
+
+     def standardize_mol(self, mol_in):
+         """
+         Standardize a single molecule.
+         :param mol_in: a Chem.Mol
+         :return: * a (standardized Chem.Mol, n_taut) tuple on success;
+                    n_taut is negative if tautomer enumeration was
+                    aborted due to reaching a limit
+                  * (None, error_msg) on failure
+         This calls self.charge_parent() and, if self._canon_taut
+         is True, runs tautomer canonicalization.
+         """
+         n_tautomers = 0
+         if isinstance(mol_in, Chem.Mol):
+             name = None
+             try:
+                 name = mol_in.GetProp("_Name")
+             except KeyError:
+                 pass
+             if not name:
+                 name = "NONAME"
+         else:
+             error = f"Expected SMILES or Chem.Mol as input, got {str(type(mol_in))}"
+             return None, error
+         try:
+             mol_out = self.charge_parent(mol_in)
+         except Exception as e:
+             error = f"charge_parent FAILED: {str(e).strip()}"
+             return None, error
+         if self._canon_taut:
+             try:
+                 res = self.taut_enumerator.Enumerate(mol_out, False)
+             except TypeError:
+                 # we are still on the pre-2021 RDKit API
+                 res = self.taut_enumerator.Enumerate(mol_out)
+             except Exception as e:
+                 # something else went wrong
+                 error = f"canon_taut FAILED: {str(e).strip()}"
+                 return None, error
+             n_tautomers = len(res)
+             if hasattr(res, "status"):
+                 completed = (
+                     res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
+                 )
+             else:
+                 # we are still on the pre-2021 RDKit API
+                 completed = len(res) < 1000
+             if not completed:
+                 n_tautomers = -n_tautomers
+             try:
+                 mol_out = self.taut_enumerator.PickCanonical(res)
+             except AttributeError:
+                 # we are still on the pre-2021 RDKit API
+                 mol_out = max(
+                     [(self.taut_enumerator.ScoreTautomer(m), m) for m in res]
+                 )[1]
+             except Exception as e:
+                 # something else went wrong
+                 error = f"canon_taut FAILED: {str(e).strip()}"
+                 return None, error
+         mol_out.SetProp("_Name", name)
+         return mol_out, n_tautomers
+
+
+ class FeatureDictMixin:
+     """
+     Mixin that enables bidirectional handling of dict-based multi-feature inputs.
+     Allows selective removal of columns directly from the combined array.
+
+     Example input:
+         {
+             "ecfps": np.ndarray,
+             "tox": np.ndarray,
+         }
+     """
+
+     def __init__(self, feature_keys=None):
+         self.feature_keys = feature_keys
+         self._curr_keys = None
+         self._unused_data = None
+
+     def dict_to_array(self, input: dict[Any, np.ndarray]) -> np.ndarray:
+         """Parse the dict input and concatenate it into a single array."""
+         if not isinstance(input, dict):
+             raise TypeError("Input must be a dict {feature_type: np.ndarray, ...}")
+
+         self._unused_data = {}
+         remaining_input = {}
+         for key in list(input.keys()):
+             if key not in self.feature_keys:
+                 self._unused_data[key] = input[key]
+             else:
+                 remaining_input[key] = input[key]
+
+         curr_keys = []
+         output = []
+         for key in self.feature_keys:
+             array = remaining_input.pop(key)
+             if array.ndim != 2:
+                 raise ValueError(f"Feature '{key}' must be 2D, got shape {array.shape}")
+
+             curr_keys.extend([key] * array.shape[1])
+             output.append(array)
+
+         self._curr_keys = np.array(curr_keys)
+
+         return np.concatenate(output, axis=1)
+
+     def array_to_dict(self, input: np.ndarray) -> dict[Any, np.ndarray]:
+         """Reconstruct the dict from a concatenated array."""
+         if self._curr_keys is None:
+             raise ValueError("No feature mapping stored. Did you call dict_to_array()?")
+
+         output = {key: input[:, self._curr_keys == key] for key in self.feature_keys}
+         output.update(self._unused_data)
+
+         self._curr_keys = None
+         self._unused_data = None
+         return output
+
+
+ def load_pickle(path: str):
+     with open(path, "rb") as file:
+         content = pickle.load(file)
+     return content
+
+
+ def write_pickle(path: str, obj: object):
+     with open(path, "wb") as file:
+         pickle.dump(obj, file)
+
+
+ def create_dir(path, is_file=False):
+     """Create the parent directories if a path to a file is given, else create the given directory."""
+     to_create = os.path.dirname(path) if is_file else path
+     if not os.path.exists(to_create):
+         os.makedirs(to_create)
+
+
+ def normalize_config(config: dict):
+     """Normalize a JSON config recursively by applying a mapping."""
+     mapping = {"none": None, "true": True, "false": False}
+     new_config = {}
+     for key, val in config.items():
+         if isinstance(val, dict):
+             new_config[key] = normalize_config(val)
+         elif isinstance(val, (int, float, str)) and val in mapping:
+             new_config[key] = mapping[val]
+         else:
+             new_config[key] = val
+     return new_config
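The FeatureDictMixin round trip can be sketched standalone with NumPy: concatenate named 2-D feature blocks into one array while remembering which columns belong to which key, then split the array back into a dict. (Re-implemented here for illustration with made-up feature names; the real logic in src/utils.py also tracks unused keys.)

```python
import numpy as np

# Hypothetical feature blocks, all with the same number of rows.
features = {
    "ecfps": np.ones((4, 3)),
    "tox": np.zeros((4, 2)),
}
feature_keys = ["ecfps", "tox"]

# One key label per column of the combined array.
col_keys = np.array(
    [key for key in feature_keys for _ in range(features[key].shape[1])]
)
combined = np.concatenate([features[key] for key in feature_keys], axis=1)

# Columns can now be selected/dropped on `combined`, then mapped back.
restored = {key: combined[:, col_keys == key] for key in feature_keys}
```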
train.py ADDED
@@ -0,0 +1,132 @@
+ """
+ Script for fitting and saving any preprocessing assets, as well as the fitted RF model.
+ """
+
+ import os
+ import json
+ import random
+ import logging
+ import argparse
+
+ import joblib
+ import numpy as np
+ from datetime import datetime
+
+ from src.model import Tox21RFClassifier
+ from src.preprocess import FeaturePreprocessor
+ from src.utils import create_dir, normalize_config
+
+ parser = argparse.ArgumentParser(description="RF training script for the Tox21 dataset")
+
+ parser.add_argument(
+     "--config",
+     type=str,
+     default="config/config.json",
+ )
+
+
+ def main(config):
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+     # set up the logger
+     logger = logging.getLogger(__name__)
+     script_name = os.path.splitext(os.path.basename(__file__))[0]
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s [%(levelname)s] %(message)s",
+         handlers=[
+             logging.FileHandler(
+                 os.path.join(
+                     config["log_folder"],
+                     f"{script_name}_{timestamp}.log",
+                 )
+             ),
+             logging.StreamHandler(),
+         ],
+     )
+
+     logger.info(f"Config: {config}")
+     model_config_repr = "\n".join(
+         [str(val) for val in config["model_config"].values()]
+     )
+     logger.info(f"Model config: \n{model_config_repr}")
+
+     # seeding
+     random.seed(config["seed"])
+     np.random.seed(config["seed"])
+
+     train_data = np.load(os.path.join(config["data_folder"], "tox21_train_cv4.npz"))
+     val_data = np.load(os.path.join(config["data_folder"], "tox21_validation_cv4.npz"))
+
+     # filter out unsanitized molecules
+     train_is_clean = train_data["clean_mol_mask"]
+     val_is_clean = val_data["clean_mol_mask"]
+     train_data = {descr: array[train_is_clean] for descr, array in train_data.items()}
+     val_data = {descr: array[val_is_clean] for descr, array in val_data.items()}
+
+     if config["merge_train_val"]:
+         data = {
+             descr: np.concatenate([train_data[descr], val_data[descr]], axis=0)
+             for descr in config["descriptors"]
+         }
+         labels = np.concatenate([train_data["labels"], val_data["labels"]], axis=0)
+     else:
+         data = {descr: train_data[descr] for descr in config["descriptors"]}
+         labels = train_data["labels"]
+
+     if config["ckpt_path"]:
+         logger.info(
+             f"Fitted RandomForestClassifier will be saved as: {config['ckpt_path']}"
+         )
+     else:
+         logger.info("Fitted RandomForestClassifier will NOT be saved.")
+
+     model = Tox21RFClassifier(seed=config["seed"], config=config["model_config"])
+
+     # set up the preprocessor
+     preprocessor = FeaturePreprocessor(
+         feature_selection_config=config["feature_selection"],
+         feature_quantilization_config=config["feature_quantilization"],
+         descriptors=config["descriptors"],
+         max_samples=config["max_samples"],
+         scaler=config["scaler"],
+     )
+     preprocessor.fit(data)
+
+     logger.info("Start training.")
+     for i, task in enumerate(model.tasks):
+         task_labels = labels[:, i]
+         label_mask = ~np.isnan(task_labels)
+         logger.info(f"Fit task {task} using {label_mask.sum()} samples")
+
+         task_data = {key: val[label_mask] for key, val in data.items()}
+         task_labels = task_labels[label_mask].astype(int)
+
+         task_data = preprocessor.transform(task_data)
+         model.fit(task, task_data, task_labels)
+         if config["debug"]:
+             break
+
+     logger.info("Finished training.")
+
+     if config["ckpt_path"]:
+         model.save(config["ckpt_path"])
+         logger.info(f"Saved model as: {config['ckpt_path']}")
+
+     if config["preprocessor_path"]:
+         state = preprocessor.get_state()
+         joblib.dump(state, config["preprocessor_path"])
+         logger.info(f"Saved preprocessor as: {config['preprocessor_path']}")
+
+
+ if __name__ == "__main__":
+     args = parser.parse_args()
+
+     with open(args.config, "r") as f:
+         config = json.load(f)
+     config = normalize_config(config)
+
+     create_dir(config["log_folder"])
+
+     main(config)
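train.py passes the parsed JSON config through normalize_config before use, which recursively maps the strings "none"/"true"/"false" to Python None/True/False. A standalone sketch of that behavior (the function body mirrors src/utils.py; the raw config values below are made up for illustration):

```python
def normalize_config(config: dict) -> dict:
    """Recursively map JSON placeholder strings to Python singletons."""
    mapping = {"none": None, "true": True, "false": False}
    new_config = {}
    for key, val in config.items():
        if isinstance(val, dict):
            # recurse into nested sections such as model_config
            new_config[key] = normalize_config(val)
        elif isinstance(val, (int, float, str)) and val in mapping:
            new_config[key] = mapping[val]
        else:
            new_config[key] = val
    return new_config


raw = {"ckpt_path": "none", "debug": "false", "model_config": {"bootstrap": "true"}}
cfg = normalize_config(raw)
```

This keeps the on-disk config plain JSON (which has no `None` literal and is often written with quoted booleans) while the Python code can rely on real `None`/`bool` values, e.g. in `if config["ckpt_path"]:`.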
uv.lock ADDED
The diff for this file is too large to render. See raw diff