mschuh commited on
Commit
94b1553
·
verified ·
1 Parent(s): 8fae56b

Added first version

Browse files
.example.env ADDED
@@ -0,0 +1 @@
 
 
1
+ TOKEN=example_token
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ predict copy.py
2
+ hp_search/logs/*
3
+ hp_search/models/*
4
+ __pycache__
5
+ .env
6
+ notes.txt
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.11.4

# Run as a non-root user with uid 1000 (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
# Make user-level pip installs (~/.local/bin) available on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install requirements before the application code so the
# dependency layer is cached across rebuilds that only change source files.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Port 7860 is the port Hugging Face Spaces expects the app to listen on.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. Copyright and Similar Rights means copyright and/or similar rights
88
+ closely related to copyright including, without limitation,
89
+ performance, broadcast, sound recording, and Sui Generis Database
90
+ Rights, without regard to how the rights are labeled or
91
+ categorized. For purposes of this Public License, the rights
92
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
93
+ Rights.
94
+ d. Effective Technological Measures means those measures that, in the
95
+ absence of proper authority, may not be circumvented under laws
96
+ fulfilling obligations under Article 11 of the WIPO Copyright
97
+ Treaty adopted on December 20, 1996, and/or similar international
98
+ agreements.
99
+
100
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
101
+ any other exception or limitation to Copyright and Similar Rights
102
+ that applies to Your use of the Licensed Material.
103
+
104
+ f. Licensed Material means the artistic or literary work, database,
105
+ or other material to which the Licensor applied this Public
106
+ License.
107
+
108
+ g. Licensed Rights means the rights granted to You subject to the
109
+ terms and conditions of this Public License, which are limited to
110
+ all Copyright and Similar Rights that apply to Your use of the
111
+ Licensed Material and that the Licensor has authority to license.
112
+
113
+ h. Licensor means the individual(s) or entity(ies) granting rights
114
+ under this Public License.
115
+
116
+ i. NonCommercial means not primarily intended for or directed towards
117
+ commercial advantage or monetary compensation. For purposes of
118
+ this Public License, the exchange of the Licensed Material for
119
+ other material subject to Copyright and Similar Rights by digital
120
+ file-sharing or similar means is NonCommercial provided there is
121
+ no payment of monetary compensation in connection with the
122
+ exchange.
123
+
124
+ j. Share means to provide material to the public by any means or
125
+ process that requires permission under the Licensed Rights, such
126
+ as reproduction, public display, public performance, distribution,
127
+ dissemination, communication, or importation, and to make material
128
+ available to the public including in ways that members of the
129
+ public may access the material from a place and at a time
130
+ individually chosen by them.
131
+
132
+ k. Sui Generis Database Rights means rights other than copyright
133
+ resulting from Directive 96/9/EC of the European Parliament and of
134
+ the Council of 11 March 1996 on the legal protection of databases,
135
+ as amended and/or succeeded, as well as other essentially
136
+ equivalent rights anywhere in the world.
137
+
138
+ l. You means the individual or entity exercising the Licensed Rights
139
+ under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+ Section 2 -- Scope.
143
+
144
+ a. License grant.
145
+
146
+ 1. Subject to the terms and conditions of this Public License,
147
+ the Licensor hereby grants You a worldwide, royalty-free,
148
+ non-sublicensable, non-exclusive, irrevocable license to
149
+ exercise the Licensed Rights in the Licensed Material to:
150
+
151
+ a. reproduce and Share the Licensed Material, in whole or
152
+ in part, for NonCommercial purposes only; and
153
+
154
+ b. produce, reproduce, and Share Adapted Material for
155
+ NonCommercial purposes only.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. No downstream restrictions. You may not offer or impose
186
+ any additional or different terms or conditions on, or
187
+ apply any Effective Technological Measures to, the
188
+ Licensed Material if doing so restricts exercise of the
189
+ Licensed Rights by any recipient of the Licensed
190
+ Material.
191
+
192
+ 6. No endorsement. Nothing in this Public License constitutes or
193
+ may be construed as permission to assert or imply that You
194
+ are, or that Your use of the Licensed Material is, connected
195
+ with, or sponsored, endorsed, or granted official status by,
196
+ the Licensor or others designated to receive attribution as
197
+ provided in Section 3(a)(1)(A)(i).
198
+
199
+ b. Other rights.
200
+
201
+ 1. Moral rights, such as the right of integrity, are not
202
+ licensed under this Public License, nor are publicity,
203
+ privacy, and/or other similar personality rights; however, to
204
+ the extent possible, the Licensor waives and/or agrees not to
205
+ assert any such rights held by the Licensor to the limited
206
+ extent necessary to allow You to exercise the Licensed
207
+ Rights, but not otherwise.
208
+
209
+ 2. Patent and trademark rights are not licensed under this
210
+ Public License.
211
+
212
+ 3. To the extent possible, the Licensor waives any right to
213
+ collect royalties from You for the exercise of the Licensed
214
+ Rights, whether directly or through a collecting society
215
+ under any voluntary or waivable statutory or compulsory
216
+ licensing scheme. In all other cases the Licensor expressly
217
+ reserves any right to collect such royalties, including when
218
+ the Licensed Material is used other than for NonCommercial
219
+ purposes.
220
+
221
+
222
+ Section 3 -- License Conditions.
223
+
224
+ Your exercise of the Licensed Rights is expressly made subject to the
225
+ following conditions.
226
+
227
+ a. Attribution.
228
+
229
+ 1. If You Share the Licensed Material (including in modified
230
+ form), You must:
231
+
232
+ a. retain the following if it is supplied by the Licensor
233
+ with the Licensed Material:
234
+
235
+ i. identification of the creator(s) of the Licensed
236
+ Material and any others designated to receive
237
+ attribution, in any reasonable manner requested by
238
+ the Licensor (including by pseudonym if
239
+ designated);
240
+
241
+ ii. a copyright notice;
242
+
243
+ iii. a notice that refers to this Public License;
244
+
245
+ iv. a notice that refers to the disclaimer of
246
+ warranties;
247
+
248
+ v. a URI or hyperlink to the Licensed Material to the
249
+ extent reasonably practicable;
250
+
251
+ b. indicate if You modified the Licensed Material and
252
+ retain an indication of any previous modifications; and
253
+
254
+ c. indicate the Licensed Material is licensed under this
255
+ Public License, and include the text of, or the URI or
256
+ hyperlink to, this Public License.
257
+
258
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
259
+ reasonable manner based on the medium, means, and context in
260
+ which You Share the Licensed Material. For example, it may be
261
+ reasonable to satisfy the conditions by providing a URI or
262
+ hyperlink to a resource that includes the required
263
+ information.
264
+
265
+ 3. If requested by the Licensor, You must remove any of the
266
+ information required by Section 3(a)(1)(A) to the extent
267
+ reasonably practicable.
268
+
269
+ 4. If You Share Adapted Material You produce, the Adapter's
270
+ License You apply must not prevent recipients of the Adapted
271
+ Material from complying with this Public License.
272
+
273
+
274
+ Section 4 -- Sui Generis Database Rights.
275
+
276
+ Where the Licensed Rights include Sui Generis Database Rights that
277
+ apply to Your use of the Licensed Material:
278
+
279
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
280
+ to extract, reuse, reproduce, and Share all or a substantial
281
+ portion of the contents of the database for NonCommercial purposes
282
+ only;
283
+
284
+ b. if You include all or a substantial portion of the database
285
+ contents in a database in which You have Sui Generis Database
286
+ Rights, then the database in which You have Sui Generis Database
287
+ Rights (but not its individual contents) is Adapted Material; and
288
+
289
+ c. You must comply with the conditions in Section 3(a) if You Share
290
+ all or a substantial portion of the contents of the database.
291
+
292
+ For the avoidance of doubt, this Section 4 supplements and does not
293
+ replace Your obligations under this Public License where the Licensed
294
+ Rights include other Copyright and Similar Rights.
295
+
296
+
297
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
298
+
299
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
300
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
301
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
302
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
303
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
304
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
305
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
306
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
307
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
308
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
309
+
310
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
311
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
312
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
313
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
314
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
315
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
316
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
317
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
318
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
319
+
320
+ c. The disclaimer of warranties and limitation of liability provided
321
+ above shall be interpreted in a manner that, to the extent
322
+ possible, most closely approximates an absolute disclaimer and
323
+ waiver of all liability.
324
+
325
+
326
+ Section 6 -- Term and Termination.
327
+
328
+ a. This Public License applies for the term of the Copyright and
329
+ Similar Rights licensed here. However, if You fail to comply with
330
+ this Public License, then Your rights under this Public License
331
+ terminate automatically.
332
+
333
+ b. Where Your right to use the Licensed Material has terminated under
334
+ Section 6(a), it reinstates:
335
+
336
+ 1. automatically as of the date the violation is cured, provided
337
+ it is cured within 30 days of Your discovery of the
338
+ violation; or
339
+
340
+ 2. upon express reinstatement by the Licensor.
341
+
342
+ For the avoidance of doubt, this Section 6(b) does not affect any
343
+ right the Licensor may have to seek remedies for Your violations
344
+ of this Public License.
345
+
346
+ c. For the avoidance of doubt, the Licensor may also offer the
347
+ Licensed Material under separate terms or conditions or stop
348
+ distributing the Licensed Material at any time; however, doing so
349
+ will not terminate this Public License.
350
+
351
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
352
+ License.
353
+
354
+
355
+ Section 7 -- Other Terms and Conditions.
356
+
357
+ a. The Licensor shall not be bound by any additional or different
358
+ terms or conditions communicated by You unless expressly agreed.
359
+
360
+ b. Any arrangements, understandings, or agreements regarding the
361
+ Licensed Material not stated herein are separate from and
362
+ independent of the terms and conditions of this Public License.
363
+
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
README.md CHANGED
@@ -1,12 +1,115 @@
1
- ---
2
- title: MultiTaskTox
3
- emoji: 🦀
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MultiTaskTox – LightGBM Fingerprint Classifier for Tox21
2
+
3
+ MultiTaskTox is a two-stage Gradient Boosting workflow purpose-built for the [Tox21](https://huggingface.co/datasets/ml-jku/tox21) benchmark. It ingests molecular SMILES strings, converts them into high-dimensional fingerprints (ECFP or MAP4), and trains a set of LightGBM classifiers that leverage cross-task signal to improve toxicity prediction across all 12 Tox21 targets.
4
+
5
+ ## Why MultiTaskTox?
6
+
7
+ - **Deterministic preprocessing** – every SMILES string is standardized through RDKit before fingerprint generation, ensuring training and inference behave identically.
8
+ - **Optuna-tuned per-task boosters** – each toxicity endpoint receives its own LightGBM classifier, tuned directly on the provided train/validation splits.
9
+ - **Multitask enhancement** – stage two augments the fingerprint vector with the predictions of the other tasks, capturing label correlations without building a fully joint model.
10
+ - **Leaderboard-ready interface** – `train.py` produces checkpoints and metadata under `checkpoints/`, while `predict.py` exposes the required `predict(smiles_list)` signature.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ git clone https://huggingface.co/spaces/ml-jku/tox21_gin_classifier
16
+ cd tox21_gin_classifier
17
+ python -m venv .venv && source .venv/bin/activate
18
+ pip install --upgrade pip
19
+ pip install -r requirements.txt
20
+ ```
21
+
22
+ The requirements include RDKit, LightGBM, Optuna, and the MAP4 fingerprint package so you can switch feature types via the config.
23
+
24
+ ## Training
25
+
26
+ 1. Create a `.env` file with your dataset token (on a Hugging Face Space, store it as a Space secret instead of committing a file):
27
+ ```
28
+ TOKEN=hf_xxx
29
+ ```
30
+ 2. Adjust `config/config.json` if needed (fingerprint type, Optuna trial count, etc.).
31
+ 3. Run:
32
+ ```bash
33
+ python train.py
34
+ ```
35
+
36
+ ### What `train.py` does
37
+
38
+ 1. Loads the predefined `train` and `validation` splits from the Tox21 dataset.
39
+ 2. Standardizes SMILES and builds fingerprints using `src/features.py`.
40
+ 3. For each target:
41
+ - Runs Optuna to find the best LightGBM hyperparameters using the validation split as the evaluation set.
42
+ - Fits the classifier (`stage1`) and stores the model as `checkpoints/stage1/<target>.pkl`.
43
+ 4. Generates prediction matrices for both splits.
44
+ 5. If multitask mode is enabled (`config["multitask"]["enabled"]`), creates augmented features (fingerprint + other-task predictions) and trains stage-two boosters saved under `checkpoints/stage2/`.
45
+ 6. Writes metrics (`metrics_stage1.json`, `metrics_stage2.json`) and a manifest (`training_manifest.json`) describing the experiment.
46
+
47
+ ## Inference
48
+
49
+ `predict.py` exposes:
50
+
51
+ ```python
52
+ from predict import predict
53
+
54
+ smiles = ["CCO", "c1ccccc1", "CC(=O)O"]
55
+ results = predict(smiles)
56
+ ```
57
+
58
+ The function:
59
+ 1. Loads the training manifest to know which fingerprint type and checkpoints to use.
60
+ 2. Standardizes and fingerprints the SMILES on the fly.
61
+ 3. Runs stage-one LightGBM classifiers to obtain probabilistic predictions.
62
+ 4. If stage-two models exist, augments the features with cross-task predictions and runs the multitask models.
63
+ 5. Returns `{smiles: {target_name: probability}}` with values in `[0, 1]`. Invalid SMILES fall back to `0.5`.
64
+
65
+ ## Configuration Overview (`config/config.json`)
66
+
67
+ ```json
68
+ {
69
+ "seed": 42,
70
+ "dataset": {"name": "ml-jku/tox21"},
71
+ "features": {
72
+ "type": "ecfp",
73
+ "radius": 2,
74
+ "n_bits": 1024,
75
+ "map4_dim": 1024,
76
+ "cache_dir": "./checkpoints/cache"
77
+ },
78
+ "training": {
79
+ "optuna_trials": 40,
80
+ "boosting_rounds": 1500,
81
+ "early_stopping_rounds": 100,
82
+ "lightgbm_params": {
83
+ "objective": "binary",
84
+ "metric": "auc",
85
+ "verbosity": -1
86
+ }
87
+ },
88
+ "multitask": {"enabled": true},
89
+ "output": {"checkpoint_dir": "./checkpoints"}
90
+ }
91
+ ```
92
+
93
+ - Switch `features.type` to `"map4"` to use MAP4 fingerprints (installed by default).
94
+ - Disable multitask behavior by setting `"multitask": {"enabled": false}`.
95
+ - Increase `optuna_trials` for a more exhaustive search if compute allows.
96
+
97
+ ## Repository Layout
98
+
99
+ - `train.py` – orchestrates the full training workflow (feature generation, Optuna tuning, stage-one and stage-two models).
100
+ - `predict.py` – leaderboard-friendly inference function that loads the checkpoints generated by `train.py`.
101
+ - `src/preprocess.py` – dataset loading and SMILES standardization helpers.
102
+ - `src/features.py` – fingerprint computation with disk caching.
103
+ - `src/lightgbm_trainer.py` – LightGBM + Optuna utilities for stage-one training.
104
+ - `src/stage_two.py` – multitask feature augmentation and model training.
105
+ - `src/constants.py`, `src/seed.py` – shared utilities.
106
+ - `docs/proposed_lightgbm_framework.md` – detailed design notes for the workflow.
107
+ - `checkpoints/` – default output directory containing models, metrics, caches, and the training manifest used at inference time.
108
+
109
+ ## Tips
110
+
111
+ - Training relies on the `TOKEN` environment variable to access the Tox21 dataset on Hugging Face. Locally you can omit it if the dataset is public for your account.
112
+ - MAP4 fingerprints are more expensive to compute; enable the cache directory to avoid recomputation across runs.
113
+ - Use the saved metrics files to compare stage-one vs. stage-two AUCs and to trace which configuration produced a set of checkpoints.
114
+
115
+ Happy modeling! If you extend MultiTaskTox (new fingerprints, alternative learners, etc.), keep the `predict(smiles_list)` contract intact so your Space remains leaderboard compatible.
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
This is the main entry point for the FastAPI application.
The app handles the request to predict toxicity for a list of SMILES strings.
"""

# ---------------------------------------------------------------------------------------
# Dependencies and global variable definition
import os
from typing import List, Dict, Optional
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel, Field

# Leaderboard-contract inference function returning
# {smiles: {target_name: probability}} (see predict.py).
from predict import predict as predict_func

# NOTE(review): API_KEY is loaded here but no route in this file checks it,
# and Header/HTTPException/Optional are imported but unused — presumably
# groundwork for API-key authentication on /predict. Confirm whether auth
# should actually be enforced before exposing the Space publicly.
API_KEY = os.getenv("API_KEY")  # set via Space Secrets
16
+
17
+
18
+ # ---------------------------------------------------------------------------------------
19
class Request(BaseModel):
    """Request payload for /predict: between 1 and 1000 SMILES strings."""

    # NOTE(review): `min_items`/`max_items` are the Pydantic v1 spelling;
    # Pydantic v2 renames them to `min_length`/`max_length` (the old names
    # are deprecated). Confirm the pinned pydantic version in requirements.
    smiles: List[str] = Field(min_items=1, max_items=1000)
21
+
22
+
23
class Response(BaseModel):
    """Response payload returned by /predict."""

    # Nested mapping {smiles: {target_name: probability}}; kept as a plain
    # dict so the per-target keys are not constrained at the schema level.
    predictions: dict
    # Free-form model metadata (e.g. name, version). Pydantic copies field
    # defaults per instance, so the mutable `{}` default is safe here.
    model_info: Dict[str, str] = {}
26
+
27
+
28
# ASGI application served by uvicorn (see the Dockerfile CMD: "app:app").
app = FastAPI(title="toxicity-api")
29
+
30
+
31
@app.get("/")
def root():
    """Landing route: describe the available endpoints and basic usage."""
    endpoints = {
        "/metadata": "GET - API metadata and capabilities",
        "/healthz": "GET - Health check",
        "/predict": "POST - Predict toxicity for SMILES",
    }
    return {
        "message": "Toxicity Prediction API",
        "endpoints": endpoints,
        "usage": "Send POST to /predict with {'smiles': ['your_smiles_here']}",
    }
42
+
43
+
44
@app.get("/metadata")
def metadata():
    """Return static model metadata and the 12 Tox21 toxicity endpoints.

    Fix: the name previously read "Tox21 GIN classifier", a leftover from
    the GIN baseline this Space replaced; the deployed model is the
    LightGBM-based MultiTaskTox (see README and /predict's model_info).
    """
    return {
        "name": "MultiTaskTox",
        # NOTE(review): /predict reports version "0.0.1" — align the two
        # version strings once a canonical version number is chosen.
        "version": "1.0.0",
        "tox_endpoints": [
            "NR-AR",
            "NR-AR-LBD",
            "NR-AhR",
            "NR-Aromatase",
            "NR-ER",
            "NR-ER-LBD",
            "NR-PPAR-gamma",
            "SR-ARE",
            "SR-ATAD5",
            "SR-HSE",
            "SR-MMP",
            "SR-p53",
        ],
    }
64
+
65
+
66
@app.get("/healthz")
def healthz():
    """Liveness probe; always returns {"ok": True} when the app is up."""
    return {"ok": True}
69
+
70
+
71
@app.post("/predict", response_model=Response)
def predict(request: Request):
    """Predict toxicity for the submitted SMILES strings.

    Delegates to the leaderboard `predict` function and wraps its nested
    {smiles: {target_name: probability}} mapping in the response schema.
    """
    scores = predict_func(request.smiles)
    return {
        "predictions": scores,
        "model_info": {"name": "MultiTaskTox", "version": "0.0.1"},
    }
config/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 42,
3
+ "dataset": {
4
+ "name": "ml-jku/tox21"
5
+ },
6
+ "features": {
7
+ "type": "ecfp",
8
+ "radius": 2,
9
+ "n_bits": 1024,
10
+ "use_counts": false,
11
+ "map4_dim": 1024,
12
+ "cache_dir": "./checkpoints/cache"
13
+ },
14
+ "training": {
15
+ "optuna_trials": 40,
16
+ "boosting_rounds": 1500,
17
+ "early_stopping_rounds": 100,
18
+ "lightgbm_params": {
19
+ "objective": "binary",
20
+ "metric": "auc",
21
+ "verbosity": -1
22
+ }
23
+ },
24
+ "multitask": {
25
+ "enabled": true,
26
+ "prediction_source": "oof"
27
+ },
28
+ "output": {
29
+ "checkpoint_dir": "./checkpoints"
30
+ }
31
+ }
docs/proposed_lightgbm_framework.md ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LightGBM-Based Multitask Workflow for Tox21
2
+
3
+ This document proposes a stepwise plan to replace the current GIN baseline (`train.py`, `predict.py`, `src/`) with a Gradient Boosting pipeline that remains compatible with the leaderboard I/O contract. Each phase can be validated independently before moving to the next, ensuring we have working training and inference artifacts at all times.
4
+
5
+ ---
6
+
7
+ ## 0. Repository Integration Checklist
8
+ - **Entry-points stay the same.** `train.py` must continue to train from `config/config.json` and drop an inference-ready artifact into `checkpoints/`. `predict.py` must keep the `predict(smiles_list)` signature and return the nested `{smiles: {target: score}}` mapping.
9
+ - **New modules.** Introduce `src/features.py` (fingerprints & caching), `src/lightgbm_trainer.py` (shared utilities for training/evaluation), and `src/stage_two.py` (cross-task augmentation logic). Keep `src/preprocess.py` for SMILES standardization + RDKit `Mol` construction so inference stays aligned with training.
10
+ - **Dependencies.** Add `lightgbm`, `optuna`, `rdkit-pypi`, and optionally the `map4` package (or the MAP4 reference implementation) to `requirements.txt`. Verify any native dependencies are supported by the Spaces environment.
11
+ - **Artifacts.** Store per-task boosters as `checkpoints/stage1_{task}.txt` and `checkpoints/stage2_{task}.txt` (LightGBM text dumps). Derived predictions (e.g., stage-1 OOF matrices) should live under `checkpoints/cache/` or `/tmp` during training, but inference must rely only on checkpoint files generated by `train.py`.
12
+
13
+ ---
14
+
15
+ ## 1. Phase 1 — Baseline LightGBM with Optuna
16
+
17
+ ### 1.1 Data handling
18
+ 1. Load the Hugging Face dataset inside `train.py` exactly as today (`load_dataset("ml-jku/tox21", token=TOKEN)`).
19
+ 2. Keep the same per-split segmentation (train/validation/test) to remain comparable with the GIN baseline.
20
+ 3. Convert SMILES strings to RDKit `Mol` objects using the existing cleaners in `src/preprocess.py`. For the baseline, we can featurize molecules with a minimal descriptor set (e.g., RDKit physicochemical descriptors) while fingerprints are being implemented.
21
+
22
+ ### 1.2 Baseline features
23
+ Use easily-computed descriptors such as:
24
+ - Molecular weight, logP, TPSA, number of H-bond donors/acceptors, rotatable bonds, aromatic proportion, etc.
25
+ - Concatenate one-hot encodings for atom count bins (C, N, O, halogens).
26
+ This gives a quick tabular vector per SMILES while fingerprint work is in progress.
27
+
28
+ ### 1.3 Training objective
29
+ - **Task granularity:** Train one LightGBM binary classifier per Tox21 task (12 total). Targets remain the provided binary toxicity labels.
30
+ - **Metric:** ROC-AUC per task, with macro-average for reporting (mirrors leaderboard metric).
31
+ - **Data split:** For each task, drop rows with missing labels and perform K-fold CV (e.g., 5 folds) inside Optuna to make best use of labeled data.
32
+
33
+ ### 1.4 Optuna search space
34
+ Within `src/lightgbm_trainer.py`, expose an `objective(trial, task_name)` that:
35
+ 1. Samples:
36
+ - `learning_rate ∈ [1e-3, 0.2]` (log scale)
37
+ - `num_leaves ∈ [16, 256]`
38
+ - `max_depth ∈ [-1, 12]`
39
+ - `min_data_in_leaf ∈ [10, 200]`
40
+ - `feature_fraction ∈ [0.5, 1.0]`
41
+ - `bagging_fraction ∈ [0.5, 1.0]` with `bagging_freq ∈ [1, 10]`
42
+ - `lambda_l1`, `lambda_l2` (10^-8 to 10^1)
43
+ 2. Trains the LightGBM model on each CV split and averages ROC-AUC.
44
+ 3. Returns the negative mean ROC-AUC so Optuna can minimize the objective.
45
+
46
+ Persist the best hyperparameters per task into the config (or a JSON artifact) so `predict.py` can instantiate the booster with exact values. When data volume is small, Optuna’s `Study` can share the same random seed for reproducibility (`src/seed.py` can be reused).
47
+
48
+ ### 1.5 Deliverables for Phase 1
49
+ - Updated `train.py` calling into `src/lightgbm_trainer.train_single_task(task_name, features, labels, config)`.
50
+ - `checkpoints/stage1_{task}.txt` boosters (even though they are “stage 1”, they form the baseline deliverable).
51
+ - Validation report (per-task ROC-AUC) saved to `checkpoints/metrics_stage1.json`.
52
+ - `predict.py` loads each per-task LightGBM model, computes baseline descriptors on-the-fly, and returns predictions.
53
+
54
+ ---
55
+
56
+ ## 2. Phase 2 — Fingerprint-Based Representations
57
+
58
+ ### 2.1 Feature computation
59
+ Implement `src/features.py` with methods:
60
+ - `compute_ecfp(mol, radius=2, n_bits=1024)` using `GetMorganFingerprintAsBitVect`.
61
+ - `compute_map4(mol)` via MAP4 codebase (counts hashed patterns). Because MAP4 is computationally heavier, cache features to disk (e.g., `cache/fingerprints_{split}.npz`).
62
+ - `fingerprint_pipeline(smiles_list, fingerprint_type)` that accepts sanitized SMILES, constructs `Mol` objects, and returns a dense `np.ndarray`.
63
+
64
+ ### 2.2 Integration
65
+ - Update `train.py` to choose the fingerprint type from config (e.g., `config["features"]["type"] = "ecfp"`).
66
+ - Align `predict.py` to call the same fingerprint builder on incoming SMILES.
67
+ - Maintain metadata describing fingerprint dimensionality and type in a manifest (e.g., `checkpoints/features.json`) so inference knows how to parse the stored LightGBM feature order.
68
+
69
+ ### 2.3 Training flow
70
+ Apart from the enriched features, Phase 2 reuses the Phase 1 training loop. If resource constraints exist, we can:
71
+ - Run Optuna once on a representative task (e.g., NR-AhR) and reuse its best hyperparameters for all tasks; or
72
+ - Run Optuna briefly per task (e.g., 30 trials) and share results.
73
+
74
+ ### 2.4 Deliverables
75
+ - Fingerprint cache builders + unit tests (small set of SMILES).
76
+ - Configurable training/inference that toggles between baseline descriptors and fingerprint vectors.
77
+ - Updated metrics comparing descriptors vs. ECFP vs. MAP4.
78
+
79
+ ---
80
+
81
+ ## 3. Phase 3 — Cross-Task Label Augmentation
82
+
83
+ ### 3.1 Motivation
84
+ By incorporating predictions from other tasks, we expose LightGBM to shared toxicity patterns without building a fully joint model. This is especially valuable for underrepresented tasks where correlated labels provide additional signal.
85
+
86
+ ### 3.2 Feature construction
87
+ Given `T = 12` tasks and fingerprint dimension `D`, the augmented features for task `k` are:
88
+ ```
89
+ X_k = [fingerprint_vector (D dims), ŷ_1, …, ŷ_{k-1}, ŷ_{k+1}, …, ŷ_T]
90
+ ```
91
+ where `ŷ_t` are the stage-1 predictions for task `t` on the same molecule. Use floats instead of hard labels to preserve uncertainty.
92
+
93
+ ### 3.3 Implementation details
94
+ 1. **Collect stage-1 predictions.**
95
+ - After Phase 2 training, run inference with each stage-1 model on every molecule in train/val/test splits.
96
+ - Store the `N × T` prediction matrix in `checkpoints/stage1_predictions_{split}.npz`.
97
+ 2. **Align missing data.**
98
+ - If task `t` lacks a label for a molecule, mask it during stage-1 training but still compute predictions for other tasks so the feature matrix stays dense.
99
+ 3. **Data leakage prevention.**
100
+ - During training, use out-of-fold predictions (OOF) for the stage-1 features so models do not see their own ground-truth labels through the augmented vector.
101
+ - Implementation: For each fold, train stage-1 LightGBM on K-1 folds, predict on the held-out fold, and concatenate predictions.
102
+ 4. **Config surface.**
103
+ - `config["multitask"]["use_stage1_predictions"] = true/false`
104
+ - `config["multitask"]["prediction_source"] = "oof" | "full_train"` to switch between strict OOF features and simpler (but leakier) full-train predictions for debugging.
105
+
106
+ ### 3.4 Training
107
+ Once augmented features are ready, rerun the single-task LightGBM training per target (`stage2`). Hyperparameter search can be narrower because fingerprints already provide a strong baseline; focus on `num_leaves`, `feature_fraction`, and regularization strength.
108
+
109
+ ### 3.5 Deliverables
110
+ - Scripts that generate OOF prediction matrices.
111
+ - Updated `train.py` orchestration:
112
+ 1. Train Stage 1 models.
113
+ 2. Materialize cross-task prediction cache.
114
+ 3. Train Stage 2 models from augmented features.
115
+ - Metrics comparing Stage 1 vs. Stage 2 per task.
116
+
117
+ ---
118
+
119
+ ## 4. Phase 4 — Two-Stage Training & Inference
120
+
121
+ ### 4.1 Training orchestration
122
+ Pseudo-flow for `train.py`:
123
+
124
+ ```python
125
+ def train(config):
126
+ ds = load_dataset(...)
127
+ mols = preprocess.standardize(ds["train"]["smiles"])
128
+ fp_cache = features.fingerprint_pipeline(mols, config["features"])
129
+
130
+ stage1 = StageOneTrainer(config)
131
+ stage1.train_all_tasks(fp_cache, labels, splits)
132
+ stage1.save_models("checkpoints/stage1_*.txt")
133
+
134
+ pred_cache = stage1.generate_predictions(fp_cache, splits, use_oof=True)
135
+
136
+ stage2 = StageTwoTrainer(config)
137
+ stage2.train_all_tasks(fp_cache, pred_cache, labels)
138
+ stage2.save_models("checkpoints/stage2_*.txt")
139
+
140
+ dump_metrics(stage1.metrics, stage2.metrics)
141
+ ```
142
+
143
+ ### 4.2 Inference pipeline (`predict.py`)
144
+ 1. **Fingerprint computation:** identical to training (deterministic sanitization).
145
+ 2. **Stage-1 pass:** Load every `stage1_{task}.txt`, predict on the incoming SMILES batch, and collect predictions.
146
+ 3. **Stage-2 pass:** For each task `k`, build `[fingerprint, predicted_labels_except_k]` on-the-fly and evaluate the corresponding stage-2 booster.
147
+ 4. **Output:** Return the stage-2 predictions for leaderboard submission. Optionally include stage-1 scores in the response if needed for debugging (but the official output should stick to stage-2 values).
148
+
149
+ ### 4.3 Failure modes & mitigations
150
+ - **Unrecognized SMILES:** fall back to zeros or 0.5 predictions like the current baseline but log warnings so we can monitor failure rates.
151
+ - **Missing checkpoint:** raise an informative exception instructing users to rerun `train.py`.
152
+ - **Performance drift:** store SHA or timestamp metadata with checkpoints to trace which training configuration produced a given model.
153
+
154
+ ---
155
+
156
+ ## 5. Configuration & Experiment Tracking
157
+ Proposed structure for `config/config.json`:
158
+
159
+ ```json
160
+ {
161
+ "seed": 42,
162
+ "features": {
163
+ "type": "ecfp",
164
+ "radius": 2,
165
+ "n_bits": 1024,
166
+ "use_counts": false
167
+ },
168
+ "training": {
169
+ "n_folds": 5,
170
+ "n_optuna_trials": 50,
171
+ "lightgbm_params": {
172
+ "objective": "binary",
173
+ "metric": "auc",
174
+ "verbosity": -1
175
+ }
176
+ },
177
+ "multitask": {
178
+ "enabled": true,
179
+ "use_stage1_predictions": true,
180
+ "prediction_source": "oof"
181
+ }
182
+ }
183
+ ```
184
+
185
+ Track experiment results in `checkpoints/experiments.csv` with columns `[timestamp, fingerprint, stage, task, auc, params_hash]`.
186
+
187
+ ---
188
+
189
+ ## 6. Testing & Validation
190
+ - **Unit tests:** Ensure fingerprint builders reproduce known vectors (compare with RDKit reference) and that cross-task feature assembly drops the correct task column.
191
+ - **Integration tests:** Small toy dataset (3 tasks, <50 samples) to run the full Stage1→Stage2 pipeline quickly. Assert shapes of caches and that inference matches training predictions.
192
+ - **Performance tracking:** Plot per-task ROC-AUC improvements by phase to confirm each enhancement adds value.
193
+
194
+ ---
195
+
196
+ ## 7. Suggested Implementation Milestones
197
+ 1. **M1:** Skeleton LightGBM trainer + Optuna integration (Phase 1). ✓
198
+ 2. **M2:** Fingerprint computation module with caching + updated training/inference (Phase 2).
199
+ 3. **M3:** Stage-1 prediction cache + feature augmentation (Phase 3).
200
+ 4. **M4:** End-to-end Stage1→Stage2 orchestration, packaging of checkpoints, and inference updates (Phase 4).
201
+ 5. **M5:** Documentation + automated tests to guard against regressions.
202
+
203
+ This phased roadmap keeps the leaderboard interface intact while progressively increasing the modeling capacity from simple descriptors to multitask-enhanced fingerprints.
predict.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+ from typing import Dict, List
7
+
8
+ import joblib
9
+ import numpy as np
10
+
11
+ from src.constants import TARGET_NAMES
12
+ from src.features import FingerprintFeaturizer
13
+ from src.seed import set_seed
14
+
15
+ BASE_PREDICTION = 0.5
16
+
17
+
18
@lru_cache(maxsize=1)
def _load_manifest() -> Dict:
    """Read and memoize the training manifest written by train.py."""
    path = Path("./checkpoints/training_manifest.json")
    if not path.exists():
        raise FileNotFoundError("Missing checkpoints/training_manifest.json. Run train.py first.")
    return json.loads(path.read_text(encoding="utf-8"))
26
+
27
+
28
@lru_cache(maxsize=2)
def _load_stage_models(stage: str):
    """Load every available booster for `stage` ("stage1"/"stage2"), keyed by target.

    Returns an empty dict when the manifest carries no model_dir for the stage;
    targets without a checkpoint file on disk are simply omitted.
    """
    manifest = _load_manifest()
    model_dir = manifest.get(stage, {}).get("model_dir")
    if not model_dir:
        return {}
    base = Path(model_dir)
    loaded = {}
    for target in manifest.get("target_names", TARGET_NAMES):
        candidate = base / f"{target}.pkl"
        if candidate.exists():
            loaded[target] = joblib.load(candidate)
    return loaded
42
+
43
+
44
def _compute_stage1_predictions(features: np.ndarray, target_names: List[str]) -> np.ndarray:
    """Run every stage-1 booster over the valid-molecule feature matrix.

    Columns whose model is missing keep the neutral BASE_PREDICTION score.
    Returns an (n_valid, n_targets) float32 matrix; empty input yields zeros
    with shape (0, n_targets).
    """
    models = _load_stage_models("stage1")
    n_rows, n_tasks = features.shape[0], len(target_names)
    if n_rows == 0:
        return np.zeros((0, n_tasks), dtype=np.float32)

    scores = np.full((n_rows, n_tasks), BASE_PREDICTION, dtype=np.float32)
    for col, name in enumerate(target_names):
        model = models.get(name)
        if model is None:
            continue
        # Honour early stopping when the fitted model recorded a best iteration.
        best = getattr(model, "best_iteration_", None)
        extra = {} if best is None else {"num_iteration": best}
        scores[:, col] = model.predict_proba(features, **extra)[:, 1]
    return scores
60
+
61
+
62
def _compute_stage2_predictions(
    base_features: np.ndarray,
    stage1_preds: np.ndarray,
    target_names: List[str],
) -> np.ndarray:
    """Refine stage-1 scores with stage-2 boosters trained on augmented features.

    For task k the stage-2 input is [fingerprints | stage-1 scores of every
    other task]. Tasks lacking a stage-2 model fall back to their stage-1
    column; when no stage-2 models exist at all, the stage-1 matrix is
    returned unchanged.
    """
    models = _load_stage_models("stage2")
    if not models:
        return stage1_preds

    n_rows = base_features.shape[0]
    scores = np.full((n_rows, len(target_names)), BASE_PREDICTION, dtype=np.float32)
    for col, name in enumerate(target_names):
        model = models.get(name)
        if model is None:
            scores[:, col] = stage1_preds[:, col]
            continue
        # Drop this task's own stage-1 column to avoid feeding it to itself.
        cross_task = np.delete(stage1_preds, col, axis=1)
        augmented = np.hstack([base_features, cross_task])
        best = getattr(model, "best_iteration_", None)
        extra = {} if best is None else {"num_iteration": best}
        scores[:, col] = model.predict_proba(augmented, **extra)[:, 1]
    return scores
90
+
91
+
92
def predict(smiles_list: List[str]) -> Dict[str, Dict[str, float]]:
    """
    Predict toxicity targets for a list of SMILES strings.

    Invalid SMILES (those that fail standardization) receive the neutral
    BASE_PREDICTION for every target instead of being dropped, so the output
    always contains one entry per input string.

    Args:
        smiles_list (list[str]): SMILES strings

    Returns:
        dict: {smiles: {target_name: prediction_prob}}
    """
    # Fixed seed so inference is repeatable — presumably seeds numpy/random;
    # confirm against src/seed.py.
    set_seed(0)
    manifest = _load_manifest()
    target_names = manifest.get("target_names", TARGET_NAMES)
    feature_config = manifest.get("feature_config", {"type": "ecfp"})

    # Must use the same fingerprint configuration as training (from manifest).
    featurizer = FingerprintFeaturizer(feature_config)
    batch, features = featurizer.featurize_smiles(smiles_list)

    # Prediction matrices contain one row per *valid* molecule only.
    stage1_preds = _compute_stage1_predictions(features, target_names)
    stage2_preds = _compute_stage2_predictions(features, stage1_preds, target_names)

    predictions: Dict[str, Dict[str, float]] = {}
    # Walks the rows of the valid-only matrices in step with batch.mask.
    valid_idx = 0

    for original_smiles, is_valid in zip(smiles_list, batch.mask):
        if not is_valid:
            # Unparseable SMILES: fall back to the neutral score for all tasks.
            predictions[original_smiles] = {target: BASE_PREDICTION for target in target_names}
            continue

        row_preds = stage2_preds[valid_idx] if stage2_preds.size else np.full(len(target_names), BASE_PREDICTION)
        predictions[original_smiles] = {target: float(score) for target, score in zip(target_names, row_preds)}
        valid_idx += 1

    return predictions
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ numpy==1.26.2
4
+ python-dotenv
5
+ pandas==2.2.2
6
+ scikit-learn==1.7.1
7
+ pydantic
8
+ rdkit-pypi
9
+ datasets
10
+ lightgbm
11
+ optuna
12
+ joblib
13
+ map4
src/constants.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Canonical ordering of the 12 Tox21 assay targets. This order defines the
# column layout of every prediction matrix and the filenames of the per-task
# model checkpoints, so it must stay stable across training and inference.
# NOTE(review): the /metadata endpoint in app.py lists the same targets in a
# different order (NR-AR first); that listing appears informational only —
# confirm no caller relies on the two orders matching.
TARGET_NAMES = [
    "NR-AhR",
    "NR-AR",
    "NR-AR-LBD",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
]

# Name of the DataFrame column that stores standardized (canonical) SMILES.
CANONICAL_SMILES_COLUMN = "canonical_smiles"
src/features.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Sequence
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from rdkit import DataStructs
9
+ from rdkit.Chem import AllChem
10
+
11
+ from .constants import CANONICAL_SMILES_COLUMN
12
+ from .preprocess import MoleculeBatch, filter_dataframe_by_mask, standardize_smiles
13
+
14
+ try:
15
+ from map4 import MAP4Calculator # type: ignore
16
+ except Exception: # pragma: no cover - optional dependency
17
+ MAP4Calculator = None
18
+
19
+
20
class FingerprintFeaturizer:
    """Compute molecular fingerprints with optional caching.

    Configured via the `features` section of config.json: fingerprint `type`
    ("ecfp" or "map4"), ECFP `radius`/`n_bits`/`use_counts`, MAP4 `map4_dim`,
    and an optional on-disk `cache_dir` for per-split feature caches.
    """

    def __init__(self, feature_config: Dict):
        self.config = feature_config
        self.fingerprint_type = feature_config.get("type", "ecfp").lower()
        self.radius = feature_config.get("radius", 2)
        self.n_bits = feature_config.get("n_bits", 1024)
        self.map4_dim = feature_config.get("map4_dim", 1024)
        self.use_counts = feature_config.get("use_counts", False)
        cache_dir = feature_config.get("cache_dir")
        self.cache_dir = Path(cache_dir) if cache_dir else None
        if self.cache_dir:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def featurize_dataframe(self, df: pd.DataFrame, split_name: str):
        """Featurize one split DataFrame (expects a "smiles" column).

        Returns (clean_df, features): rows whose SMILES failed standardization
        are dropped, and `features` is row-aligned with `clean_df`. Served
        from the per-split cache when present.
        NOTE(review): the cache key is only split name + fingerprint type, so
        stale caches must be deleted manually after changing radius/n_bits.
        """
        cache_payload = self._load_cache(split_name)
        if cache_payload is not None:
            mask = cache_payload["mask"]
            canonical_smiles = cache_payload["canonical_smiles"].tolist()
            features = cache_payload["features"]
            clean_df = filter_dataframe_by_mask(df, mask, canonical_smiles)
            return clean_df, features

        batch = standardize_smiles(df["smiles"].tolist())
        clean_df = filter_dataframe_by_mask(df, batch.mask, batch.canonical_smiles)
        features = self._compute_fingerprints(batch.mols)

        self._write_cache(split_name, batch.mask, batch.canonical_smiles, features)
        return clean_df, features

    def featurize_smiles(self, smiles: Sequence[str]) -> tuple[MoleculeBatch, np.ndarray]:
        """Featurize raw SMILES (inference path; never cached)."""
        batch = standardize_smiles(smiles)
        features = self._compute_fingerprints(batch.mols)
        return batch, features

    def _cache_path(self, split_name: str) -> Path | None:
        # None disables caching entirely (no cache_dir configured).
        if self.cache_dir is None:
            return None
        return self.cache_dir / f"{split_name}_{self.fingerprint_type}.npz"

    def _load_cache(self, split_name: str):
        """Return the NpzFile for this split, or None when absent/disabled."""
        cache_path = self._cache_path(split_name)
        if cache_path is None or not cache_path.exists():
            return None
        # allow_pickle is needed for the object-dtype canonical_smiles array.
        return np.load(cache_path, allow_pickle=True)

    def _write_cache(self, split_name: str, mask, canonical_smiles, features):
        """Persist mask/canonical SMILES/features for a split; no-op if disabled."""
        cache_path = self._cache_path(split_name)
        if cache_path is None:
            return
        np.savez(
            cache_path,
            mask=mask,
            canonical_smiles=np.array(canonical_smiles, dtype=object),
            features=features,
        )

    def _compute_fingerprints(self, mols):
        """Dispatch to the configured fingerprint implementation."""
        if not mols:
            # Keep the column count consistent even for an empty batch.
            dim = self._fingerprint_dimension()
            return np.zeros((0, dim), dtype=np.float32)

        if self.fingerprint_type == "ecfp":
            return self._compute_ecfp(mols)
        if self.fingerprint_type == "map4":
            return self._compute_map4(mols)
        raise ValueError(f"Unsupported fingerprint type: {self.fingerprint_type}")

    def _fingerprint_dimension(self) -> int:
        """Output width of the configured fingerprint."""
        if self.fingerprint_type == "map4":
            return self.map4_dim
        return self.n_bits

    def _compute_ecfp(self, mols):
        """Morgan/ECFP fingerprints as a dense (n_mols, n_bits) float32 array.

        NOTE(review): GetMorganFingerprint / GetMorganFingerprintAsBitVect are
        deprecated in recent RDKit in favour of rdFingerprintGenerator — works
        today but emits warnings; consider migrating.
        """
        fingerprints = np.zeros((len(mols), self.n_bits), dtype=np.float32)
        for idx, mol in enumerate(mols):
            if self.use_counts:
                # Unfolded count fingerprint, folded into n_bits by modulo so
                # counts from colliding hashes accumulate.
                fp = AllChem.GetMorganFingerprint(mol, self.radius)
                arr = np.zeros(self.n_bits, dtype=np.float32)
                for bit, value in fp.GetNonzeroElements().items():
                    arr[bit % self.n_bits] += value
            else:
                bitvect = AllChem.GetMorganFingerprintAsBitVect(
                    mol,
                    self.radius,
                    nBits=self.n_bits,
                )
                arr = np.zeros(self.n_bits, dtype=np.float32)
                DataStructs.ConvertToNumpyArray(bitvect, arr)
            fingerprints[idx] = arr
        return fingerprints

    def _compute_map4(self, mols):
        """MAP4 fingerprints; requires the optional `map4` package."""
        if MAP4Calculator is None:
            raise ImportError(
                "MAP4 fingerprint requested but the `map4` package is not installed. "
                "Install it via `pip install map4` or switch features.type to 'ecfp'."
            )

        calc = MAP4Calculator(dimensions=self.map4_dim)
        fingerprints = np.zeros((len(mols), self.map4_dim), dtype=np.float32)
        for idx, mol in enumerate(mols):
            vec = np.array(calc.calculate(mol), dtype=np.float32)
            fingerprints[idx] = vec
        return fingerprints
src/lightgbm_trainer.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Dict, Optional, Sequence
7
+
8
+ import joblib
9
+ import lightgbm as lgb
10
+ import numpy as np
11
+ import optuna
12
+ import pandas as pd
13
+ from sklearn.metrics import roc_auc_score
14
+
15
+ from .constants import TARGET_NAMES
16
+
17
+
18
@dataclass
class TaskTrainingOutput:
    """Result bundle from training one per-task LightGBM classifier."""

    # Fitted classifier (refit with the best Optuna hyperparameters).
    model: lgb.LGBMClassifier
    # ROC-AUC of `model` on the validation split.
    val_auc: float
    # Boosting iteration selected by early stopping.
    best_iteration: int
    # Full hyperparameter dict used to fit `model`.
    best_params: Dict
25
+
26
def _sample_hyperparams(trial: optuna.Trial, base_params: Dict) -> Dict:
    """Merge Optuna-sampled LightGBM hyperparameters onto `base_params`.

    `base_params` is never mutated. Sampled values override caller-supplied
    keys of the same name; the fixed defaults at the bottom are applied only
    when the caller has not set them.
    """
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }
    merged = {**base_params, **sampled}
    for key, default in (
        ("objective", "binary"),
        ("metric", "auc"),
        ("verbosity", -1),
        ("boosting_type", "gbdt"),
        ("n_jobs", -1),
    ):
        merged.setdefault(key, default)
    return merged
47
+
48
+
49
def train_lightgbm_task(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    base_params: Dict,
    boosting_rounds: int,
    early_stopping_rounds: int,
    n_trials: int,
    seed: int,
) -> Optional[TaskTrainingOutput]:
    """Optuna-tune and fit a single-task LightGBM binary classifier.

    Runs `n_trials` Optuna trials maximizing validation ROC-AUC, then refits
    one model with the winning hyperparameters.

    Args:
        X_train / y_train: training features and binary labels.
        X_val / y_val: validation features and labels (used for early stopping
            and as the Optuna objective).
        base_params: fixed LightGBM params merged into every trial.
        boosting_rounds: maximum n_estimators per model.
        early_stopping_rounds: patience for the early-stopping callback.
        n_trials: number of Optuna trials.
        seed: random_state passed to every LGBMClassifier.

    Returns:
        TaskTrainingOutput, or None when either split is single-class (AUC is
        undefined in that case).
    """
    if len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2:
        return None

    def _fit_model(params: Dict) -> lgb.LGBMClassifier:
        # Shared fit routine for trial models and the final refit.
        # BUGFIX: `verbose` was removed from the sklearn `fit()` API in
        # LightGBM >= 4.0, so passing `verbose=False` raises TypeError with an
        # unpinned lightgbm. Logging is already silenced via the params'
        # `verbosity: -1` and the early-stopping callback's `verbose=False`.
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            callbacks=[
                lgb.early_stopping(
                    early_stopping_rounds,
                    first_metric_only=True,
                    verbose=False,
                )
            ],
        )
        return model

    def objective(trial: optuna.Trial) -> float:
        params = _sample_hyperparams(trial, base_params)
        params["n_estimators"] = boosting_rounds
        params["random_state"] = seed
        model = _fit_model(params)
        best_iter = getattr(model, "best_iteration_", boosting_rounds)
        preds = model.predict_proba(X_val, num_iteration=best_iter)[:, 1]
        return float(roc_auc_score(y_val, preds))

    # NOTE: no sampler seed is set here, so the search itself is not
    # reproducible across runs even though each fitted model is seeded.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # FrozenTrial supports the suggest_* API and replays the stored values,
    # so this reconstructs the winning hyperparameter set exactly.
    best_params = _sample_hyperparams(study.best_trial, base_params)
    best_params["n_estimators"] = boosting_rounds
    best_params["random_state"] = seed

    final_model = _fit_model(best_params)

    best_iteration = getattr(final_model, "best_iteration_", boosting_rounds)
    val_preds = final_model.predict_proba(X_val, num_iteration=best_iteration)[:, 1]
    val_auc = roc_auc_score(y_val, val_preds)

    return TaskTrainingOutput(
        model=final_model,
        val_auc=float(val_auc),
        best_iteration=int(best_iteration),
        best_params=best_params,
    )
119
+
120
+
121
def save_stage_metrics(metrics: Dict, path: Path):
    """Serialize a metrics mapping to pretty-printed JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
125
+
126
+
127
def train_stage_one_models(
    train_features: np.ndarray,
    val_features: Optional[np.ndarray],
    train_df: pd.DataFrame,
    val_df: Optional[pd.DataFrame],
    config: Dict,
    checkpoint_dir: Path,
    target_names: Sequence[str] = TARGET_NAMES,
) -> Dict:
    """Train one stage-1 LightGBM model per Tox21 target and persist artifacts.

    For each target: masks out rows with missing labels, tunes/fits a model
    via `train_lightgbm_task`, saves it to `checkpoint_dir/stage1/{task}.pkl`,
    and fills one column of the cross-task prediction matrices. Tasks are
    skipped (with a reason recorded in metrics) when the validation split is
    absent, labeled data is insufficient, or labels are single-class.

    Returns:
        dict with "train_full"/"val_full" (n_samples x n_tasks prediction
        matrices, 0.5 where a task was skipped; val_full is None without a
        validation split) and "metrics" (per-task status / val_auc).
    """
    stage_dir = checkpoint_dir / "stage1"
    stage_dir.mkdir(parents=True, exist_ok=True)

    # Training knobs, with the same defaults as config/config.json.
    training_cfg = config.get("training", {})
    base_params = training_cfg.get("lightgbm_params", {})
    n_trials = training_cfg.get("optuna_trials", 40)
    boosting_rounds = training_cfg.get("boosting_rounds", 1500)
    early_stopping = training_cfg.get("early_stopping_rounds", 100)
    seed = config.get("seed", 42)

    n_train = len(train_df)
    n_tasks = len(target_names)

    # Prediction matrices start at the neutral 0.5 so skipped tasks still
    # contribute a well-defined column to stage-2 features.
    train_preds = np.full((n_train, n_tasks), 0.5, dtype=np.float32)
    val_preds = (
        np.full((len(val_df), n_tasks), 0.5, dtype=np.float32)
        if val_df is not None and val_features is not None
        else None
    )

    metrics: Dict[str, Dict] = {}
    params_dump: Dict[str, Dict] = {}

    for task_idx, task_name in enumerate(target_names):
        # Only rows with a label for this task participate in its training.
        train_mask = train_df[task_name].notna().values
        if val_df is None or val_features is None:
            metrics[task_name] = {"status": "skipped", "reason": "missing validation split"}
            continue

        val_mask = val_df[task_name].notna().values
        if train_mask.sum() < 2 or val_mask.sum() < 2:
            metrics[task_name] = {"status": "skipped", "reason": "insufficient labeled data"}
            continue

        X_train_task = train_features[train_mask]
        y_train_task = train_df.loc[train_mask, task_name].astype(float).values
        X_val_task = val_features[val_mask]
        y_val_task = val_df.loc[val_mask, task_name].astype(float).values

        # AUC is undefined for single-class splits.
        if len(np.unique(y_train_task)) < 2 or len(np.unique(y_val_task)) < 2:
            metrics[task_name] = {"status": "skipped", "reason": "single-class labels"}
            continue

        task_result = train_lightgbm_task(
            X_train_task,
            y_train_task,
            X_val_task,
            y_val_task,
            base_params=base_params,
            boosting_rounds=boosting_rounds,
            early_stopping_rounds=early_stopping,
            n_trials=n_trials,
            seed=seed,
        )

        if task_result is None:
            metrics[task_name] = {"status": "skipped", "reason": "training failed"}
            continue

        model = task_result.model
        best_iter = task_result.best_iteration

        model_path = stage_dir / f"{task_name}.pkl"
        joblib.dump(model, model_path)

        params_dump[task_name] = {
            **task_result.best_params,
            "best_iteration": best_iter,
            "val_auc": task_result.val_auc,
        }

        # NOTE(review): these are in-sample ("full_train") predictions over
        # ALL training rows, including the ones this model was fitted on. If
        # they feed stage-2 training, that is the leakier prediction_source
        # described in the design doc — OOF predictions would avoid leakage.
        full_train_preds = model.predict_proba(
            train_features,
            num_iteration=best_iter,
        )[:, 1]
        train_preds[:, task_idx] = full_train_preds.astype(np.float32)

        if val_preds is not None:
            full_val_preds = model.predict_proba(
                val_features,
                num_iteration=best_iter,
            )[:, 1]
            val_preds[:, task_idx] = full_val_preds.astype(np.float32)

        metrics[task_name] = {
            "val_auc": task_result.val_auc,
            "n_train_samples": int(train_mask.sum()),
            "n_val_samples": int(val_mask.sum()),
        }

    # Persist per-task metrics and the winning hyperparameters next to the models.
    save_stage_metrics(metrics, checkpoint_dir / "metrics_stage1.json")
    params_path = checkpoint_dir / "stage1_params.json"
    with params_path.open("w", encoding="utf-8") as f:
        json.dump(params_dump, f, indent=2)

    return {
        "train_full": train_preds,
        "val_full": val_preds,
        "metrics": metrics,
    }
src/model.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch_geometric.nn import GINConv, global_add_pool, global_mean_pool
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+
7
+
8
class GIN(torch.nn.Module):
    """Graph Isomorphism Network for graph-level classification.

    Stacks `num_layers` GINConv blocks (each a 2-layer MLP with BatchNorm),
    pools node embeddings into a graph embedding by sum or mean, and applies
    a linear classifier. `forward` returns raw logits (no sigmoid/softmax),
    so callers are expected to pair it with a logits-based loss.
    """

    def __init__(self, num_features, num_classes, dropout, hidden_dim=128, num_layers=5, add_or_mean="add"):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        # Pooling mode: "add" (sum) or "mean".
        self.add_or_mean = add_or_mean
        # Dropout rate applied after every conv layer (a separate fixed 0.5
        # dropout is applied after pooling — see forward).
        self.dropout = dropout

        self.conv_layers = nn.ModuleList()

        # input features → hidden_dim
        mlp = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim)
        )
        self.conv_layers.append(GINConv(mlp, train_eps=True))

        # hidden GIN layers
        for _ in range(num_layers - 1):
            mlp = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim)
            )
            self.conv_layers.append(GINConv(mlp, train_eps=True))

        # Final classifier (after pooling)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """Compute graph-level logits.

        Args:
            x: node feature matrix (presumably [num_nodes, num_features] —
                confirm against the dataset builder).
            edge_index: PyG edge index tensor.
            batch: PyG batch-assignment vector mapping nodes to graphs.
        """
        for conv in self.conv_layers:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        # Pool to get graph-level representation
        # NOTE(review): an unrecognized add_or_mean value silently skips
        # pooling and would return per-node outputs — consider validating it.
        if self.add_or_mean == "mean":
            x = global_mean_pool(x, batch)
        elif self.add_or_mean == "add":
            x = global_add_pool(x, batch)

        # Fixed 0.5 dropout on the pooled embedding, independent of self.dropout.
        x = F.dropout(x, p=0.5, training=self.training)
        return self.fc(x)
src/preprocess.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Sequence
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+ from rdkit import Chem
10
+ from rdkit.Chem.MolStandardize import rdMolStandardize
11
+
12
+ from .constants import CANONICAL_SMILES_COLUMN
13
+
14
@dataclass
class MoleculeBatch:
    """Result of standardizing a batch of SMILES strings."""
    # RDKit molecules that survived standardization (aligned with canonical_smiles).
    mols: List[Chem.Mol]
    # Boolean array over the *input* sequence; True where standardization succeeded.
    mask: np.ndarray
    # Canonical SMILES, one per molecule in ``mols``.
    canonical_smiles: List[str]
19
+
20
+
21
def load_tox21_dataset(token: str | None, dataset_name: str) -> Dict[str, pd.DataFrame]:
    """Load every split of a Hugging Face dataset as a pandas DataFrame.

    Args:
        token: Hugging Face access token, or ``None`` for public datasets.
        dataset_name: Dataset identifier on the Hub (e.g. ``"ml-jku/tox21"``).

    Returns:
        Mapping of split name (``"train"``, ``"validation"``, ...) to DataFrame.
    """
    dataset = load_dataset(dataset_name, token=token)
    return {split_name: split.to_pandas() for split_name, split in dataset.items()}
28
+
29
+
30
def standardize_smiles(smiles: Sequence[str]) -> MoleculeBatch:
    """Standardize SMILES strings and return RDKit molecules with canonical SMILES.

    Each input is parsed, cleaned up, canonicalized to a preferred tautomer,
    converted to canonical SMILES, and re-parsed from that SMILES. Inputs that
    fail any step (or raise inside RDKit) are dropped; ``mask`` marks which
    input positions survived.
    """
    enumerator = rdMolStandardize.TautomerEnumerator()
    params = rdMolStandardize.CleanupParameters()

    kept_mols: List[Chem.Mol] = []
    kept_smiles: List[str] = []
    ok = np.zeros(len(smiles), dtype=bool)

    def _standardize_one(raw):
        """Return (mol, canonical_smiles), or None if the input is unusable."""
        mol = Chem.MolFromSmiles(raw)
        if mol is None:
            return None
        mol = rdMolStandardize.Cleanup(mol, params)
        mol = enumerator.Canonicalize(mol)
        canonical = Chem.MolToSmiles(mol)
        reparsed = Chem.MolFromSmiles(canonical)
        if reparsed is None:
            return None
        return reparsed, canonical

    for position, raw in enumerate(smiles):
        try:
            result = _standardize_one(raw)
        except Exception:
            # Best-effort cleanup: RDKit can raise on exotic inputs; skip them.
            continue
        if result is None:
            continue
        mol, canonical = result
        kept_mols.append(mol)
        kept_smiles.append(canonical)
        ok[position] = True

    return MoleculeBatch(mols=kept_mols, mask=ok, canonical_smiles=kept_smiles)
59
+
60
+
61
def filter_dataframe_by_mask(df: pd.DataFrame, mask: np.ndarray, canonical_smiles: Sequence[str]) -> pd.DataFrame:
    """Keep only the rows where ``mask`` is True and attach canonical SMILES.

    Returns a copy with a fresh 0..n-1 index and an extra
    ``CANONICAL_SMILES_COLUMN`` column aligned with the surviving rows.
    """
    kept = df.loc[mask].copy()
    kept = kept.reset_index(drop=True)
    kept[CANONICAL_SMILES_COLUMN] = list(canonical_smiles)
    return kept
src/seed.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+
6
+
7
def set_seed(seed: int = 42):
    """Make Python's and NumPy's global RNG state reproducible.

    Exports PYTHONHASHSEED for any child processes (NOTE(review): it cannot
    change the *running* interpreter's hash randomization, which is fixed at
    startup) and seeds the ``random`` and ``numpy.random`` generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
src/stage_two.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Optional, Sequence
5
+
6
+ import joblib
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.metrics import roc_auc_score
10
+
11
+ from .constants import TARGET_NAMES
12
+ from .lightgbm_trainer import save_stage_metrics, train_lightgbm_task
13
+
14
+
15
+ def _build_augmented_matrix(base_features: np.ndarray, prediction_matrix: np.ndarray, target_idx: int) -> np.ndarray:
16
+ mask = np.ones(prediction_matrix.shape[1], dtype=bool)
17
+ mask[target_idx] = False
18
+ return np.concatenate([base_features, prediction_matrix[:, mask]], axis=1)
19
+
20
+
21
def train_stage_two_models(
    train_features: np.ndarray,
    val_features: Optional[np.ndarray],
    train_df: pd.DataFrame,
    val_df: Optional[pd.DataFrame],
    config: Dict,
    checkpoint_dir: Path,
    stage1_train_preds: np.ndarray,
    stage1_val_preds: Optional[np.ndarray],
    target_names: Sequence[str] = TARGET_NAMES,
) -> Dict:
    """Train one stage-2 LightGBM model per Tox21 task.

    Each task's feature matrix is the base fingerprint features augmented with
    the stage-1 predictions of all *other* tasks. Models are saved under
    ``checkpoint_dir / "stage2"``; metrics go to ``metrics_stage2.json``.

    Args:
        train_features / val_features: Base feature matrices per split.
        train_df / val_df: DataFrames holding the per-task label columns.
        config: Full config dict; reads ``training`` and ``seed`` keys.
        checkpoint_dir: Root directory for checkpoints and metrics.
        stage1_train_preds / stage1_val_preds: [n_samples, n_tasks] stage-1
            prediction matrices used as augmentation features.
        target_names: Task (label column) names to iterate over.

    Returns:
        ``{"metrics": {task_name: {...}}}`` with per-task AUC/best-iteration
        entries, or a skip status with a reason.
    """
    training_cfg = config.get("training", {})
    base_params = training_cfg.get("lightgbm_params", {})
    n_trials = training_cfg.get("optuna_trials", 40)
    boosting_rounds = training_cfg.get("boosting_rounds", 1500)
    early_stopping = training_cfg.get("early_stopping_rounds", 100)
    seed = config.get("seed", 42)

    stage_dir = checkpoint_dir / "stage2"
    stage_dir.mkdir(parents=True, exist_ok=True)

    metrics: Dict[str, Dict] = {}

    for task_idx, task_name in enumerate(target_names):
        # Only train on rows that actually carry a label for this task.
        mask = train_df[task_name].notna().values
        if mask.sum() == 0:
            metrics[task_name] = {"status": "skipped", "reason": "no labels"}
            continue

        augmented_train_matrix = _build_augmented_matrix(
            train_features[mask],
            stage1_train_preds[mask],
            task_idx,
        )
        y_train = train_df.loc[mask, task_name].astype(float).values

        # Validation data is required for early stopping / AUC; skip otherwise.
        if (
            val_features is None
            or val_df is None
            or stage1_val_preds is None
            or val_df[task_name].notna().sum() < 2
        ):
            metrics[task_name] = {"status": "skipped", "reason": "missing validation data"}
            continue

        val_mask = val_df[task_name].notna().values
        augmented_val_matrix = _build_augmented_matrix(
            val_features[val_mask],
            stage1_val_preds[val_mask],
            task_idx,
        )
        y_val = val_df.loc[val_mask, task_name].astype(float).values

        # AUC is undefined with a single class on either split.
        if len(np.unique(y_val)) < 2 or len(np.unique(y_train)) < 2:
            metrics[task_name] = {"status": "skipped", "reason": "single-class labels"}
            continue

        task_result = train_lightgbm_task(
            augmented_train_matrix,
            y_train,
            augmented_val_matrix,
            y_val,
            base_params=base_params,
            boosting_rounds=boosting_rounds,
            early_stopping_rounds=early_stopping,
            n_trials=n_trials,
            seed=seed,
        )

        if task_result is None:
            metrics[task_name] = {"status": "skipped", "reason": "training failed"}
            continue

        model_path = stage_dir / f"{task_name}.pkl"
        joblib.dump(task_result.model, model_path)

        metrics[task_name] = {
            "val_auc": task_result.val_auc,
            "best_iteration": int(task_result.best_iteration),
        }

    save_stage_metrics(metrics, checkpoint_dir / "metrics_stage2.json")
    return {"metrics": metrics}
src/train_evaluate.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from sklearn.metrics import roc_auc_score
5
+
6
def masked_bce_loss(logits, labels, mask):
    """Binary cross-entropy averaged over only the labeled entries.

    Args:
        logits: [batch_size, num_classes] raw model outputs.
        labels: [batch_size, num_classes] 0/1 targets (filler values where unlabeled).
        mask:   [batch_size, num_classes] True where the label is valid.

    Returns:
        Mean BCE over valid entries. A batch with *no* valid labels returns
        0.0 instead of the original 0/0 NaN, which would have poisoned the
        backward pass and all subsequent parameter updates.
    """
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    per_entry = criterion(logits, labels)
    valid = mask.float()
    # Clamp the denominator so an all-masked batch yields 0 loss, not NaN.
    return (per_entry * valid).sum() / valid.sum().clamp_min(1.0)
16
+
17
def train_model(model, loader, optimizer, device):
    """Run one optimization epoch and return the dataset-averaged loss."""
    model.train()
    running = 0.0
    for graphs in loader:
        graphs = graphs.to(device)

        optimizer.zero_grad()
        logits = model(graphs.x, graphs.edge_index, graphs.batch)  # [num_graphs, num_classes]
        batch_loss = masked_bce_loss(logits, graphs.y, graphs.mask)
        batch_loss.backward()
        optimizer.step()

        # Weight by the number of graphs so the epoch average is per-graph.
        running += batch_loss.item() * graphs.num_graphs
    return running / len(loader.dataset)
32
+
33
+
34
@torch.no_grad()
def evaluate(model, loader, device):
    """Compute the dataset-averaged masked BCE loss without gradient tracking."""
    model.eval()
    running = 0.0
    for graphs in loader:
        graphs = graphs.to(device)
        logits = model(graphs.x, graphs.edge_index, graphs.batch)
        running += masked_bce_loss(logits, graphs.y, graphs.mask).item() * graphs.num_graphs
    return running / len(loader.dataset)
44
+
45
+
46
@torch.no_grad()
def compute_roc_auc(model, loader, device):
    """Mean ROC-AUC across labels, ignoring unlabeled entries.

    Labels with no valid entries, or with only one class present (where AUC
    is undefined), are left out of the average. Returns NaN if no label is
    scorable.
    """
    model.eval()
    trues, probs, masks = [], [], []

    for graphs in loader:
        graphs = graphs.to(device)
        logits = model(graphs.x, graphs.edge_index, graphs.batch)

        # Sigmoid turns logits into per-label probabilities.
        probs.append(torch.sigmoid(logits).cpu())
        trues.append(graphs.y.cpu())
        masks.append(graphs.mask.cpu())

    y_true = torch.cat(trues, dim=0).numpy()
    y_prob = torch.cat(probs, dim=0).numpy()
    valid = torch.cat(masks, dim=0).numpy()

    scores = []
    for col in range(y_true.shape[1]):
        keep = valid[:, col].astype(bool)
        if keep.sum() == 0:
            continue
        try:
            scores.append(roc_auc_score(y_true[keep, col], y_prob[keep, col]))
        except ValueError:
            # Only one class present in this column: AUC undefined, skip it.
            continue

    return np.mean(scores) if len(scores) > 0 else float("nan")
77
+
78
@torch.no_grad()
def compute_roc_auc_avg_and_per_class(model, loader, device):
    """Compute per-label ROC-AUC plus the mean over labels.

    Returns:
        (auc_array, mean_auc): ``auc_array`` is a float32 array with one
        entry per label — NaN where the label has no valid entries or only
        one class is present; ``mean_auc`` is ``np.nanmean`` over that array.
    """
    model.eval()
    y_true, y_pred, y_mask = [], [], []

    # Fix: the original wrapped this loop in a second `with torch.no_grad():`
    # even though the decorator already disables gradient tracking for the
    # whole function; the redundant context is removed.
    for batch in loader:
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.batch)

        # Store predictions (sigmoid → probabilities)
        y_pred.append(torch.sigmoid(out).cpu())
        y_true.append(batch.y.cpu())
        y_mask.append(batch.mask.cpu())

    # Concatenate across all batches
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    y_mask = torch.cat(y_mask, dim=0).numpy()

    # Compute AUC per class; NaN marks undefined columns.
    auc_list = []
    for i in range(y_true.shape[1]):
        mask_i = y_mask[:, i].astype(bool)
        if mask_i.sum() > 0:
            try:
                auc = roc_auc_score(y_true[mask_i, i], y_pred[mask_i, i])
            except ValueError:
                auc = np.nan  # only one class present
        else:
            auc = np.nan
        auc_list.append(auc)

    auc_array = np.array(auc_list, dtype=np.float32)
    mean_auc = np.nanmean(auc_array)  # overall mean ignoring NaNs

    # Return both per-class and mean
    return auc_array, mean_auc
train.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Dict
7
+
8
+ import numpy as np
9
+ from dotenv import load_dotenv
10
+
11
+ from src.constants import TARGET_NAMES
12
+ from src.features import FingerprintFeaturizer
13
+ from src.lightgbm_trainer import train_stage_one_models
14
+ from src.preprocess import load_tox21_dataset
15
+ from src.seed import set_seed
16
+ from src.stage_two import train_stage_two_models
17
+
18
+
19
+ def _default_checkpoint_dir(config: Dict) -> Path:
20
+ checkpoint_cfg = config.get("output", {})
21
+ checkpoint_dir = checkpoint_cfg.get("checkpoint_dir", "./checkpoints")
22
+ path = Path(checkpoint_dir)
23
+ path.mkdir(parents=True, exist_ok=True)
24
+ return path
25
+
26
+
27
def train(config: Dict):
    """End-to-end training pipeline.

    Loads the Tox21 dataset, featurizes it, trains stage-1 LightGBM models,
    optionally trains stage-2 multitask-augmented models, and writes a
    manifest describing all produced artifacts.

    Args:
        config: Parsed JSON configuration; reads the optional keys
            ``dataset``, ``features``, ``multitask``, ``output`` and ``seed``.

    Raises:
        ValueError: If the dataset lacks 'train' or 'validation' splits.
    """
    load_dotenv()  # pull TOKEN (HF access token) from .env into the environment
    set_seed(config.get("seed", 42))
    token = os.getenv("TOKEN")

    dataset_cfg = config.get("dataset", {})
    dataset_name = dataset_cfg.get("name", "ml-jku/tox21")
    splits = load_tox21_dataset(token, dataset_name)

    if "train" not in splits or "validation" not in splits:
        raise ValueError("Dataset must provide 'train' and 'validation' splits.")

    # Featurize each split; returns the (possibly filtered) dataframe plus the
    # matching feature matrix.
    featurizer = FingerprintFeaturizer(config.get("features", {}))
    train_df, train_features = featurizer.featurize_dataframe(splits["train"], "train")
    val_df, val_features = featurizer.featurize_dataframe(splits["validation"], "validation")

    checkpoint_dir = _default_checkpoint_dir(config)
    cache_dir = checkpoint_dir / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)

    print("==== Stage 1: Training baseline LightGBM models ====")
    stage1_artifacts = train_stage_one_models(
        train_features,
        val_features,
        train_df,
        val_df,
        config,
        checkpoint_dir,
        target_names=TARGET_NAMES,
    )

    # Per-sample stage-1 prediction matrices for all tasks; stage 2 reuses
    # them as extra input features.
    stage1_train_full = stage1_artifacts["train_full"]
    stage1_val_full = stage1_artifacts["val_full"]

    # Cache stage-1 predictions so later stages/analysis can run without
    # retraining stage 1.
    np.savez(
        cache_dir / "stage1_train_predictions.npz",
        full=stage1_train_full,
        target_names=np.array(TARGET_NAMES, dtype=object),
    )
    if stage1_val_full is not None:
        np.savez(
            cache_dir / "stage1_validation_predictions.npz",
            full=stage1_val_full,
            target_names=np.array(TARGET_NAMES, dtype=object),
        )

    # Stage 2 is opt-in via config["multitask"]["enabled"].
    stage2_metrics = None
    multitask_cfg = config.get("multitask", {"enabled": False})
    if multitask_cfg.get("enabled", False):
        print("==== Stage 2: Training multitask-augmented LightGBM models ====")
        stage2_artifacts = train_stage_two_models(
            train_features,
            val_features,
            train_df,
            val_df,
            config,
            checkpoint_dir,
            stage1_train_full,
            stage1_val_full,
            target_names=TARGET_NAMES,
        )
        stage2_metrics = stage2_artifacts["metrics"]

    # Stage-2 artifact paths are recorded only when stage 2 actually ran.
    stage2_entry = {
        "enabled": bool(multitask_cfg.get("enabled", False)),
        "model_dir": str(checkpoint_dir / "stage2") if stage2_metrics is not None else None,
        "metrics": str(checkpoint_dir / "metrics_stage2.json") if stage2_metrics is not None else None,
    }

    # The manifest ties configuration and artifact locations together so
    # downstream inference can reconstruct the exact training setup.
    manifest = {
        "feature_config": config.get("features", {}),
        "target_names": TARGET_NAMES,
        "dataset": dataset_cfg,
        "stage1": {
            "model_dir": str(checkpoint_dir / "stage1"),
            "metrics": str((checkpoint_dir / "metrics_stage1.json")),
        },
        "stage2": stage2_entry,
        "multitask": multitask_cfg,
        "seed": config.get("seed", 42),
    }

    manifest_path = checkpoint_dir / "training_manifest.json"
    with manifest_path.open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    print("Training complete.")
114
+
115
+
116
if __name__ == "__main__":
    # Script entry point: load the JSON config and kick off training.
    config_path = Path("./config/config.json")
    with config_path.open("r", encoding="utf-8") as config_file:
        loaded_config = json.load(config_file)
    train(loaded_config)