iflp1908sl committed
Commit 005e4d3 · Parent(s): 777c843

Add source code (clean)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +9 -0
  2. MolecularDiffusion/__init__.py +26 -0
  3. MolecularDiffusion/_version.py +21 -0
  4. MolecularDiffusion/callbacks/__init__.py +13 -0
  5. MolecularDiffusion/callbacks/train_helper.py +259 -0
  6. MolecularDiffusion/cli/__init__.py +6 -0
  7. MolecularDiffusion/cli/_hydra.py +129 -0
  8. MolecularDiffusion/cli/analyze.py +380 -0
  9. MolecularDiffusion/cli/eval_predict.py +259 -0
  10. MolecularDiffusion/cli/generate.py +282 -0
  11. MolecularDiffusion/cli/main.py +197 -0
  12. MolecularDiffusion/cli/predict.py +395 -0
  13. MolecularDiffusion/cli/train.py +453 -0
  14. MolecularDiffusion/configs/data/filter_molecules_by_property.py +0 -0
  15. MolecularDiffusion/configs/data/formed_data.yaml +20 -0
  16. MolecularDiffusion/configs/data/mol_dataset.yaml +25 -0
  17. MolecularDiffusion/configs/data/mol_dataset_extraf.yaml +23 -0
  18. MolecularDiffusion/configs/engine/lightning.yaml +33 -0
  19. MolecularDiffusion/configs/engine/original.yaml +4 -0
  20. MolecularDiffusion/configs/hydra/default.yaml +19 -0
  21. MolecularDiffusion/configs/interference/gen_cfg.yaml +15 -0
  22. MolecularDiffusion/configs/interference/gen_cfggg.yaml +29 -0
  23. MolecularDiffusion/configs/interference/gen_conditional.yaml +12 -0
  24. MolecularDiffusion/configs/interference/gen_gg.yaml +29 -0
  25. MolecularDiffusion/configs/interference/gen_hybrid.yaml +28 -0
  26. MolecularDiffusion/configs/interference/gen_inpaint.yaml +69 -0
  27. MolecularDiffusion/configs/interference/gen_outpaint.yaml +31 -0
  28. MolecularDiffusion/configs/interference/gen_outpaintft.yaml +18 -0
  29. MolecularDiffusion/configs/interference/gen_unconditional.yaml +11 -0
  30. MolecularDiffusion/configs/interference/prediction.yaml +2 -0
  31. MolecularDiffusion/configs/logger/default.yaml +9 -0
  32. MolecularDiffusion/configs/logger/wandb.yaml +9 -0
  33. MolecularDiffusion/configs/models/tabasco_transformer.yaml +72 -0
  34. MolecularDiffusion/configs/tasks/diffusion.yaml +48 -0
  35. MolecularDiffusion/configs/tasks/diffusion_egt.yaml +54 -0
  36. MolecularDiffusion/configs/tasks/diffusion_extraf.yaml +47 -0
  37. MolecularDiffusion/configs/tasks/diffusion_hybrid.yaml +95 -0
  38. MolecularDiffusion/configs/tasks/diffusion_hybrid_egcl.yaml +53 -0
  39. MolecularDiffusion/configs/tasks/diffusion_integer.yaml +62 -0
  40. MolecularDiffusion/configs/tasks/diffusion_pretrained.yaml +47 -0
  41. MolecularDiffusion/configs/tasks/diffusion_pyg.yaml +82 -0
  42. MolecularDiffusion/configs/tasks/diffusion_pyg_egcl.yaml +55 -0
  43. MolecularDiffusion/configs/tasks/diffusion_pyg_egt.yaml +56 -0
  44. MolecularDiffusion/configs/tasks/diffusion_tabasco.yaml +66 -0
  45. MolecularDiffusion/configs/tasks/guidance.yaml +40 -0
  46. MolecularDiffusion/configs/tasks/guidance_esen.yaml +43 -0
  47. MolecularDiffusion/configs/tasks/guidance_pc.yaml +43 -0
  48. MolecularDiffusion/configs/tasks/ldm_dit.yaml +24 -0
  49. MolecularDiffusion/configs/tasks/regression.yaml +30 -0
  50. MolecularDiffusion/configs/tasks/regression_esen.yaml +34 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .DS_Store
+ .env
+ .venv
+ env/
+ venv/
MolecularDiffusion/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ MolecularDiffusion - A molecular diffusion framework.
+
+ This package provides tools and models for molecular diffusion processes.
+ """
+
+ __version__ = "0.1.0"
+ __author__ = "Thanapat Worakul"
+ __email__ = "thanapat.worakul@epfl.ch"
+
+ # Import main modules to make them available at package level
+ from . import core
+ from . import data
+ from . import modules
+ from . import utils
+ from . import callbacks
+ from . import runmodes
+
+ __all__ = [
+     "core",
+     "data",
+     "modules",
+     "utils",
+     "callbacks",
+     "runmodes"
+ ]
MolecularDiffusion/_version.py ADDED
@@ -0,0 +1,21 @@
+ # file generated by setuptools-scm
+ # don't change, don't track in version control
+
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
+ TYPE_CHECKING = False
+ if TYPE_CHECKING:
+     from typing import Tuple
+     from typing import Union
+
+     VERSION_TUPLE = Tuple[Union[int, str], ...]
+ else:
+     VERSION_TUPLE = object
+
+ version: str
+ __version__: str
+ __version_tuple__: VERSION_TUPLE
+ version_tuple: VERSION_TUPLE
+
+ __version__ = version = '0.1.dev26+gff3c644.d20250809'
+ __version_tuple__ = version_tuple = (0, 1, 'dev26', 'gff3c644.d20250809')
MolecularDiffusion/callbacks/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from .train_helper import (
+     Queue,
+     gradient_clipping,
+     EMA,
+     SP_regularizer
+ )
+
+ __all__ = [
+     "Queue",
+     "gradient_clipping",
+     "EMA",
+     "SP_regularizer"
+ ]
MolecularDiffusion/callbacks/train_helper.py ADDED
@@ -0,0 +1,259 @@
+ import numpy as np
+ import torch
+ import logging
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.CRITICAL)
+
+
+ class Queue:
+     def __init__(self, max_len=50):
+         self.items = []
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.items)
+
+     def add(self, item):
+         self.items.insert(0, item)
+         if len(self) > self.max_len:
+             self.items.pop()
+
+     def mean(self):
+         return np.mean(self.items)
+
+     def std(self):
+         return np.std(self.items)
+
+
+ class gradient_clipping:
+     def __init__(self, m=1, max_len=200):
+         self.max_grad_norm = None
+         self.max_grad_norms = []
+         self.max_len = max_len
+         self.m = m
+         self.FACTOR = 100
+
+     def __call__(self, model, gradnorm_queue):
+         self.max_grad_norm = 1.5 * gradnorm_queue.mean() + 2 * gradnorm_queue.std()
+         if len(self.max_grad_norms) == 0:
+             self.max_grad_norms.append(self.max_grad_norm)
+         else:
+             # max_grad_norm_mean = torch.mean(torch.tensor(self.max_grad_norms))
+             previous_max_grad_norm = self.max_grad_norms[-1]
+             # if the current max_grad_norm exceeds the previous one, cap it
+             if self.max_grad_norm > previous_max_grad_norm:
+                 self.max_grad_norm = previous_max_grad_norm * self.m
+             if self.max_grad_norm > previous_max_grad_norm * 1e5:
+                 self.max_grad_norm = previous_max_grad_norm * self.m / self.FACTOR
+
+             self.max_grad_norms.append(self.max_grad_norm)
+
+         if len(self.max_grad_norms) > self.max_len:
+             self.max_grad_norms.pop(0)
+
+         # Clip gradients and return the norm
+         grad_norm = torch.nn.utils.clip_grad_norm_(
+             model.parameters(), max_norm=self.max_grad_norm, norm_type=2.0
+         )
+         if float(grad_norm) > self.max_grad_norm:
+             gradnorm_queue.add(float(self.max_grad_norm))
+         else:
+             gradnorm_queue.add(float(grad_norm))
+
+         if float(grad_norm) > self.max_grad_norm:
+             logger.info(
+                 f"Clipped gradient with value {grad_norm:.1f} "
+                 f"while allowed {self.max_grad_norm:.1f}"
+             )
+         return grad_norm
+
+
+ class gradient_clipping_0:
+     def __init__(self, m=1, max_len=200):
+         self.max_grad_norm = None
+         self.max_grad_norms = []
+         self.max_len = max_len
+         self.m = m
+
+     def __call__(self, model, gradnorm_queue):
+         self.max_grad_norm = 1.5 * gradnorm_queue.mean() + 2 * gradnorm_queue.std()
+         if len(self.max_grad_norms) == 0:
+             self.max_grad_norms.append(self.max_grad_norm)
+         else:
+             max_grad_norm_mean = torch.mean(torch.tensor(self.max_grad_norms))
+             # if the current max_grad_norm exceeds the mean of the previous ones, cap it
+             if self.max_grad_norm > max_grad_norm_mean:
+                 self.max_grad_norm = max_grad_norm_mean * self.m
+             if self.max_grad_norm > max_grad_norm_mean * 1e5:
+                 self.max_grad_norm = max_grad_norm_mean * self.m / 10
+             self.max_grad_norms.append(self.max_grad_norm)
+
+         if len(self.max_grad_norms) > self.max_len:
+             self.max_grad_norms.pop(0)
+
+         # Clip gradients and return the norm
+         grad_norm = torch.nn.utils.clip_grad_norm_(
+             model.parameters(), max_norm=self.max_grad_norm, norm_type=2.0
+         )
+         if float(grad_norm) > self.max_grad_norm:
+             gradnorm_queue.add(float(self.max_grad_norm))
+         else:
+             gradnorm_queue.add(float(grad_norm))
+
+         if float(grad_norm) > self.max_grad_norm:
+             print(
+                 f"Clipped gradient with value {grad_norm:.1f} "
+                 f"while allowed {self.max_grad_norm:.1f}"
+             )
+         return grad_norm
+
+
+ class EMA:
+     def __init__(self, beta):
+         super().__init__()
+         self.beta = beta
+
+     def update_model_average(self, ma_model, current_model):
+         for current_params, ma_params in zip(
+             current_model.parameters(), ma_model.parameters()
+         ):
+             old_weight, up_weight = ma_params.data, current_params.data
+             ma_params.data = self.update_average(old_weight, up_weight)
+
+     def update_average(self, old, new):
+         if old is None:
+             return new
+         return old * self.beta + (1 - self.beta) * new
+
+
+ class SP_regularizer:
+     def __init__(
+         self,
+         regularizer: str,
+         lambda_: float = 10,
+         lambda_2: float = 100,
+         lambda_update_value: float = 50,
+         lambda_update_step: int = 2500,
+         polynomial_p: float = 1.5,
+         warm_up_steps: int = 100,
+     ):
+         """
+         Self-paced regularizer for curriculum learning.
+
+         Args:
+             regularizer (str): Regularizer to use. Options are:
+                 - hard
+                 - linear
+                 - logaritmic
+                 - logistic
+             lambda_ (float): Initial lambda value
+             lambda_2 (float): Initial lambda value for the second regularizer
+             lambda_update_value (float): Value by which lambda is increased
+             lambda_update_step (int): Number of steps between lambda updates
+             polynomial_p (float): Value of p for the polynomial regularizer
+             warm_up_steps (int): Number of warm-up steps before the regularizer is applied
+         """
+         self.regularizer = regularizer
+         self.lambda_ = lambda_
+         self.lambda_2 = lambda_2
+         self.n_calls = 1
+         self.lambda_update_value = lambda_update_value
+         self.lambda_update_step = lambda_update_step
+         self.p = polynomial_p
+         self.warm_up_steps = warm_up_steps
+
+     def __call__(self, losses: torch.Tensor):
+         # TODO: during warm-up steps, keep the loss information to determine lambda
+         if self.n_calls < self.warm_up_steps:
+             self.n_calls += 1
+             return losses
+         else:
+             if self.regularizer == "hard":
+                 weighted_loss = self.hard(losses)
+             elif self.regularizer == "linear":
+                 weighted_loss = self.linear(losses)
+             elif self.regularizer == "logaritmic":
+                 weighted_loss = self.logaritmic(losses)
+             elif self.regularizer == "logistic":
+                 weighted_loss = self.logistic(losses)
+             elif self.regularizer == "polynomial":
+                 weighted_loss = self.polynomial(losses)
+             elif self.regularizer == "hard_relax":
+                 weighted_loss = self.hard_relax(losses)
+             else:
+                 raise ValueError("Regularizer not implemented")
+             self.n_calls += 1
+             self.update_lambda()
+             return weighted_loss
+
+     def update_lambda(self):
+         if self.n_calls % self.lambda_update_step == 0:
+             self.lambda_ += self.lambda_update_value
+             self.lambda_2 += self.lambda_update_value
+         elif self.n_calls == 0:
+             self.lambda_ = self.lambda_
+             self.lambda_2 = self.lambda_2
+
+     def hard(self, losses: torch.Tensor):
+         weights = (losses <= self.lambda_).float()
+         sp_loss = losses * weights
+         return sp_loss
+
+     def hard_relax(self, losses: torch.Tensor):
+         weights = torch.where(
+             losses < self.lambda_,
+             torch.ones_like(losses),
+             (1 - losses / self.lambda_2) ** (1 / (self.p - 1)),
+         )
+         idces_zero = torch.where(losses > self.lambda_2)
+         weights[idces_zero] = 0
+         weights = torch.clamp(weights, 0, 1)
+         sp_loss = losses * weights
+         return sp_loss
+
+     def linear(self, losses: torch.Tensor):
+         weights = torch.where(
+             losses > self.lambda_, torch.zeros_like(losses), 1 - losses / self.lambda_
+         )
+         weights = torch.clamp(weights, 0, 1)
+         sp_loss = losses * weights
+         return sp_loss
+
+     def logaritmic(self, losses: torch.Tensor):
+         weights = torch.where(
+             losses > self.lambda_,
+             torch.zeros_like(losses),
+             torch.log(2 - losses / self.lambda_),
+         )
+         weights = torch.clamp(weights, 0, 1)
+         sp_loss = losses * weights
+         return sp_loss
+
+     def logistic(self, losses: torch.Tensor):
+         weights = torch.where(
+             losses > self.lambda_,
+             torch.zeros_like(losses),
+             (1 - torch.exp(torch.tensor(self.lambda_)))
+             / (1 - torch.exp(losses - self.lambda_)),
+         )
+         weights = torch.clamp(weights, 0, 1)
+         sp_loss = losses * weights
+         return sp_loss
+
+     def polynomial(self, losses: torch.Tensor):
+         weights = torch.where(
+             losses > self.lambda_,
+             torch.zeros_like(losses),
+             (1 - losses / self.lambda_) ** (1 / (self.p - 1)),
+         )
+         weights = torch.clamp(weights, 0, 1)
+         sp_loss = losses * weights
+         return sp_loss
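
For reference (not part of this commit): a minimal sketch of how these training helpers could be wired into a standard PyTorch loop. Here `model`, `optimizer`, and `loader` are hypothetical placeholders, and the gradient-norm queue is seeded once so that `Queue.mean()` and `Queue.std()` are defined on the first step.

    # Illustrative sketch only -- not part of the commit. Assumes a standard
    # PyTorch training loop; `model`, `optimizer`, `loader` are hypothetical.
    import copy
    from MolecularDiffusion.callbacks import Queue, gradient_clipping, EMA

    gradnorm_queue = Queue(max_len=50)
    gradnorm_queue.add(3.0)             # seed so mean()/std() are defined
    clip_fn = gradient_clipping(m=1)

    ema = EMA(beta=0.999)
    ema_model = copy.deepcopy(model)    # shadow copy accumulating the average

    for batch in loader:
        optimizer.zero_grad()
        loss = model(batch).mean()
        loss.backward()
        clip_fn(model, gradnorm_queue)  # adaptive clip from queue statistics
        optimizer.step()
        ema.update_model_average(ema_model, model)
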
MolecularDiffusion/cli/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # CLI module for MolecularDiffusion.
+ """Unified command-line interface for the MolecularDiffusion package."""
+
+ from MolecularDiffusion.cli.main import cli
+
+ __all__ = ["cli"]
MolecularDiffusion/cli/_hydra.py ADDED
@@ -0,0 +1,129 @@
+ """Hydra configuration utilities for CLI.
+
+ Provides utilities for discovering and loading bundled configs
+ while allowing user configs to reference them.
+ """
+
+ import os
+ from pathlib import Path
+ from typing import Optional, List
+ from importlib import resources
+
+
+ def get_package_config_path() -> Path:
+     """Get the absolute path to the bundled config directory.
+
+     Returns:
+         Path to the configs directory within the installed package.
+     """
+     # Use importlib.resources for Python 3.9+
+     try:
+         pkg_files = resources.files("MolecularDiffusion")
+         config_path = pkg_files / "configs"
+         # Convert to a real path (handles both installed and editable installs)
+         if hasattr(config_path, '_path'):
+             # Traversable from importlib.resources
+             real_path = Path(str(config_path))
+         else:
+             real_path = Path(config_path)
+         if real_path.is_dir():
+             return real_path
+     except Exception:
+         pass
+
+     # Fallback: relative to this module
+     module_dir = Path(__file__).parent.parent
+     config_path = module_dir / "configs"
+     if config_path.is_dir():
+         return config_path
+
+     raise FileNotFoundError(
+         "Could not find bundled configs. Ensure the package is installed correctly."
+     )
+
+
+ def setup_hydra_config(
+     config_name: str,
+     config_dir: Optional[str] = None,
+     overrides: Optional[List[str]] = None,
+ ):
+     """Set up the Hydra configuration with proper search paths.
+
+     Configures Hydra to search:
+     1. The user's config_dir (if provided) or the current directory
+     2. The package's bundled configs (via searchpath)
+
+     Args:
+         config_name: Name of the config file (without .yaml extension)
+         config_dir: Optional user config directory
+         overrides: Optional list of Hydra override strings
+
+     Returns:
+         DictConfig from Hydra
+     """
+     from hydra import compose, initialize_config_dir
+     from hydra.core.global_hydra import GlobalHydra
+
+     # Get the package config path for defaults
+     pkg_config_path = get_package_config_path()
+
+     # Determine the primary config directory.
+     # If config_name contains a path (e.g., "configs/train.yaml"), extract the directory.
+     config_name_path = Path(config_name)
+     if config_name_path.parent != Path("."):
+         # The config name includes a directory; use it as config_dir
+         if config_dir is None:
+             config_dir = str(config_name_path.parent)
+         config_name = config_name_path.name
+
+     if config_dir:
+         primary_config_dir = os.path.abspath(config_dir)
+     else:
+         primary_config_dir = os.getcwd()
+
+     # Clear any existing Hydra state
+     GlobalHydra.instance().clear()
+
+     # Initialize with the primary config directory
+     initialize_config_dir(
+         config_dir=primary_config_dir,
+         version_base="1.3",
+     )
+
+     # Build overrides to include a searchpath for the bundled configs
+     all_overrides = overrides or []
+
+     # Add the package config path to the searchpath using the file:// protocol.
+     # This allows Hydra to find bundled defaults like data/mol_dataset.yaml.
+     searchpath_override = f"hydra.searchpath=[file://{pkg_config_path}]"
+     all_overrides = [searchpath_override] + all_overrides
+
+     # Handle the config name (strip .yaml if present)
+     if config_name.endswith(".yaml"):
+         config_name = config_name[:-5]
+
+     # Compose the configuration
+     cfg = compose(config_name=config_name, overrides=all_overrides)
+
+     return cfg
+
+
+ def run_hydra_app(
+     config_name: str,
+     task_function,
+     config_dir: Optional[str] = None,
+     overrides: Optional[List[str]] = None,
+ ):
+     """Run a Hydra-based task function with proper config setup.
+
+     This is the main entry point for CLI commands that use Hydra configs.
+
+     Args:
+         config_name: Name of the config file
+         task_function: Function to call with the composed config
+         config_dir: Optional user config directory
+         overrides: Optional Hydra overrides
+     """
+     cfg = setup_hydra_config(config_name, config_dir, overrides)
+     return task_function(cfg)
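
For reference (not part of this commit): a minimal sketch of composing a user config through this helper, assuming a hypothetical `configs/train.yaml` that defines a `seed` key; the directory part of the path becomes the primary config dir, while bundled defaults are resolved through the injected searchpath.

    # Illustrative sketch only -- not part of the commit.
    from MolecularDiffusion.cli._hydra import setup_hydra_config

    cfg = setup_hydra_config(
        config_name="configs/train.yaml",   # hypothetical user config
        overrides=["seed=42"],
    )
    print(cfg.seed)                         # assumes the config defines `seed`
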
MolecularDiffusion/cli/analyze.py ADDED
@@ -0,0 +1,380 @@
+ """Analyze CLI subcommands for 3D molecule analysis.
+
+ Provides subcommands for:
+ - optimize: XTB geometry optimization
+ - metrics: Validity/connectivity metrics
+ - compare: RMSD, energy, and optional bond analysis
+ - xyz2mol: XYZ to SMILES conversion + fingerprints
+ """
+
+ import os
+
+ import click
+
+ # Enable -h as alias for --help
+ CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
+
+
+ @click.group(context_settings=CONTEXT_SETTINGS)
+ def analyze():
+     """Analyze 3D molecular structures.
+
+     \b
+     Subcommands:
+         optimize   XTB geometry optimization
+         metrics    Validity/connectivity metrics
+         compare    RMSD, energy, and bond analysis
+         xyz2mol    Convert XYZ to SMILES + fingerprints
+     """
+     pass
+
+
+ # ============================================================================
+ # OPTIMIZE: XTB geometry optimization
+ # ============================================================================
+
+ @analyze.command("optimize", context_settings=CONTEXT_SETTINGS)
+ @click.argument("input_dir", type=click.Path(exists=True))
+ @click.option("--output-dir", "-o", "--o", default=None, type=click.Path(),
+               help="Output directory for optimized files (default: input_dir/optimized_xyz)")
+ @click.option("--charge", "-c", "--c", default=0, type=int,
+               help="Molecular charge for xTB (default: 0)")
+ @click.option("--level", "-l", "--l", default="gfn1", type=click.Choice(["gfn1", "gfn2", "gfn-ff", "mmff94"]),
+               help="Optimization level (default: gfn1)")
+ @click.option("--timeout", "-t", "--t", default=240, type=int,
+               help="Timeout per molecule in seconds (default: 240)")
+ @click.option("--scale-factor", "-s", "--s", default=1.3, type=float,
+               help="Scale factor for covalent radii (default: 1.3)")
+ @click.option("--csv", "csv_path", default=None, type=click.Path(),
+               help="CSV file to filter which files to optimize")
+ @click.option("--filter-column", default=None, type=str,
+               help="Column name in CSV to filter by (values must be 1)")
+ def optimize(input_dir, output_dir, charge, level, timeout, scale_factor, csv_path, filter_column):
+     """Optimize XYZ geometries using xTB.
+
+     \b
+     Examples:
+         MolCraftDiff analyze optimize gen_xyz/
+         MolCraftDiff analyze optimize gen_xyz/ --o optimized/ --level gfn2
+     """
+     from MolecularDiffusion.runmodes.analyze.xtb_optimization import get_xtb_optimized_xyz
+
+     output_dir = output_dir or os.path.join(input_dir, "optimized_xyz")
+
+     click.echo(f"Optimizing XYZ files from: {input_dir}")
+     click.echo(f"Output directory: {output_dir}")
+     click.echo(f"xTB level: {level}, charge: {charge}")
+
+     optimized_files = get_xtb_optimized_xyz(
+         input_directory=input_dir,
+         output_directory=output_dir,
+         charge=charge,
+         level=level,
+         timeout=timeout,
+         scale_factor=scale_factor,
+         csv_path=csv_path,
+         filter_column=filter_column,
+     )
+
+     click.echo(f"\nSuccessfully optimized {len(optimized_files)} files.")
+
+
+ # ============================================================================
+ # METRICS: Validity/connectivity metrics
+ # ============================================================================
+
+ @analyze.command("metrics", context_settings=CONTEXT_SETTINGS)
+ @click.argument("input_dir", type=click.Path(exists=True))
+ @click.option("--output", "-o", "--o", "--output-csv", default=None, type=click.Path(),
+               help="Output CSV file for results")
+ @click.option("--metrics", "-m", "--m", "metrics_type", default="all",
+               type=click.Choice(["all", "core", "posebuster", "geom_revised"]),
+               help="Which metrics to compute (default: all)")
+ @click.option("--recheck-topo", is_flag=True, default=False,
+               help="Recheck topology using RDKit")
+ @click.option("--check-strain", is_flag=True, default=False,
+               help="Check strain via XTB optimization")
+ @click.option("--portion", "-p", "--p", default=1.0, type=float,
+               help="Portion of XYZ files to process (default: 1.0 = all)")
+ @click.option("--mol-converter", default="cell2mol",
+               type=click.Choice(["cell2mol", "openbabel"]),
+               help="XYZ to mol converter (default: cell2mol)")
+ @click.option("--skip-atoms", multiple=True, type=int,
+               help="Atom indices to skip in validation")
+ @click.option("--n-subsets", "-n", "--n", default=5, type=int,
+               help="Number of subsets for std calculation (default: 5)")
+ @click.option("--timeout", "-t", "--t", default=10, type=int,
+               help="Timeout per xyz2mol conversion in seconds (default: 10)")
+ def metrics(input_dir, output, metrics_type, recheck_topo, check_strain, portion, mol_converter, skip_atoms, n_subsets, timeout):
+     """Compute validity and connectivity metrics for XYZ files.
+
+     \b
+     Metrics types:
+         all           Run all metrics (core + posebuster + geom_revised)
+         core          Basic validity checks (connectivity, atom stability)
+         posebuster    PoseBusters checks (bond lengths, angles, clashes)
+         geom_revised  Aromatic-aware stability metrics
+
+     \b
+     Examples:
+         MolCraftDiff analyze metrics gen_xyz/
+         MolCraftDiff analyze metrics gen_xyz/ --metrics posebuster
+         MolCraftDiff analyze metrics gen_xyz/ --metrics geom_revised --mol-converter openbabel
+     """
+     import argparse
+     from MolecularDiffusion.runmodes.analyze.compute_metrics import runner
+
+     args = argparse.Namespace(
+         input=input_dir,
+         output=output,
+         metrics=metrics_type,
+         recheck_topo=recheck_topo,
+         check_strain=check_strain,
+         portion=portion,
+         mol_converter=mol_converter,
+         skip_atoms=list(skip_atoms) if skip_atoms else None,
+         n_subsets=n_subsets,
+         timeout=timeout,
+     )
+
+     click.echo(f"Computing {metrics_type} metrics for: {input_dir}")
+     runner(args)
+
+
+ # ============================================================================
+ # COMPARE: Unified RMSD, energy, and bond analysis
+ # ============================================================================
+
+ @analyze.command("compare", context_settings=CONTEXT_SETTINGS)
+ @click.argument("directory", type=click.Path(exists=True))
+ @click.option("--mol-converter", default="openbabel", type=click.Choice(["openbabel", "cell2mol"]),
+               help="Converter for bond perception (default: openbabel)")
+ @click.option("--n-subsets", "-n", "--n", default=5, type=int,
+               help="Number of subsets for std calculation (default: 5)")
+ @click.option("--output", "-o", "--o", "--csv", "csv_path", default=None, type=click.Path(),
+               help="Output CSV filename for results")
+ @click.option("--charge", "-c", "--c", default=0, type=int,
+               help="Molecular charge for xTB energy (default: 0)")
+ @click.option("--level", "-l", "--l", default="gfn2", type=click.Choice(["gfn1", "gfn2", "gfn-ff", "mmff94"]),
+               help="xTB level for energy calculation (default: gfn2)")
+ @click.option("--timeout", "-t", "--t", default=120, type=int,
+               help="Timeout per xTB calculation in seconds (default: 120)")
+ def compare(directory, mol_converter, n_subsets, csv_path, charge, level, timeout):
+     """Compare XYZ files with their optimized counterparts.
+
+     Computes RMSD, xTB energy difference, and bond geometry metrics.
+     Enforces strict connectivity checks.
+
+     Requires an 'optimized_xyz' subdirectory with *_opt.xyz files.
+     """
+     import argparse
+     from MolecularDiffusion.runmodes.analyze.compare_to_optimized import run_compare_analysis
+
+     # Construct an args namespace to pass to run_compare_analysis
+     args = argparse.Namespace(
+         directory=directory,
+         mol_converter=mol_converter,
+         n_subsets=n_subsets,
+         csv_path=csv_path,
+         charge=charge,
+         level=level,
+         timeout=timeout
+     )
+
+     run_compare_analysis(args)
+
+
+ # ============================================================================
+ # XYZ2MOL: Convert XYZ to SMILES + fingerprints
+ # ============================================================================
+
+ @analyze.command("xyz2mol", context_settings=CONTEXT_SETTINGS)
+ @click.argument("xyz_dir", type=click.Path(exists=True))
+ @click.option("--input-csv", "-i", "--i", default=None, type=click.Path(),
+               help="Optional input CSV with xyz file list")
+ @click.option("--label", "-l", "--l", default=None, type=str,
+               help="Label for processed files")
+ @click.option("--timeout", "-t", "--t", default=30, type=int,
+               help="Timeout per conversion in seconds (default: 30)")
+ @click.option("--bits", "-b", "--b", default=2048, type=int,
+               help="Number of bits for Morgan fingerprint (default: 2048)")
+ @click.option("--verbose", "-v", "--v", is_flag=True,
+               help="Enable verbose output")
+ def xyz2mol(xyz_dir, input_csv, label, timeout, bits, verbose):
+     """Convert XYZ files to SMILES and extract fingerprints/scaffolds.
+
+     Outputs are saved to xyz_dir/2d_reprs/:
+     - smiles_processed.csv
+     - fingerprints.npy
+     - scaffolds.txt
+     - substructures.json
+
+     \b
+     Examples:
+         MolCraftDiff analyze xyz2mol gen_xyz/
+         MolCraftDiff analyze xyz2mol gen_xyz/ --bits 1024 -v
+     """
+     from pathlib import Path
+     import pandas as pd
+     import numpy as np
+     import json
+     import logging
+
+     from MolecularDiffusion.runmodes.analyze.xyz2mol import (
+         load_file_list_from_dir, run_processing, extract_scaffold_and_fingerprints
+     )
+
+     if verbose:
+         logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+     xyz_dir = Path(xyz_dir)
+     two_d_reprs_dir = xyz_dir / "2d_reprs"
+     two_d_reprs_dir.mkdir(parents=True, exist_ok=True)
+
+     smiles_csv_output = two_d_reprs_dir / "smiles_processed.csv"
+
+     click.echo(f"Processing XYZ files from: {xyz_dir}")
+     click.echo(f"Output directory: {two_d_reprs_dir}")
+
+     # Load file list
+     if input_csv:
+         df = pd.read_csv(input_csv)
+     else:
+         df = load_file_list_from_dir(str(xyz_dir))
+
+     # Generate SMILES
+     df_smiles = run_processing(df, str(xyz_dir), label, smiles_csv_output, timeout=timeout, verbose=verbose)
+
+     if df_smiles is None or 'smiles' not in df_smiles.columns or df_smiles['smiles'].isnull().all():
+         click.echo("No valid SMILES generated.", err=True)
+         return
+
+     # Extract fingerprints and scaffolds
+     click.echo("\nExtracting fingerprints and scaffolds...")
+     fps, scaffolds, clean_smiles, n_fail, substruct_counts = \
+         extract_scaffold_and_fingerprints(df_smiles["smiles"].dropna().values, fp_bits=bits)
+
+     np.save(two_d_reprs_dir / "fingerprints.npy", fps)
+     with open(two_d_reprs_dir / "scaffolds.txt", "w") as f:
+         f.write("\n".join(scaffolds))
+     with open(two_d_reprs_dir / "smiles_cleaned.txt", "w") as f:
+         f.write("\n".join(clean_smiles))
+     with open(two_d_reprs_dir / "substructures.json", "w") as f:
+         json.dump(substruct_counts, f, indent=2)
+
+     total = len(df_smiles["smiles"].dropna())
+     click.echo(f"\n--- Summary ---")
+     click.echo(f"Total SMILES: {total}")
+     click.echo(f"Failed FP extraction: {n_fail}")
+     click.echo(f"Unique substructures: {len(substruct_counts)}")
+     click.echo(f"Outputs saved to: {two_d_reprs_dir}")
+
+
+ # ============================================================================
+ # XTB-ELECTRONIC: Compute XTB electronic properties
+ # ============================================================================
+
+ @analyze.command("xtb-electronic", context_settings=CONTEXT_SETTINGS)
+ @click.argument("input_dir", type=click.Path(exists=True))
+ @click.option("--output", "--o", "-o", default=None, type=click.Path(),
+               help="Output file path (without extension for 'all' format)")
+ @click.option("--method", "--m", "-m", default="2", type=click.Choice(["1", "2", "ptb"]),
+               help="XTB method: 1=GFN1, 2=GFN2, ptb=PTB (default: 2)")
+ @click.option("--charge", "--c", "-c", default=0, type=int,
+               help="Molecular charge (default: 0)")
+ @click.option("--n-unpaired", "--unpaired", default=0, type=int,
+               help="Number of unpaired electrons (default: 0)")
+ @click.option("--solvent", "--s", "-s", default=None, type=str,
+               help="Solvent for solvation calculations (e.g., 'water', 'thf', 'chcl3')")
+ @click.option("--properties", "--prop", "-p", multiple=True,
+               type=click.Choice(["energy", "dipole", "reactivity", "global",
+                                  "charges", "fukui", "bond_orders", "all"]),
+               help="Property groups to compute (default: energy)")
+ @click.option("--corrected/--no-corrected", default=True,
+               help="Apply empirical IP/EA correction (default: True)")
+ @click.option("--timeout", "--t", "-t", default=120, type=int,
+               help="Timeout per molecule in seconds (default: 120)")
+ @click.option("--n-jobs", "--jobs", "-j", default=1, type=int,
+               help="Number of parallel jobs (default: 1)")
+ @click.option("--format", "--fmt", "-f", "output_format", default="csv",
+               type=click.Choice(["csv", "json", "ase", "all"]),
+               help="Output format: csv, json, ase (.db), or all (default: csv)")
+ def xtb_electronic(input_dir, output, method, charge, n_unpaired,
+                    solvent, properties, corrected, timeout, n_jobs, output_format):
+     """Compute XTB electronic properties for XYZ files.
+
+     Uses morfeus to calculate quantum-chemical descriptors at the GFN-xTB level.
+
+     \b
+     Property groups (molecular-level):
+         energy      Total energy, HOMO, LUMO, gap, Fermi level
+         dipole      Dipole moment and vector
+         reactivity  IP, EA, electronegativity, hardness, softness
+         global      Electrophilicity, nucleophilicity, fugalities
+         solvation   Solvation energy, H-bond correction (requires --solvent)
+
+     \b
+     Property groups (atomic-level):
+         charges      Atomic charges (Mulliken)
+         fukui        Fukui indices (f+, f-, f, dual)
+         bond_orders  Bond orders between atom pairs
+
+     \b
+     Output formats:
+         csv   Molecular-level properties only (one row per molecule)
+         json  Full data including atomic-level properties
+         ase   ASE database with properties in atoms.info/arrays
+         all   Generate all three formats
+
+     \b
+     Examples:
+         MolCraftDiff analyze xtb-electronic gen_xyz/
+         MolCraftDiff analyze xtb-electronic gen_xyz/ -p energy -p reactivity
+         MolCraftDiff analyze xtb-electronic gen_xyz/ -s water -p solvation
+         MolCraftDiff analyze xtb-electronic gen_xyz/ -p all -f ase -o results.db
+     """
+     from MolecularDiffusion.runmodes.analyze.xtb_electronic import batch_xtb_electronic
+
+     # Parse method
+     if method in ["1", "2"]:
+         method = int(method)
+
+     # Default properties
+     if not properties:
+         properties = ["energy"]
+
+     # Default output path
+     if output is None:
+         output = os.path.join(input_dir, "xtb_electronic")
+
+     click.echo(f"Computing XTB electronic properties for: {input_dir}")
+     click.echo(f"Method: GFN{method}-xTB" if method != "ptb" else "Method: PTB")
+     click.echo(f"Charge: {charge}, Unpaired: {n_unpaired}")
+     if solvent:
+         click.echo(f"Solvent: {solvent}")
+     click.echo(f"Properties: {', '.join(properties)}")
+     click.echo(f"Output format: {output_format}")
+
+     df = batch_xtb_electronic(
+         input_dir=input_dir,
+         output_path=output,
+         output_format=output_format,
+         method=method,
+         charge=charge,
+         n_unpaired=n_unpaired,
+         solvent=solvent,
+         properties=list(properties),
+         corrected=corrected,
+         timeout=timeout,
+         n_jobs=n_jobs,
+     )
+
+     n_success = df["success"].sum() if "success" in df.columns else len(df)
+     n_total = len(df)
+
+     click.echo(f"\n--- Summary ---")
+     click.echo(f"Processed: {n_total} molecules")
+     click.echo(f"Successful: {n_success}")
+     click.echo(f"Failed: {n_total - n_success}")
+     click.echo(f"Output saved to: {output}")
MolecularDiffusion/cli/eval_predict.py ADDED
@@ -0,0 +1,259 @@
+ """Eval-Predict command for MolCraft CLI.
+
+ Adapted from scripts/eval_predict.py for package-level execution.
+ """
+
+ import os
+ from typing import Any, Dict, Tuple
+
+ import hydra
+ import numpy as np
+ import pandas as pd
+ import torch
+ from omegaconf import DictConfig, OmegaConf
+ from torch.utils.data import ConcatDataset
+
+ from MolecularDiffusion.core import Engine
+ from MolecularDiffusion.runmodes.train import DataModule, ModelTaskFactory_EGCL, OptimSchedulerFactory
+ from MolecularDiffusion.utils import RankedLogger, seed_everything
+ from MolecularDiffusion.utils.plot_function import (
+     plot_kde_distribution,
+     plot_histogram_distribution,
+     plot_kde_distribution_multiple,
+     plot_correlation_with_histograms,
+ )
+
+ log = RankedLogger(__name__, rank_zero_only=True)
+
+
+ def is_rank_zero():
+     """Check if the current process is rank zero."""
+     if torch.distributed.is_available() and torch.distributed.is_initialized():
+         return torch.distributed.get_rank() == 0
+     return True
+
+
+ def load_checkpoint_weights(task, chkpt_path):
+     """Load weights from a checkpoint, supporting both Engine and Lightning formats."""
+     log.info(f"Loading weights from: {chkpt_path}")
+
+     checkpoint = torch.load(chkpt_path, map_location="cpu", weights_only=False)
+
+     # Check if it's a Lightning checkpoint
+     if "state_dict" in checkpoint:
+         log.info("Detected Lightning checkpoint.")
+         state_dict = checkpoint["state_dict"]
+         cleaned_state_dict = {}
+         for k, v in state_dict.items():
+             if k.startswith("task."):
+                 cleaned_state_dict[k[5:]] = v
+             else:
+                 cleaned_state_dict[k] = v
+
+         load_result = task.load_state_dict(cleaned_state_dict, strict=False)
+         log.info(f"Loaded {len(cleaned_state_dict)} parameters from state_dict")
+         if load_result.missing_keys:
+             log.warning(f"Missing keys: {load_result.missing_keys}")
+
+         # Recover statistics
+         for key in ["mean", "std", "weight"]:
+             val = None
+             if key in checkpoint:
+                 val = checkpoint[key]
+             elif f"task.{key}" in state_dict:
+                 val = state_dict[f"task.{key}"]
+             elif key in state_dict:
+                 val = state_dict[key]
+
+             if val is not None:
+                 if not isinstance(val, torch.Tensor):
+                     val = torch.as_tensor(val, dtype=torch.float32)
+
+                 # Register as a buffer so it moves with the model to the correct device
+                 if key in task._buffers:
+                     task._buffers[key].copy_(val)
+                 else:
+                     task.register_buffer(key, val)
+     elif "model" in checkpoint:
+         log.info("Detected original Engine checkpoint.")
+         task.load_state_dict(checkpoint["model"], strict=False)
+         # Recover statistics
+         for key in ["mean", "std", "weight"]:
+             if key in checkpoint["model"]:
+                 val = checkpoint["model"][key]
+                 if not isinstance(val, torch.Tensor):
+                     val = torch.as_tensor(val, dtype=torch.float32)
+
+                 # Register as a buffer so it moves with the model to the correct device
+                 if key in task._buffers:
+                     task._buffers[key].copy_(val)
+                 else:
+                     task.register_buffer(key, val)
+     else:
+         # Fallback for unexpected formats
+         log.warning("Unknown checkpoint format. Attempting direct load.")
+         task.load_state_dict(checkpoint, strict=False)
+
+     # Ensure the task has a device attribute for initial loading,
+     # but don't hardcode it if it's about to be moved by Engine
+     if not hasattr(task, 'device'):
+         task.device = next(task.parameters()).device if list(task.parameters()) else torch.device('cpu')
+
+
+ def engine_wrapper(task_module, data_module, trainer_module):
+     """Run evaluation with Engine."""
+     trainer_module.get_optimizer()
+     trainer_module.get_scheduler()
+
+     pred_dataset = ConcatDataset([data_module.valid_set, data_module.test_set])
+     solver = Engine(
+         task_module.task,
+         None,
+         None,
+         pred_dataset,
+         batch_size=data_module.batch_size,
+         collate_fn=data_module.collate_fn,
+         logger="logging",
+     )
+     # Ensure task.device is updated to the actual device the solver is using
+     task_module.task.device = solver.device
+
+     _, preds_test, targets_test = solver.evaluate("test")
+     y_preds = torch.cat(preds_test, dim=0)
+     y_trues = torch.cat(targets_test, dim=0)
+     return y_preds, y_trues
+
+
+ def predict(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     """Evaluate predictions on validation/test sets."""
+     if cfg.get("seed"):
+         seed_everything(cfg.seed, workers=True)
+
+     log.info(f"Instantiating datamodule <{cfg.data._target_}>")
+     data_module: DataModule = hydra.utils.instantiate(
+         cfg.data, task_type=cfg.tasks.task_type, train_ratio=0
+     )
+     data_module.load()
+
+     log.info(f"Instantiating task <{cfg.tasks._target_}>")
+     act_fn = hydra.utils.instantiate(cfg.tasks.act_fn)
+
+     # Store the checkpoint path and temporarily disable it for task_module.build()
+     # to avoid the factory's internal (legacy) loading.
+     chkpt_path = cfg.tasks.get("chkpt_path")
+
+     # Create a copy of the config to modify safely
+     tasks_cfg = OmegaConf.to_container(cfg.tasks, resolve=True)
+     tasks_cfg['chkpt_path'] = None
+     tasks_cfg = OmegaConf.create(tasks_cfg)
+
+     task_module: ModelTaskFactory_EGCL = hydra.utils.instantiate(tasks_cfg, act_fn=act_fn)
+     task_module.build()
+
+     # Manually load weights using our robust loader
+     if chkpt_path:
+         load_checkpoint_weights(task_module.task, chkpt_path)
+
+     log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
+     trainer_module: OptimSchedulerFactory = hydra.utils.instantiate(
+         cfg.trainer, parameters=task_module.task.parameters()
+     )
+
+     object_dict = {
+         "cfg": cfg,
+         "datamodule": data_module,
+         "task": task_module,
+         "trainer": trainer_module,
+     }
+
+     log.info("Logging hyperparameters!")
+     log_hyperparameters(object_dict)
+
+     y_preds, y_trues = engine_wrapper(task_module, data_module, trainer_module)
+
+     df = pd.read_csv(cfg.data.filename)
+     task_matrix = df[cfg.tasks.task_learn].to_numpy()
+     filenames = df["filename"].to_numpy()
+     filenames_aligned = []
+
+     for row in y_trues.cpu().numpy():
+         mask = np.all(np.isclose(task_matrix, row, atol=1e-4), axis=1)
+         idx = np.flatnonzero(mask)
+
+         if idx.size == 0:
+             raise ValueError(f"No match for row {row}")
+         if idx.size > 1:
+             raise ValueError(f"Multiple matches for row {row}: {filenames[idx].tolist()}")
+
+         filenames_aligned.append(filenames[idx[0]])
+
+     df_compiled = pd.DataFrame({
+         "filename": filenames_aligned,
+         "y_true": y_trues.cpu().numpy().tolist(),
+         "y_pred": y_preds.cpu().numpy().tolist(),
+     })
+
+     os.makedirs(cfg.output_directory, exist_ok=True)
+     df_compiled.to_csv(f"{cfg.output_directory}/predictions.csv", index=False)
+
+     log.info("Prediction statistics:")
+     for task_name in cfg.tasks.task_learn:
+         log.info(f"--- {task_name} ---")
+         log.info(f"Mean: {df[task_name].mean():.4f}")
+         log.info(f"Std: {df[task_name].std():.4f}")
+         log.info(f"Min: {df[task_name].min():.4f}")
+         log.info(f"Max: {df[task_name].max():.4f}")
+
+     log.info("Plotting distributions...")
+     props = []
+     for i, prop in enumerate(cfg.tasks.task_learn):
+         plot_kde_distribution(df[prop], prop, f"{cfg.output_directory}/{prop}_kde.png")
+         plot_histogram_distribution(df[prop], prop, f"{cfg.output_directory}/{prop}_hist.png")
+         plot_correlation_with_histograms(
+             y_trues[:, i].cpu().numpy(),
+             y_preds[:, i].cpu().numpy(),
+             prop,
+             "",
+             f"{cfg.output_directory}/{prop}_correlation.png",
+         )
+         props.append(df[prop].values)
+
+     props = np.array(props).T
+     plot_kde_distribution_multiple(props, cfg.tasks.task_learn, f"{cfg.output_directory}/kde_all.png")
+
+
+ def log_hyperparameters(object_dict: dict):
+     """Log hyperparameters for debugging."""
+     if not is_rank_zero():
+         return
+
+     log.info("\n========== Logging Hyperparameters ==========\n")
+     for name, obj in object_dict.items():
+         log.info(f"{'=' * 20} {name.upper()} {'=' * 20}")
+         if name == "cfg":
+             if isinstance(obj, dict):
+                 log.info("\n" + OmegaConf.to_yaml(OmegaConf.create(obj)))
+             else:
+                 log.info("\n" + OmegaConf.to_yaml(obj))
+         else:
+             if hasattr(obj, '__dict__'):
+                 for k, v in vars(obj).items():
+                     if not k.startswith("_"):
+                         log.info(f"{k}: {v}")
+         log.info(f"{'=' * (44 + len(name))}\n")
+
+     if "task" in object_dict and hasattr(object_dict["task"], "task"):
+         model = object_dict["task"].task
+         total = sum(p.numel() for p in model.parameters())
+         trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+         log.info(f"{'=' * 20} MODEL PARAMS {'=' * 20}")
+         log.info(f"model/params/total: {total}")
+         log.info(f"model/params/trainable: {trainable}")
+         log.info("=" * 54 + "\n")
+
+     log.info("========== End of Hyperparameters ==========\n")
+
+
+ def eval_predict_main(cfg: DictConfig):
+     """Entry point for the CLI eval-predict command."""
+     predict(cfg)
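
For reference (not part of this commit): a standalone sketch of the row-matching used in `predict` above, which aligns each evaluated target row back to a filename in the source CSV by near-equality on the property columns. The toy arrays here are hypothetical.

    # Illustrative sketch only -- not part of the commit.
    import numpy as np

    task_matrix = np.array([[0.10, 1.0], [0.25, 2.0], [0.40, 3.0]])
    filenames = np.array(["a.xyz", "b.xyz", "c.xyz"])

    row = np.array([0.2500001, 2.0])  # a target row recovered from the dataloader
    mask = np.all(np.isclose(task_matrix, row, atol=1e-4), axis=1)
    idx = np.flatnonzero(mask)
    assert idx.size == 1              # missing or ambiguous rows raise in the CLI
    print(filenames[idx[0]])          # -> "b.xyz"
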
MolecularDiffusion/cli/generate.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generation command for MolCraft CLI.
2
+
3
+ Adapted from scripts/generate.py for package-level execution.
4
+ """
5
+
6
+ import glob
7
+ import os
8
+ import re
9
+ import time
10
+ import copy
11
+ import pickle
12
+ from typing import Any, Dict, Optional, Tuple
13
+
14
+ import hydra
15
+ import torch
16
+ from omegaconf import DictConfig, OmegaConf
17
+
18
+ from MolecularDiffusion.core import Engine
19
+ from MolecularDiffusion.runmodes.generate.tasks_generate import GenerativeFactory
20
+ from MolecularDiffusion.utils import (
21
+ RankedLogger,
22
+ seed_everything,
23
+ recursive_module_to_device,
24
+ )
25
+
26
+ log = RankedLogger(__name__, rank_zero_only=True)
27
+
28
+
29
+ def is_rank_zero():
30
+ """Check if current process is rank zero."""
31
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
32
+ return torch.distributed.get_rank() == 0
33
+ return True
34
+
35
+
36
+ def load_lightning_model(chkpt_path, task_config, atom_vocab=None, total_step=0):
37
+ """Load model from Lightning checkpoint (.ckpt)."""
38
+ log.info(f"Loading Lightning checkpoint from: {chkpt_path}")
39
+
40
+ try:
41
+ from MolecularDiffusion.core.engine_lightning import EngineLightning
42
+ wrapper = EngineLightning.load_from_checkpoint(chkpt_path, map_location="cpu")
43
+ log.info("Successfully loaded model using EngineLightning.load_from_checkpoint")
44
+
45
+ if atom_vocab and hasattr(wrapper.task, 'atom_vocab') and wrapper.task.atom_vocab is None:
46
+ wrapper.task.atom_vocab = atom_vocab
47
+
48
+ # Apply diffusion_steps override from config
49
+ if total_step > 0:
50
+ if hasattr(wrapper.task, 'model') and hasattr(wrapper.task.model, 'T'):
51
+ log.info(f"Overriding diffusion steps: {wrapper.task.model.T} -> {total_step}")
52
+ wrapper.task.model.T = total_step
53
+ elif hasattr(wrapper.task, 'T'):
54
+ log.info(f"Overriding diffusion steps: {wrapper.task.T} -> {total_step}")
55
+ wrapper.task.T = total_step
56
+
57
+ wrapper.task.eval()
58
+ return wrapper.task
59
+
60
+ except Exception as e:
61
+ log.warning(f"EngineLightning.load_from_checkpoint failed ({type(e).__name__}: {e}). Falling back to manual config reconstruction.")
62
+
63
+ # Fallback: Load checkpoint manually
64
+ checkpoint = torch.load(chkpt_path, map_location="cpu", weights_only=False)
65
+
66
+ hparams = checkpoint.get("hyper_parameters", {})
67
+ if "model_config" in hparams and hparams["model_config"] is not None:
68
+ task_config = OmegaConf.create(hparams["model_config"])
69
+ log.info("Loaded task configuration from checkpoint hyperparameters")
70
+ elif task_config is None:
71
+ raise ValueError("task_config not provided and 'model_config' not found in checkpoint.")
72
+
73
+ task_config = copy.deepcopy(task_config)
74
+ OmegaConf.set_readonly(task_config, False)
75
+ OmegaConf.set_struct(task_config, False)
76
+
77
+ n_types = len(atom_vocab) if atom_vocab else 0
78
+
79
+ if OmegaConf.is_missing(task_config, "num_atom_types") or task_config.get("num_atom_types") == "???":
80
+ task_config.num_atom_types = n_types if n_types > 0 else 100
81
+
82
+ if hasattr(task_config, "transformer_config"):
83
+ if OmegaConf.is_missing(task_config.transformer_config, "atom_dim"):
84
+ task_config.transformer_config.atom_dim = task_config.num_atom_types
85
+
86
+ if hasattr(task_config, "dataset_stats"):
87
+ if OmegaConf.is_missing(task_config.dataset_stats, "max_atoms"):
88
+ task_config.dataset_stats.max_atoms = 150
89
+
90
+ log.info(f"Building task from config: {task_config._target_}")
91
+ task_factory = hydra.utils.instantiate(task_config, atom_vocab=atom_vocab)
92
+ task = task_factory.build()
93
+
94
+ state_dict = checkpoint.get('state_dict', {})
95
+ cleaned_state_dict = {}
96
+ for key, value in state_dict.items():
97
+ if key.startswith('task.'):
98
+ cleaned_state_dict[key[5:]] = value
99
+ else:
100
+ cleaned_state_dict[key] = value
101
+
102
+ task.load_state_dict(cleaned_state_dict, strict=False)
103
+ log.info(f"Loaded {len(cleaned_state_dict)} parameters from checkpoint")
104
+
105
+ if 'data_stats' in checkpoint:
106
+ task.tabasco_model.set_data_stats(checkpoint['data_stats'])
107
+ if 'node_dist_model' in checkpoint:
108
+ task._node_dist_model = checkpoint['node_dist_model']
109
+ if 'prop_dist_model' in checkpoint:
110
+ task.prop_dist_model = checkpoint['prop_dist_model']
111
+
112
+ if total_step > 0:
113
+ if hasattr(task, 'model') and hasattr(task.model, 'T'):
114
+ task.model.T = total_step
115
+ elif hasattr(task, 'T'):
116
+ task.T = total_step
117
+
118
+ task.eval()
119
+ return task
120
+
121
+
122
+ def load_model(chkpt_directory, task_config=None, atom_vocab=None, total_step=0):
123
+ """Load model from checkpoint directory with auto-detection."""
124
+ ckpt_files = glob.glob(os.path.join(chkpt_directory, '*.ckpt'))
125
+
126
+ if ckpt_files:
127
+ best_metric = -1.0
128
+ best_checkpoint = None
129
+
130
+ for ckpt_file in ckpt_files:
131
+ match = re.search(r"(?:metric|val)[_=](\d+\.?\d*)", os.path.basename(ckpt_file))
132
+ if match:
133
+ metric = float(match.group(1))
134
+ if metric > best_metric:
135
+ best_metric = metric
136
+ best_checkpoint = ckpt_file
137
+
138
+ if best_checkpoint is None:
139
+ last_ckpt = os.path.join(chkpt_directory, 'last.ckpt')
140
+ best_checkpoint = last_ckpt if os.path.exists(last_ckpt) else ckpt_files[0]
141
+
142
+ task = load_lightning_model(best_checkpoint, task_config, atom_vocab, total_step)
143
+
144
+ try:
145
+ with open(os.path.join(chkpt_directory, "edm_stat.pkl"), "rb") as file:
146
+ edm_stats = pickle.load(file)
147
+ task.node_dist_model = edm_stats.get("node")
148
+ if "prop" in edm_stats:
149
+ task.prop_dist_model = edm_stats["prop"]
150
+ except (ImportError, FileNotFoundError):
151
+ log.warning("edm_stat.pkl not found")
152
+
153
+ return task
154
+
155
+ # Original engine (.pkl files)
156
+ model_path = os.path.join(chkpt_directory, "edm_chem.pkl")
157
+
158
+ if not os.path.exists(model_path):
159
+ checkpoint_files = glob.glob(os.path.join(chkpt_directory, '*.pkl'))
160
+ checkpoint_files = [f for f in checkpoint_files if 'edm_stat.pkl' not in os.path.basename(f)]
161
+
162
+ if not checkpoint_files:
163
+ raise FileNotFoundError(f"No checkpoints found in {chkpt_directory}")
164
+
165
+ best_metric = -1.0
166
+ best_checkpoint = None
167
+
168
+ for ckpt_file in checkpoint_files:
169
+ match = re.search(r"metric=([\d.]+)\.pkl", os.path.basename(ckpt_file))
170
+ if match:
171
+ metric = float(match.group(1))
172
+ if metric > best_metric:
173
+ best_metric = metric
174
+ best_checkpoint = ckpt_file
175
+
176
+ model_path = best_checkpoint or checkpoint_files[0]
177
+
178
+ log.info(f"Loading original engine checkpoint from: {model_path}")
179
+
180
+ edm_stats = {"node": None, "prop": None}
181
+ stat_path = os.path.join(chkpt_directory, "edm_stat.pkl")
182
+ if os.path.exists(stat_path):
183
+ try:
184
+ with open(stat_path, "rb") as file:
185
+ loaded_stats = pickle.load(file)
186
+ if "node" in loaded_stats:
187
+ edm_stats["node"] = loaded_stats["node"]
188
+ elif "node_dist_model" in loaded_stats:
189
+ edm_stats["node"] = loaded_stats["node_dist_model"]
190
+ if "prop" in loaded_stats:
191
+ edm_stats["prop"] = loaded_stats["prop"]
192
+ elif "prop_dist_model" in loaded_stats:
193
+ edm_stats["prop"] = loaded_stats["prop_dist_model"]
194
+ except Exception as e:
195
+ log.warning(f"Failed to load edm_stat.pkl: {e}")
196
+
197
+ engine = Engine(None, None, None, None, None)
198
+ engine = engine.load_from_checkpoint(model_path, interference_mode=True)
199
+ task = engine.model
200
+
201
+ if edm_stats["node"] is not None:
202
+ task.node_dist_model = edm_stats["node"]
203
+ if edm_stats["prop"] is not None:
204
+ task.prop_dist_model = edm_stats["prop"]
205
+
206
+ if total_step > 0:
207
+ if hasattr(task, 'model') and hasattr(task.model, 'T'):
208
+ task.model.T = total_step
209
+ elif hasattr(task, 'T'):
210
+ task.T = total_step
211
+
212
+ task.eval()
213
+ return task
214
+
215
+
216
+ def generate(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
217
+ """Main generation function."""
218
+ if cfg.get("seed"):
219
+ seed_everything(cfg.seed, workers=True)
220
+
221
+ log.info(f"Instantiating diffusion task and loading the model <{cfg.tasks._target_}>")
222
+ task = load_model(
223
+ cfg.chkpt_directory,
224
+ task_config=cfg.tasks,
225
+ atom_vocab=cfg.atom_vocab,
226
+ total_step=cfg.diffusion_steps,
227
+ )
228
+
229
+ if not hasattr(task, 'atom_vocab') or task.atom_vocab is None:
230
+ task.atom_vocab = cfg.atom_vocab
231
+
232
+ if not hasattr(task, 'device'):
233
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
234
+ recursive_module_to_device(task, device)
235
+
236
+ log.info(f"Instantiating generator... <{cfg.interference._target_}>")
237
+ generator: GenerativeFactory = hydra.utils.instantiate(cfg.interference, task=task)
238
+
239
+ object_dict = {"cfg": cfg, "task": task, "generator": generator}
240
+
241
+ log.info("Logging hyperparameters!")
242
+ log_hyperparameters(object_dict)
243
+
244
+ os.makedirs(cfg.interference.output_path, exist_ok=True)
245
+
246
+ if is_rank_zero():
247
+ config_path = os.path.join(cfg.interference.output_path, "config.yaml")
248
+ with open(config_path, "w") as f:
249
+ OmegaConf.save(config=cfg, f=f)
250
+ log.info(f"Configuration saved to {config_path}")
251
+
252
+ generator.run()
253
+
254
+
255
+ def log_hyperparameters(object_dict: dict):
256
+ """Log hyperparameters for debugging."""
257
+ if not is_rank_zero():
258
+ return
259
+
260
+ log.info("\n========== Logging Hyperparameters ==========\n")
261
+ for name, obj in object_dict.items():
262
+ log.info(f"{'=' * 20} {name.upper()} {'=' * 20}")
263
+ if name == "cfg":
264
+ if isinstance(obj, dict):
265
+ log.info("\n" + OmegaConf.to_yaml(OmegaConf.create(obj)))
266
+ else:
267
+ log.info("\n" + OmegaConf.to_yaml(obj))
268
+ else:
269
+ if hasattr(obj, '__dict__'):
270
+ for k, v in vars(obj).items():
271
+ if not k.startswith("_"):
272
+ log.info(f"{k}: {v}")
273
+ log.info(f"{'=' * (44 + len(name))}\n")
274
+ log.info("========== End of Hyperparameters ==========\n")
275
+
276
+
277
+ def generate_main(cfg: DictConfig):
278
+ """Entry point for CLI generate command."""
279
+ start_time = time.time()
280
+ generate(cfg)
281
+ total_time = time.time() - start_time
282
+ log.warning(f"Total time of execution: {total_time:.2f} seconds")
MolecularDiffusion/cli/main.py ADDED
@@ -0,0 +1,197 @@
+ """MolCraft CLI - Unified command-line interface for MolecularDiffusion.
+
+ Usage:
+     molcraft train config.yaml [overrides...]
+     molcraft generate config.yaml [overrides...]
+     molcraft predict config.yaml [overrides...]
+ """
+
+ import os
+ import logging
+ import platform
+
+ import click
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def log_system_info():
+     """Log basic system information."""
+     import psutil
+
+     logger.info("=" * 60)
+     logger.info(f"OS: {platform.system()} {platform.release()}")
+     logger.info(f"CPU: {platform.processor()}, Cores: {os.cpu_count()}")
+
+     ram = psutil.virtual_memory()
+     logger.info(f"RAM: Total {ram.total / (1024**3):.2f} GB, Available {ram.available / (1024**3):.2f} GB")
+     logger.info(f"Python: {platform.python_version()}")
+
+     try:
+         import torch
+         logger.info(f"PyTorch: {torch.__version__}")
+         if torch.cuda.is_available():
+             logger.info(f"CUDA: {torch.version.cuda}, GPUs: {torch.cuda.device_count()}")
+     except ImportError:
+         pass
+
+     logger.info("=" * 60)
+
+
+ # Enable -h as an alias for --help
+ CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
+
+
+ @click.group(context_settings=CONTEXT_SETTINGS)
+ @click.version_option(package_name="MolecularDiffusion")
+ def cli():
+     """MolCraft - Molecular Diffusion CLI.
+
+     A unified command-line interface for training, generation, and prediction
+     with molecular diffusion models.
+
+     \b
+     Examples:
+         molcraft train configs/my_train_config.yaml
+         molcraft generate configs/my_gen_config.yaml
+         molcraft predict configs/my_pred_config.yaml
+     """
+     pass
+
+
+ @cli.command(context_settings=CONTEXT_SETTINGS)
+ @click.argument("config", type=str)
+ @click.argument("overrides", nargs=-1)
+ def train(config: str, overrides: tuple):
+     """Train a molecular diffusion model.
+
+     \b
+     Arguments:
+         CONFIG     Config file path (e.g., configs/train.yaml)
+         OVERRIDES  Hydra-style config overrides (e.g., trainer.num_epochs=100)
+
+     \b
+     Examples:
+         molcraft train configs/train_tabasco_geom.yaml
+         molcraft train configs/my_config.yaml trainer.num_epochs=50 seed=42
+     """
+     log_system_info()
+     logger.info(f"Starting training with config: {config}")
+
+     from MolecularDiffusion.cli._hydra import run_hydra_app
+     from MolecularDiffusion.cli.train import train_main
+
+     run_hydra_app(
+         config_name=config,
+         task_function=train_main,
+         config_dir=None,
+         overrides=list(overrides),
+     )
+
+
+ @cli.command(context_settings=CONTEXT_SETTINGS)
+ @click.argument("config", type=str)
+ @click.argument("overrides", nargs=-1)
+ def generate(config: str, overrides: tuple):
+     """Generate molecules using a trained model.
+
+     \b
+     Arguments:
+         CONFIG     Config file path (e.g., configs/generate.yaml)
+         OVERRIDES  Hydra-style config overrides
+
+     \b
+     Examples:
+         molcraft generate configs/gen_config.yaml
+         molcraft generate configs/gen_config.yaml interference.n_samples=1000
+     """
+     log_system_info()
+     logger.info(f"Starting generation with config: {config}")
+
+     from MolecularDiffusion.cli._hydra import run_hydra_app
+     from MolecularDiffusion.cli.generate import generate_main
+
+     run_hydra_app(
+         config_name=config,
+         task_function=generate_main,
+         config_dir=None,
+         overrides=list(overrides),
+     )
+
+
+ @cli.command(context_settings=CONTEXT_SETTINGS)
+ @click.argument("config", type=str)
+ @click.argument("overrides", nargs=-1)
+ def predict(config: str, overrides: tuple):
+     """Run property prediction on molecules.
+
+     \b
+     Arguments:
+         CONFIG     Config file path (e.g., configs/predict.yaml)
+         OVERRIDES  Hydra-style config overrides
+
+     \b
+     Examples:
+         molcraft predict configs/predict.yaml
+         molcraft predict configs/my_pred.yaml xyz_directory=/path/to/xyz
+     """
+     log_system_info()
+     logger.info(f"Starting prediction with config: {config}")
+
+     from MolecularDiffusion.cli._hydra import run_hydra_app
+     from MolecularDiffusion.cli.predict import predict_main
+
+     run_hydra_app(
+         config_name=config,
+         task_function=predict_main,
+         config_dir=None,
+         overrides=list(overrides),
+     )
+
+
+ @cli.command("eval-predict", context_settings=CONTEXT_SETTINGS)
+ @click.argument("config", type=str)
+ @click.argument("overrides", nargs=-1)
+ def eval_predict(config: str, overrides: tuple):
+     """Evaluate model predictions on validation/test sets.
+
+     \b
+     Arguments:
+         CONFIG     Config file path (e.g., configs/eval_predict.yaml)
+         OVERRIDES  Hydra-style config overrides
+
+     \b
+     Examples:
+         molcraft eval-predict configs/eval_predict.yaml
+     """
+     log_system_info()
+     logger.info(f"Starting eval-predict with config: {config}")
+
+     from MolecularDiffusion.cli._hydra import run_hydra_app
+     from MolecularDiffusion.cli.eval_predict import eval_predict_main
+
+     run_hydra_app(
+         config_name=config,
+         task_function=eval_predict_main,
+         config_dir=None,
+         overrides=list(overrides),
+     )
+
+
+ # Register analyze subcommand group
+ from MolecularDiffusion.cli.analyze import analyze
+ cli.add_command(analyze)
+
+
+ def main():
+     """Entry point."""
+     cli()
+
+
+ if __name__ == "__main__":
+     main()
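
Since `cli` is a plain click group, it can be exercised in-process with click's standard test runner; a small smoke-test sketch (not part of the diff):

```python
from click.testing import CliRunner
from MolecularDiffusion.cli.main import cli

runner = CliRunner()
result = runner.invoke(cli, ["train", "--help"])  # '-h' also works via CONTEXT_SETTINGS
assert result.exit_code == 0
print(result.output)
```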
MolecularDiffusion/cli/predict.py ADDED
@@ -0,0 +1,395 @@
+ """Prediction command for MolCraft CLI.
+
+ Adapted from scripts/predict.py for package-level execution.
+ """
+
+ import os
+ from glob import glob
+ from typing import Any, Dict, Tuple
+
+ import hydra
+ import numpy as np
+ import pandas as pd
+ import torch
+ from ase.data import atomic_numbers
+ from omegaconf import DictConfig, OmegaConf
+ from torch_geometric.data import Data
+ from torch_geometric.nn import knn_graph, radius_graph
+ from tqdm import tqdm
+
+ from MolecularDiffusion.core import Engine
+ from MolecularDiffusion.data.component.pointcloud import PointCloud_Mol
+ from MolecularDiffusion.data.component.feature import (
+     onehot,
+     atom_topological,
+     atom_geom,
+     atom_geom_compact,
+     atom_geom_opt,
+     atom_geom_v2,
+     atom_geom_v2_trun,
+ )
+ from MolecularDiffusion.utils import RankedLogger, seed_everything
+ from MolecularDiffusion.utils.plot_function import (
+     plot_kde_distribution,
+     plot_histogram_distribution,
+     plot_kde_distribution_multiple,
+ )
+
+ log = RankedLogger(__name__, rank_zero_only=True)
+
+
+ def is_rank_zero():
+     """Check if current process is rank zero."""
+     if torch.distributed.is_available() and torch.distributed.is_initialized():
+         return torch.distributed.get_rank() == 0
+     return True
+
+
+ def load_model(chkpt_path, task_config=None, atom_vocab=None):
+     """Load a pre-trained model from checkpoint with auto-detection."""
+     log.info(f"Loading checkpoint from: {chkpt_path}")
+
+     # Try loading as a Lightning checkpoint first if it has the .ckpt extension
+     if chkpt_path.endswith('.ckpt'):
+         try:
+             from MolecularDiffusion.core.engine_lightning import EngineLightning
+             wrapper = EngineLightning.load_from_checkpoint(chkpt_path, map_location="cpu")
+             log.info("Successfully loaded model using EngineLightning.load_from_checkpoint")
+
+             # Return something with a .model attribute for backward compatibility
+             class SolverWrapper:
+                 def __init__(self, task):
+                     self.model = task
+
+             solver = SolverWrapper(wrapper.task)
+             solver.model.eval()
+             return solver
+         except Exception as e:
+             log.warning(f"EngineLightning.load_from_checkpoint failed ({type(e).__name__}: {e}). Trying manual fallback.")
+
+     # Manual fallback or original engine (.pkl/no extension)
+     checkpoint = torch.load(chkpt_path, map_location="cpu", weights_only=False)
+
+     # Check if it's a Lightning checkpoint dictionary
+     if "hyper_parameters" in checkpoint:
+         log.info("Detected Lightning checkpoint dictionary.")
+         hparams = checkpoint.get("hyper_parameters", {})
+
+         # Try to get model_config from the checkpoint
+         model_config = hparams.get("model_config", task_config)
+         if model_config is None:
+             raise ValueError("Lightning checkpoint lacks 'model_config' and no 'task_config' provided.")
+
+         # Instantiate task
+         if isinstance(model_config, dict):
+             model_config = OmegaConf.create(model_config)
+
+         # Ensure we have atom_vocab if needed
+         if atom_vocab is not None and ('atom_vocab' not in model_config or model_config.atom_vocab is None):
+             OmegaConf.set_struct(model_config, False)
+             model_config.atom_vocab = atom_vocab
+
+         task_factory = hydra.utils.instantiate(model_config)
+         task = task_factory.build()
+
+         # Load weights, stripping the Lightning 'task.' prefix where present
+         state_dict = checkpoint.get("state_dict", {})
+         cleaned_state_dict = {}
+         for k, v in state_dict.items():
+             if k.startswith("task."):
+                 cleaned_state_dict[k[5:]] = v
+             else:
+                 cleaned_state_dict[k] = v
+
+         task.load_state_dict(cleaned_state_dict, strict=False)
+         log.info(f"Loaded {len(cleaned_state_dict)} parameters from state_dict")
+
+         # Try to recover mean/std if they are in the checkpoint root or state_dict but not as buffers
+         for key in ["mean", "std", "weight"]:
+             val = None
+             if key in checkpoint:
+                 val = checkpoint[key]
+             elif f"task.{key}" in state_dict:
+                 val = state_dict[f"task.{key}"]
+             elif key in state_dict:
+                 val = state_dict[key]
+
+             if val is not None:
+                 if not isinstance(val, torch.Tensor):
+                     val = torch.as_tensor(val, dtype=torch.float32)
+
+                 # Register as a buffer so it moves with the model to the correct device
+                 if key in task._buffers:
+                     task._buffers[key].copy_(val)
+                 else:
+                     task.register_buffer(key, val)
+
+         class SolverWrapper:
+             def __init__(self, task):
+                 self.model = task
+
+         solver = SolverWrapper(task)
+         solver.model.eval()
+         # Ensure task.device reflects where the parameters actually live
+         solver.model.device = next(solver.model.parameters()).device if list(solver.model.parameters()) else torch.device('cpu')
+         return solver
+     else:
+         # Original Engine checkpoint
+         engine = Engine(None, None, None, None, None)
+         solver = engine.load_from_checkpoint(chkpt_path, interference_mode=True)
+         solver.model.eval()
+         # Ensure task.device is updated to the actual device the solver is using
+         solver.model.device = solver.device
+         return solver
+
+
+ def xyz2mol(xyz_file, atom_vocab, node_feature, edge_type="fully_connected",
+             radius=4.0, n_neigh=5, device="cpu"):
+     """Convert an XYZ file into a PyTorch Geometric Data object."""
+     mol_obj = {}
+     mol_xyz = PointCloud_Mol.from_xyz(xyz_file, with_hydrogen=True, forbidden_atoms=[])
+     coords = mol_xyz.get_coord()
+     n_nodes = len(mol_xyz.atoms)
+
+     node_features = []
+     for atom in mol_xyz.atoms:
+         node_features.append(onehot(atom.element, atom_vocab, allow_unknown=False))
+
+     charges = [
+         atomic_numbers[atom.element]
+         for atom in mol_xyz.atoms
+         if atom.element in atomic_numbers
+     ]
+
+     if node_feature:
+         if node_feature in [
+             "atom_topological", "atom_geom", "atom_geom_v2",
+             "atom_geom_v2_trun", "atom_geom_opt", "atom_geom_compact"
+         ]:
+             feature_mapping = {
+                 "atom_topological": atom_topological,
+                 "atom_geom": atom_geom,
+                 "atom_geom_v2": atom_geom_v2,
+                 "atom_geom_v2_trun": atom_geom_v2_trun,
+                 "atom_geom_opt": atom_geom_opt,
+                 "atom_geom_compact": atom_geom_compact,
+             }
+             feature_function = feature_mapping.get(node_feature)
+             if feature_function is not None:
+                 node_features_extra = feature_function(charges, coords)
+                 node_features = torch.cat(
+                     [torch.tensor(node_features), node_features_extra], dim=1
+                 )
+         else:
+             raise ValueError("Unknown node feature type")
+
+     # as_tensor is a no-op for tensors, so this covers both branches above
+     node_features = torch.as_tensor(node_features, dtype=torch.float32)
+     charges = torch.as_tensor(charges, dtype=torch.long)
+     node_mask = torch.ones(n_nodes, dtype=torch.int8)
+
+     edge_mask = node_mask.unsqueeze(0) * node_mask.unsqueeze(1)
+     diag_mask = ~torch.eye(n_nodes, dtype=torch.bool)
+     edge_mask *= diag_mask
+     edge_mask = edge_mask.view(1 * n_nodes * n_nodes, 1)
+     h = node_features.view(1 * n_nodes, -1).clone()
+
+     if edge_type == "distance":
+         edge_index = radius_graph(coords, r=radius)
+     elif edge_type == "neighbor":
+         edge_index = knn_graph(coords, k=n_neigh)
+     elif edge_type == "fully_connected":
+         num_nodes = coords.size(0)
+         row = torch.arange(num_nodes).repeat_interleave(num_nodes)
+         col = torch.arange(num_nodes).repeat(num_nodes)
+         edge_index = torch.stack([row, col], dim=0)
+         edge_index = edge_index[:, row != col]
+     else:
+         raise ValueError(f"Unknown edge type {edge_type}")
+
+     graph_data = Data(
+         x=h,
+         pos=coords,
+         atomic_numbers=charges,
+         natoms=torch.tensor([n_nodes]),
+         edge_index=edge_index,
+         times=torch.tensor([0]),
+         batch=torch.zeros(n_nodes, dtype=torch.long),
+     ).to(device)
+
+     mol_obj["graph"] = graph_data
+     return mol_obj
+
+
+ def count_atoms_from_xyz(path: str) -> int:
+     """Fast atom counter for XYZ files (the first line holds the atom count)."""
+     try:
+         with open(path, "r") as f:
+             first = f.readline().strip()
+         return int(first)
+     except Exception:
+         return 0
+
+
+ def _runner(solver, xyz_paths: list, max_atoms: int = 100) -> Tuple[np.ndarray, list]:
+     """Run predictions on a list of XYZ files; returns (predictions, kept paths)."""
+     device = getattr(
+         solver.model, 'device',
+         next(solver.model.parameters()).device if list(solver.model.parameters()) else torch.device('cpu'),
+     )
+     task_names = list(solver.model.task.keys())
+     num_molecules = len(xyz_paths)
+
+     progress_bar = tqdm(
+         enumerate(xyz_paths),
+         desc="Predicting molecules",
+         leave=True,
+         dynamic_ncols=True,
+         total=num_molecules,
+     )
+
+     predictions = []
+     xyz_paths_clear = []
+     skipped = 0
+
+     for i, xyz_path in progress_bar:
+         n_atoms = count_atoms_from_xyz(xyz_path)
+         if n_atoms > max_atoms:
+             skipped += 1
+             progress_bar.set_postfix({"batch": i + 1, "skipped": skipped})
+             log.info(f"Skipping {xyz_path} (atoms={n_atoms} > max_atoms={max_atoms})")
+             continue
+
+         mol_obj = xyz2mol(
+             xyz_file=xyz_path,
+             atom_vocab=solver.model.atom_vocab,
+             node_feature=solver.model.node_feature,
+             device=device,
+         )
+         prediction = solver.model.predict(mol_obj, evaluate=True)[0]
+         predictions.append(prediction.detach().cpu().numpy())
+         current_preds_dict = {prop_name: prediction[j].item() for j, prop_name in enumerate(task_names)}
+         progress_bar.set_postfix({"batch": i + 1, "skipped": skipped, **current_preds_dict})
+         xyz_paths_clear.append(xyz_path)
+
+     predictions = np.array(predictions)
+     if predictions.ndim > 1 and predictions.shape[-1] == 1:
+         predictions = predictions.squeeze(-1)
+
+     return predictions, xyz_paths_clear
+
+
+ def runner(cfg: DictConfig) -> None:
+     """Property prediction run."""
+     if cfg.get("seed"):
+         seed_everything(cfg.seed, workers=True)
+
+     log.info(f"Instantiating diffusion task and loading the model <{cfg.tasks._target_}>")
+     solver = load_model(cfg.chkpt_directory, task_config=cfg.tasks, atom_vocab=cfg.atom_vocab)
+
+     task_names = list(solver.model.task.keys())
+
+     if not hasattr(solver.model, 'std') or solver.model.std is None:
+         chkpt = torch.load(cfg.chkpt_directory, weights_only=False)
+         if "model" in chkpt:
+             solver.model.std = chkpt["model"].get("std", torch.ones(1)).to(solver.model.device)
+             solver.model.weight = chkpt["model"].get("weight", torch.ones(1)).to(solver.model.device)
+             solver.model.mean = chkpt["model"].get("mean", torch.zeros(1)).to(solver.model.device)
+         elif "state_dict" in chkpt:
+             # Fallback for Lightning if not already loaded by load_model
+             sd = chkpt["state_dict"]
+             solver.model.std = sd.get("task.std", sd.get("std", torch.ones(1))).to(solver.model.device)
+             solver.model.weight = sd.get("task.weight", sd.get("weight", torch.ones(1))).to(solver.model.device)
+             solver.model.mean = sd.get("task.mean", sd.get("mean", torch.zeros(1))).to(solver.model.device)
+
+     if not hasattr(solver.model, 'atom_vocab'):
+         solver.model.atom_vocab = cfg.atom_vocab
+     if not hasattr(solver.model, 'node_feature'):
+         solver.model.node_feature = cfg.node_feature
+
+     object_dict = {"cfg": cfg, "solver": solver}
+
+     log.info("Logging hyperparameters!")
+     log_hyperparameters(object_dict)
+
+     os.makedirs(cfg.output_directory, exist_ok=True)
+
+     if is_rank_zero():
+         config_path = os.path.join(cfg.output_directory, "config.yaml")
+         with open(config_path, "w") as f:
+             OmegaConf.save(config=cfg, f=f)
+         log.info(f"Configuration saved to {config_path}")
+
+     log.info("Running the predictions...")
+     xyz_paths = glob(f"{cfg.xyz_directory}/*.xyz")
+     xyz_paths = [str(xyz_path) for xyz_path in xyz_paths]
+     predictions, xyz_paths_clear = _runner(solver, xyz_paths, max_atoms=cfg.get("max_atoms", 100))
+
+     if predictions.ndim == 1:
+         predictions = predictions[:, None]  # single-task case: keep shape (N, 1) for the loop below
+
+     df_dicts = {}
+     for task_name, prediction in zip(task_names, predictions.T):
+         df_dicts[task_name] = prediction
+     df_dicts["xyz_path"] = xyz_paths_clear
+
+     df = pd.DataFrame(df_dicts)
+     df = df.sort_values(by="xyz_path")
+     df.to_csv(f"{cfg.output_directory}/predictions.csv", index=False)
+
+     log.info("Prediction statistics:")
+     for task_name in task_names:
+         log.info(f"--- {task_name} ---")
+         log.info(f"Mean: {df[task_name].mean():.4f}")
+         log.info(f"Std: {df[task_name].std():.4f}")
+         log.info(f"Min: {df[task_name].min():.4f}")
+         log.info(f"Max: {df[task_name].max():.4f}")
+
+     log.info("Plotting distributions...")
+     props = []
+     for prop in task_names:
+         plot_kde_distribution(df[prop], prop, f"{cfg.output_directory}/{prop}_kde.png")
+         plot_histogram_distribution(df[prop], prop, f"{cfg.output_directory}/{prop}_hist.png")
+         props.append(df[prop].values)
+
+     props = np.array(props).T
+     plot_kde_distribution_multiple(props, task_names, f"{cfg.output_directory}/kde_all.png")
+
+
+ def log_hyperparameters(object_dict: dict):
+     """Log hyperparameters for debugging."""
+     if not is_rank_zero():
+         return
+
+     log.info("\n========== Logging Hyperparameters ==========\n")
+     for name, obj in object_dict.items():
+         log.info(f"{'=' * 20} {name.upper()} {'=' * 20}")
+         if name == "cfg":
+             if isinstance(obj, dict):
+                 log.info("\n" + OmegaConf.to_yaml(OmegaConf.create(obj)))
+             else:
+                 log.info("\n" + OmegaConf.to_yaml(obj))
+         else:
+             if hasattr(obj, '__dict__'):
+                 for k, v in vars(obj).items():
+                     if not k.startswith("_"):
+                         log.info(f"{k}: {v}")
+         log.info(f"{'=' * (44 + len(name))}\n")
+
+     if "task" in object_dict and hasattr(object_dict["task"], "task"):
+         model = object_dict["task"].task
+         total = sum(p.numel() for p in model.parameters())
+         trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+         log.info(f"{'=' * 20} MODEL PARAMS {'=' * 20}")
+         log.info(f"model/params/total: {total}")
+         log.info(f"model/params/trainable: {trainable}")
+         log.info("=" * 54 + "\n")
+
+     log.info("========== End of Hyperparameters ==========\n")
+
+
+ def predict_main(cfg: DictConfig):
+     """Entry point for CLI predict command."""
+     runner(cfg)
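
For reference, the `fully_connected` branch of `xyz2mol()` builds every directed pair (i, j) with i != j; a self-contained illustration of that construction:

```python
import torch

num_nodes = 3
row = torch.arange(num_nodes).repeat_interleave(num_nodes)  # 0,0,0,1,1,1,2,2,2
col = torch.arange(num_nodes).repeat(num_nodes)             # 0,1,2,0,1,2,0,1,2
edge_index = torch.stack([row, col], dim=0)                 # shape (2, 9)
edge_index = edge_index[:, row != col]                      # drop self-loops -> (2, 6)
print(edge_index)
```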
MolecularDiffusion/cli/train.py ADDED
@@ -0,0 +1,453 @@
+ """Training command for MolCraft CLI.
+
+ Adapted from scripts/train.py for package-level execution.
+ """
+
+ from typing import Any, Dict, Optional, Tuple
+ import os
+ import pickle
+ import logging
+
+ import hydra
+ import torch
+ from omegaconf import DictConfig, OmegaConf
+
+ from MolecularDiffusion.core import Engine
+ from MolecularDiffusion.runmodes.train import (
+     evaluate,
+     DataModule,
+     Logger,
+     OptimSchedulerFactory,
+     get_versioned_output_path,
+ )
+ from MolecularDiffusion.utils import (
+     RankedLogger,
+     task_wrapper,
+     seed_everything,
+ )
+
+ log = RankedLogger(__name__, rank_zero_only=True)
+
+
+ def is_rank_zero():
+     """Check if current process is rank zero."""
+     if torch.distributed.is_available() and torch.distributed.is_initialized():
+         return torch.distributed.get_rank() == 0
+     return True
+
+
+ def load_weights(task, ckpt_path):
+     """Load model weights from a checkpoint file (weights only).
+
+     This loads the state_dict from the checkpoint into the task model,
+     ignoring optimizer/scheduler states and other metadata.
+     Useful for fine-tuning or starting from a pre-trained model.
+     """
+     if not os.path.exists(ckpt_path):
+         raise FileNotFoundError(f"Checkpoint not found at: {ckpt_path}")
+
+     log.info(f"Loading weights from: {ckpt_path}")
+
+     # Load checkpoint
+     checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+     state_dict = checkpoint.get("state_dict", checkpoint)
+
+     # Prepare state dict for loading
+     cleaned_state_dict = {}
+     for key, value in state_dict.items():
+         # Strip 'task.' prefix if present (common in Lightning checkpoints)
+         if key.startswith("task."):
+             cleaned_state_dict[key[5:]] = value
+         else:
+             cleaned_state_dict[key] = value
+
+     # Load into task
+     missing, unexpected = task.load_state_dict(cleaned_state_dict, strict=False)
+
+     if len(missing) > 0:
+         log.warning(f"Missing keys when loading weights: {missing[:5]}{'...' if len(missing) > 5 else ''}")
+     if len(unexpected) > 0:
+         log.warning(f"Unexpected keys in checkpoint: {unexpected[:5]}{'...' if len(unexpected) > 5 else ''}")
+
+     log.info(f"Successfully loaded {len(cleaned_state_dict)} parameters into task.")
+
+
+ # Lightning imports (optional)
+ try:
+     import pytorch_lightning as pl
+     from pytorch_lightning.callbacks import ModelCheckpoint
+     from pytorch_lightning.callbacks import LearningRateMonitor
+     from MolecularDiffusion.core.engine_lightning import EngineLightning
+     from MolecularDiffusion.data.lightning_data_module import MolecularDiffusionDataModule
+     from MolecularDiffusion.core.lightning_callbacks import GenerativeEvalCallback
+     LIGHTNING_AVAILABLE = True
+ except ImportError as e:
+     LIGHTNING_AVAILABLE = False
+     log.warning(f"PyTorch Lightning not found: {e}. Only original Engine available.")
+
+
+ def engine_wrapper(task_module, data_module, trainer_module, logger_module,
+                    resume_from_checkpoint=None, **kwargs):
+     """Training loop using the original Engine."""
+     trainer_module.get_optimizer()
+     trainer_module.get_scheduler()
+
+     solver = Engine(
+         task_module.task,
+         data_module.train_set,
+         data_module.valid_set,
+         data_module.test_set,
+         batch_size=data_module.batch_size,
+         collate_fn=data_module.collate_fn,
+         optimizer=trainer_module.optimizer,
+         ema_decay=trainer_module.ema_decay,
+         scheduler=trainer_module.scheduler,
+         clipping_gradient=trainer_module.gradient_clip_mode,
+         clip_value=trainer_module.gradnorm_queue,
+         logger=logger_module.logger,
+         log_interval=logger_module.log_interval,
+         name_wandb=logger_module.name_wandb,
+         project_wandb=logger_module.project_wandb,
+         dir_wandb=trainer_module.output_path,
+     )
+
+     # Resume from checkpoint if provided
+     start_epoch = 0
+     if resume_from_checkpoint:
+         start_epoch = solver.resume(resume_from_checkpoint, strict=False)
+         log.info(f"Resumed from epoch {start_epoch}")
+
+     use_amp = trainer_module.precision in ["bf16", 16]
+
+     best_checkpoints = []
+     if hasattr(task_module.task, "sample") and kwargs.get("generative_analysis"):
+         best_metrics = -torch.inf
+         models_to_save = {"node": task_module.task.node_dist_model}
+         if len(task_module.condition_names) > 0:
+             models_to_save["prop"] = task_module.task.prop_dist_model
+         if is_rank_zero():
+             with open(os.path.join(trainer_module.output_path, "edm_stat.pkl"), "wb") as f:
+                 pickle.dump(models_to_save, f)
+     else:
+         best_metrics = torch.inf
+
+     # Create versioned checkpoint folder (like Lightning's version_X folders)
+     versioned_ckpt_path = get_versioned_output_path(trainer_module.output_path)
+
+     # Loop continues from start_epoch when resuming
+     for i in range(start_epoch, trainer_module.num_epochs):
+         solver.train(num_epoch=1, use_amp=use_amp, precision=trainer_module.precision)
+         if i % trainer_module.validation_interval == 0 or i == trainer_module.num_epochs - 1:
+             if hasattr(task_module.task, "sample"):
+                 output_generated_dir = os.path.join(versioned_ckpt_path, "generated_molecules")
+                 os.makedirs(output_generated_dir, exist_ok=True)
+                 best_metrics, best_checkpoints = evaluate(
+                     task_module.task_type, solver, i, best_metrics, best_checkpoints,
+                     logger_module.logger, output_generated_dir=output_generated_dir,
+                     generative_analysis=kwargs.get("generative_analysis", False),
+                     n_samples=kwargs.get("n_samples", 100),
+                     metric=kwargs.get("metric", "Validity Relax and connected"),
+                     output_path=versioned_ckpt_path,
+                     use_amp=use_amp, precision=trainer_module.precision,
+                     use_posebuster=kwargs.get("use_posebuster", False),
+                     batch_size=kwargs.get("batch_size", 1),
+                     save_top_k=getattr(trainer_module, "save_top_k", 3),
+                     save_every_val_epoch=getattr(trainer_module, "save_every_val_epoch", False),
+                 )
+             else:
+                 best_metrics, best_checkpoints = evaluate(
+                     task_module.task_type, solver, i, best_metrics, best_checkpoints,
+                     logger_module.logger, output_path=versioned_ckpt_path,
+                     save_top_k=getattr(trainer_module, "save_top_k", 3),
+                     save_every_val_epoch=getattr(trainer_module, "save_every_val_epoch", False),
+                 )
+     return best_metrics, solver
+
+
+ def lightning_wrapper(task_module, data_module, trainer_module, logger_module, engine_cfg,
+                       ckpt_path=None, monitor_metric=None, monitor_mode=None, model_config=None, **kwargs):
+     """Training using the PyTorch Lightning Trainer."""
+     if not LIGHTNING_AVAILABLE:
+         raise ImportError("PyTorch Lightning required. Install with: pip install pytorch-lightning")
+
+     if hasattr(task_module.task, "preprocess"):
+         log.info("Calling task.preprocess() for Lightning engine")
+         result = task_module.task.preprocess(data_module.train_set)
+         if result is not None:
+             data_module.train_set, data_module.valid_set, data_module.test_set = result
+
+     pl_data_module = MolecularDiffusionDataModule(
+         data_module=data_module,
+         batch_size=data_module.batch_size,
+         num_workers=getattr(trainer_module, "num_worker", 0),
+     )
+
+     pl_module = EngineLightning(
+         task=task_module.task,
+         optimizer_config={
+             "optimizer_choice": trainer_module.optimizer_choice,
+             "lr": trainer_module.lr,
+             "weight_decay": trainer_module.weight_decay,
+             "betas": trainer_module.betas,
+             "eps": trainer_module.eps,
+         },
+         scheduler_config={
+             "scheduler": trainer_module.scheduler_choice,
+             "scheduler_kwargs": trainer_module.scheduler_choice_kwargs,
+         },
+         model_config=model_config,
+         monitor_metric=monitor_metric,
+         ema_decay=trainer_module.ema_decay,
+         gradnorm_queue=trainer_module.gradnorm_queue,
+         gradient_clip_algorithm=getattr(trainer_module, 'gradient_clip_algorithm', 'adaptive'),
+     )
+
+     callbacks = []
+
+     if hasattr(task_module.task, "sample") and kwargs.get("generative_analysis"):
+         callbacks.append(GenerativeEvalCallback(
+             n_samples=kwargs.get("n_samples", 100),
+             batch_size=kwargs.get("batch_size", 100),
+             metric=kwargs.get("metric", "Validity Relax and connected"),
+             output_dir=os.path.join(trainer_module.output_path, "generated_molecules"),
+             use_posebuster=kwargs.get("use_posebuster", False),
+             monitor_metric=monitor_metric,
+         ))
+
+     # Checkpoint callback; handle OmegaConf ListConfig properly
+     if monitor_metric is not None:
+         # Convert OmegaConf types to Python types
+         if OmegaConf.is_list(monitor_metric):
+             monitor_metric_key = str(monitor_metric[0])
+         elif isinstance(monitor_metric, (list, tuple)):
+             monitor_metric_key = str(monitor_metric[0])
+         else:
+             monitor_metric_key = str(monitor_metric)
+         mode = monitor_mode or ("min" if "loss" in monitor_metric_key else "max")
+     elif hasattr(task_module.task, "sample"):
+         monitor_metric_key = f"gen/{kwargs.get('metric', 'Validity Relax and connected')}"
+         mode = "max"
+     else:
+         monitor_metric_key = "val/loss"
+         mode = "min"
+
+     # Handle save_every_val_epoch
+     save_top_k = trainer_module.save_top_k
+     if getattr(trainer_module, "save_every_val_epoch", False) or kwargs.get("save_every_val_epoch", False):
+         log.info("save_every_val_epoch=True: Overriding save_top_k to -1 (save all checkpoints)")
+         save_top_k = -1
+
+     callbacks.append(ModelCheckpoint(
+         monitor=monitor_metric_key,
+         mode=mode,
+         save_top_k=save_top_k,
+         filename=f"epoch={{epoch}}-{monitor_metric_key.replace('/', '_').replace(' ', '_')}={{{monitor_metric_key}:.3f}}",
+         save_last=True,
+     ))
+
+     # Learning rate monitor for wandb logging
+     callbacks.append(LearningRateMonitor(logging_interval='step'))
+
+     trainer_config = OmegaConf.to_container(engine_cfg.trainer_config, resolve=True)
+     precision_map = {32: 32, 16: "16-mixed", "16": "16-mixed", "bf16": "bf16-mixed"}
+     trainer_config["precision"] = precision_map.get(trainer_config.get("precision", 32), 32)
+
+     if logger_module.logger == "wandb":
+         pl_logger = pl.loggers.WandbLogger(
+             project=logger_module.project_wandb,
+             name=logger_module.name_wandb,
+             save_dir=trainer_module.output_path,
+         )
+     else:
+         pl_logger = True
+
+     trainer = hydra.utils.instantiate(trainer_config, callbacks=callbacks, logger=pl_logger)
+
+     # ckpt_path=None is a no-op, so a single fit() call covers both cases
+     trainer.fit(pl_module, datamodule=pl_data_module, ckpt_path=ckpt_path)
+
+     return trainer.callback_metrics, trainer
+
+
+ @task_wrapper
+ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     """Main training function."""
+     output_path = cfg.trainer.output_path
+     os.makedirs(output_path, exist_ok=True)
+
+     if is_rank_zero():
+         config_path = os.path.join(output_path, "config.yaml")
+         with open(config_path, "w") as f:
+             OmegaConf.save(config=cfg, f=f)
+         log.info(f"Configuration saved to {config_path}")
+
+     if cfg.get("seed"):
+         seed_everything(cfg.seed, workers=True)
+
+     log.info(f"Instantiating datamodule <{cfg.data._target_}>")
+     data_module: DataModule = hydra.utils.instantiate(cfg.data, task_type=cfg.tasks.task_type)
+     data_module.load()
+
+     log.info(f"Instantiating task <{cfg.tasks._target_}>")
+     data_point_chk = data_module.train_set[0]
+     node_feature_0 = getattr(data_point_chk, "node_feature", None)
+     if node_feature_0 is not None:
+         n_dim = node_feature_0.shape[1]
+     else:
+         try:
+             node_feature_0 = getattr(data_point_chk, "x", None)
+             n_dim = node_feature_0.shape[1]
+         except (AttributeError, IndexError):
+             n_dim = 0
+
+     factory_cfg = cfg.tasks
+     overrides = {}
+
+     if "tasks_egt" in factory_cfg._target_ or "tasks_esen" in factory_cfg._target_ or "diffusion_tabasco" in factory_cfg._target_:
+         overrides["train_set"] = data_module.train_set
+         if "condition_names" in factory_cfg:
+             overrides["task_names"] = factory_cfg.condition_names
+
+     if "atom_vocab" in cfg.data:
+         overrides["atom_vocab"] = list(cfg.data.atom_vocab)
+         if cfg.data.get("allow_unknown", False):
+             overrides["atom_vocab"].append("Suisei")
+
+     if cfg.tasks.get("metrics", None) == "valid_posebuster":
+         overrides["use_posebuster"] = True
+         try:
+             import posebusters
+         except ImportError:
+             log.warning("PoseBusters not installed. Falling back to 'Validity Relax and connected'.")
+             overrides["use_posebuster"] = False
+             overrides["metrics"] = ["Validity Relax and connected"]
+
+     task_module = hydra.utils.instantiate(factory_cfg, **overrides)
+     task_module.build()
+
+     # Optional: load weights from a checkpoint (without resuming full state)
+     if cfg.trainer.get("load_weights_from"):
+         load_weights(task_module.task, cfg.trainer.load_weights_from)
+
+     log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
+     trainer_module: OptimSchedulerFactory = hydra.utils.instantiate(
+         cfg.trainer, parameters=task_module.task.parameters()
+     )
+
+     name_wandb = trainer_module.output_path.split('/')[-1] if "/" in trainer_module.output_path else trainer_module.output_path
+     log.info(f"Instantiating loggers... <{cfg.logger._target_}>")
+     logger_module: Logger = hydra.utils.instantiate(cfg.logger, name_wandb=name_wandb)
+
+     object_dict = {
+         "cfg": cfg,
+         "datamodule": data_module,
+         "task": task_module,
+         "trainer": trainer_module,
+         "logger": logger_module,
+     }
+
+     log.info("Logging hyperparameters!")
+     log_hyperparameters(object_dict)
+
+     engine_type = cfg.get("engine", {}).get("engine_type", "original")
+     log.info(f"Using engine: {engine_type}")
+
+     if engine_type == "lightning":
+         gen_analysis = cfg.get("generative_analysis", cfg.tasks.get("generative_analysis", False))
+         n_samples = cfg.get("n_samples", cfg.tasks.get("n_samples", 100))
+         metric = cfg.get("metrics", cfg.get("metric", cfg.tasks.get("metrics", "Validity Relax and connected")))
+         use_posebuster = cfg.get("use_posebuster", cfg.tasks.get("use_posebuster", False))
+         gen_batch_size = cfg.get("batch_size", cfg.tasks.get("batch_size", 100))
+
+         # Always save model_config for checkpoint reconstruction (VAE, LDM, etc.)
+         model_config = OmegaConf.to_container(factory_cfg, resolve=True)
+         for k, v in overrides.items():
+             if k != "train_set":
+                 model_config[k] = v
+
+         if hasattr(task_module.task, "sample"):
+             metrics = lightning_wrapper(
+                 task_module, data_module, trainer_module, logger_module,
+                 engine_cfg=cfg.engine,
+                 generative_analysis=gen_analysis, n_samples=n_samples,
+                 metric=metric, use_posebuster=use_posebuster, batch_size=gen_batch_size,
+                 ckpt_path=cfg.trainer.get("resume_from_checkpoint", None),
+                 monitor_metric=cfg.trainer.get("monitor_metric", None),
+                 monitor_mode=cfg.trainer.get("monitor_mode", None),
+                 model_config=model_config,
+             )
+         else:
+             metrics = lightning_wrapper(
+                 task_module, data_module, trainer_module, logger_module,
+                 engine_cfg=cfg.engine,
+                 ckpt_path=cfg.trainer.get("resume_from_checkpoint", None),
+                 monitor_metric=cfg.trainer.get("monitor_metric", None),
+                 monitor_mode=cfg.trainer.get("monitor_mode", None),
+                 model_config=model_config,
+             )
+
+     elif engine_type == "original":
+         resume_ckpt = cfg.trainer.get("resume_from_checkpoint", None)
+         if hasattr(task_module.task, "sample"):
+             metrics = engine_wrapper(
+                 task_module, data_module, trainer_module, logger_module,
+                 resume_from_checkpoint=resume_ckpt,
+                 generative_analysis=cfg.tasks.generative_analysis,
+                 n_samples=cfg.tasks.n_samples,
+                 metric=cfg.tasks.metrics,
+                 use_posebuster=cfg.tasks.use_posebuster,
+                 batch_size=cfg.tasks.batch_size,
+             )
+         else:
+             metrics = engine_wrapper(
+                 task_module, data_module, trainer_module, logger_module,
+                 resume_from_checkpoint=resume_ckpt,
+             )
+     else:
+         raise ValueError(f"Unknown engine_type: {engine_type}")
+
+     return metrics, object_dict
+
+
+ def log_hyperparameters(object_dict: dict):
+     """Log hyperparameters for debugging."""
+     if not is_rank_zero():
+         return
+
+     log.info("\n========== Logging Hyperparameters ==========\n")
+     for name, obj in object_dict.items():
+         log.info(f"{'=' * 20} {name.upper()} {'=' * 20}")
+         if name == "cfg":
+             if isinstance(obj, dict):
+                 log.info("\n" + OmegaConf.to_yaml(OmegaConf.create(obj)))
+             else:
+                 log.info("\n" + OmegaConf.to_yaml(obj))
+         else:
+             if hasattr(obj, '__dict__'):
+                 for k, v in vars(obj).items():
+                     if not k.startswith("_"):
+                         log.info(f"{k}: {v}")
+         log.info(f"{'=' * (44 + len(name))}\n")
+
+     if "task" in object_dict and hasattr(object_dict["task"], "task"):
+         model = object_dict["task"].task
+         total = sum(p.numel() for p in model.parameters())
+         trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+         log.info(f"{'=' * 20} MODEL PARAMS {'=' * 20}")
+         log.info(f"model/params/total: {total}")
+         log.info(f"model/params/trainable: {trainable}")
+         log.info("=" * 54 + "\n")
+
+     log.info("========== End of Hyperparameters ==========\n")
+
+
+ def train_main(cfg: DictConfig):
+     """Entry point for CLI train command."""
+     metric, _ = train(cfg)
+     return metric
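
The `task.` prefix stripping in `load_weights()` is easy to verify in isolation; Lightning stores parameters under the wrapper module's attribute name:

```python
# Toy state dict mimicking an EngineLightning checkpoint.
state_dict = {"task.encoder.weight": 1, "task.encoder.bias": 2, "epoch": 3}
cleaned = {k[5:] if k.startswith("task.") else k: v for k, v in state_dict.items()}
assert cleaned == {"encoder.weight": 1, "encoder.bias": 2, "epoch": 3}
```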
MolecularDiffusion/configs/data/filter_molecules_by_property.py ADDED
File without changes
MolecularDiffusion/configs/data/formed_data.yaml ADDED
@@ -0,0 +1,20 @@
+ _target_: MolecularDiffusion.runmodes.train.DataModule
+ root: /home/pregabalin/RF/blue_edm/data/formed
+ filename: /home/pregabalin/RF/blue_edm/data/formed/Data_FORMED_scored.csv # 4k or ready
+ atom_vocab: [H,B,C,N,O,F,Al,Si,P,S,Cl,As,Se,Br,I,Hg,Bi]
+ dataset_name: formed
+ with_hydrogen: True
+ node_feature: null # atom_topological, atom_geom, atom_geom_compact, atom_geom_opt
+ max_atom: 120
+ xyz_dir: /home/pregabalin/RF/blue_edm/data/formed/XYZ_FORMED/
+ coord_file: null
+ natoms_file: null
+ forbidden_atom: []
+ data_efficient_collator: True
+ train_ratio: 0.8
+ load_pkl: null
+ save_pkl: data/test.pkl
+ data_type: pyg # pyg or pointcloud
+ batch_size: 32
+ num_workers: 0
+ allow_unknown: False # additional atom type for the unknown in OHE
MolecularDiffusion/configs/data/mol_dataset.yaml ADDED
@@ -0,0 +1,25 @@
+ _target_: MolecularDiffusion.runmodes.train.DataModule
+ root: data/
+ filename: path_to_csv.csv # 4k or ready
+ atom_vocab: [H,B,C,N,O,F,Al,Si,P,S,Cl,As,Se,Br,I,Hg,Bi] # Ge,Sn,Te,Sb
+ dataset_name: qm9
+ with_hydrogen: True
+ use_ohe_feature: True
+ allow_unknown: False # True to add +1 "unknown" column to OHE for rare/unseen atoms
+ node_feature_choice: null # atom_topological, atom_geom, atom_geom_compact, atom_geom_opt
+ max_atom: 29
+ xyz_dir: path_to_xyz
+ coord_file: null
+ natoms_file: null
+ forbidden_atom: []
+ data_efficient_collator: True
+ train_ratio: 0.8
+ load_pkl: null
+ save_pkl: data/test.pkl # TODO: this is not really used anymore
+ data_type: pointcloud # pyg or pointcloud
+ batch_size: 48
+ num_workers: 0
+ edge_type: fully_connected
+ radius: 4.0
+ n_neigh: 5
+ # consider_global_attributes: False # deprecated
MolecularDiffusion/configs/data/mol_dataset_extraf.yaml ADDED
@@ -0,0 +1,23 @@
+ _target_: MolecularDiffusion.runmodes.train.DataModule
+ root: /home/pregabalin/RF/blue_edm/data/qm9
+ filename: /home/pregabalin/RF/blue_edm/data/qm9/dsgdb9nsd_4k.csv # 4k or ready
+ atom_vocab: [H,B,C,N,O,F,Al,Si,P,S,Cl,As,Se,Br,I,Hg,Bi]
+ dataset_name: qm9
+ with_hydrogen: True
+ node_feature: atom_geom_compact # atom_topological, atom_geom, atom_geom_compact, atom_geom_opt
+ max_atom: 29
+ xyz_dir: /home/pregabalin/RF/blue_edm/data/qm9/dsgdb9nsd/
+ coord_file: null
+ natoms_file: null
+ forbidden_atom: []
+ data_efficient_collator: True
+ train_ratio: 0.8
+ load_pkl: null
+ save_pkl: data/test.pkl
+ data_type: pointcloud # pyg or pointcloud
+ batch_size: 48
+ num_workers: 0
+ allow_unknown: False # additional atom type for the unknown in OHE
+ edge_type: fully_connected
+ radius: 4.0
+ n_neigh: 5
MolecularDiffusion/configs/engine/lightning.yaml ADDED
@@ -0,0 +1,33 @@
+ # Use PyTorch Lightning Trainer
+ engine_type: lightning
+
+ # Lightning-specific trainer configuration
+ trainer_config:
+   _target_: pytorch_lightning.Trainer
+
+   # Training
+   max_epochs: ${trainer.num_epochs}
+   accelerator: auto
+   devices: auto
+   strategy: auto # Lightning auto-selects ddp/ddp_spawn based on devices
+
+   # Precision - converted to the Lightning format in Python
+   precision: ${trainer.precision}
+
+   # Optimization
+   accumulate_grad_batches: 1
+   gradient_clip_val: ${trainer.grad_clip_value}
+   gradient_clip_algorithm: ${trainer.gradient_clip_mode}
+
+   # Logging & Validation
+   log_every_n_steps: ${logger.log_interval}
+   check_val_every_n_epoch: ${trainer.validation_interval}
+
+   # Checkpointing
+   enable_checkpointing: true
+   default_root_dir: ${trainer.output_path}
+
+   # Other
+   num_sanity_val_steps: 0 # Skip sanity validation
+   enable_progress_bar: true
+   enable_model_summary: true
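
The `${trainer.*}` references above are OmegaConf interpolations, resolved once Hydra merges the config groups; a small sketch with illustrative values:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "trainer": {"num_epochs": 100, "precision": "bf16"},
    "engine": {"trainer_config": {"max_epochs": "${trainer.num_epochs}",
                                  "precision": "${trainer.precision}"}},
})
print(cfg.engine.trainer_config.max_epochs)  # 100
print(cfg.engine.trainer_config.precision)   # bf16 (train.py maps this to "bf16-mixed")
```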
MolecularDiffusion/configs/engine/original.yaml ADDED
@@ -0,0 +1,4 @@
+ # Use the original custom Engine class
+ engine_type: original
+
+ # Engine is instantiated inline in train.py using engine_wrapper()
MolecularDiffusion/configs/hydra/default.yaml ADDED
@@ -0,0 +1,19 @@
+ # https://hydra.cc/docs/configure_hydra/intro/
+
+ # enable color logging
+ # install hydra-colorlog==1.2.0
+ defaults:
+   - override hydra_logging: colorlog
+   - override job_logging: colorlog
+
+ # output directory, generated dynamically on each run
+ run:
+   dir: ${trainer.output_path}
+   # dir: ${trainer.output_path}/${tasks.task_type}/runs/${name}_${now:%Y-%m-%d}_${now:%H-%M-%S}
+
+ job_logging:
+   handlers:
+     file:
+       # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
+       filename: ${hydra.runtime.output_dir}/${name}_${now:%Y-%m-%d}_${now:%H-%M-%S}.log
MolecularDiffusion/configs/interference/gen_cfg.yaml ADDED
@@ -0,0 +1,15 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: cfg
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: [3, 1.5]
+ property_names: ["S1_exc", "T1_exc"]
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   cfg_scale: 1
+   cfg_scale_schedule: null
MolecularDiffusion/configs/interference/gen_cfggg.yaml ADDED
@@ -0,0 +1,29 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: gradient_guidance # cfggg
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: [3, 1.5]
+ property_names: ["S1_exc", "T1_exc"]
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   cfg_scale: 1
+   target_function:
+     _target_: scripts.gradient_guidance.sf_energy_score.SFEnergyScore
+     _partial_: true
+   chkpt_directory: trained_models/egcl_guidance_s1t1.ckpt
+   gg_scale: 1e-3
+   max_norm: 1e-3
+   scheduler:
+     _target_: scripts.gradient_guidance.scheduler.CosineAnnealing
+     _partial_: true
+     T_max: 1000
+     eta_min: 0
+   guidance_ver: 2
+   guidance_at: 1
+   guidance_stop: 0
+   n_backwards: 3
MolecularDiffusion/configs/interference/gen_conditional.yaml ADDED
@@ -0,0 +1,12 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: conditional
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: [3, 1.5]
+ property_names: ["S1_exc", "T1_exc"]
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
MolecularDiffusion/configs/interference/gen_gg.yaml ADDED
@@ -0,0 +1,29 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: gradient_guidance # gg
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: []
+ property_names: []
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   cfg_scale: 0
+   target_function:
+     _target_: scripts.gradient_guidance.sf_energy_score.SFEnergyScore
+     _partial_: true
+   chkpt_directory: trained_models/egcl_guidance_s1t1.ckpt
+   gg_scale: 1e-3
+   max_norm: 1e-3
+   scheduler:
+     _target_: scripts.gradient_guidance.scheduler.CosineAnnealing
+     _partial_: true
+     T_max: 1000
+     eta_min: 0
+   guidance_ver: 2
+   guidance_at: 1
+   guidance_stop: 0
+   n_backwards: 0
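
The `_partial_: true` entries above make `hydra.utils.instantiate` return a `functools.partial` instead of an instance, so the generator can inject runtime arguments later; a minimal, repo-independent sketch of that mechanism:

```python
import hydra
from omegaconf import OmegaConf

node = OmegaConf.create({"_target_": "collections.Counter", "_partial_": True})
make_counter = hydra.utils.instantiate(node)  # functools.partial(Counter)
print(make_counter("aab"))                    # Counter({'a': 2, 'b': 1})
```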
MolecularDiffusion/configs/interference/gen_hybrid.yaml ADDED
@@ -0,0 +1,28 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: inpaint_cfg
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: [3, 1.5]
+ property_names: ["S1_exc", "T1_exc"]
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   cfg_scale: 1
+   reference_structure_path: "data/template_structures/INT2_0.xyz"
+   inpaint_cfgs:
+     t_start: 0.8
+     t_critical: 0.05
+
+     # inpaint
+     # denoising_strength: 0.7
+     # noise_initial_mask: False
+     # mask_node_index:
+     #   - 5
+     #   - 30
+     #   - 31
MolecularDiffusion/configs/interference/gen_inpaint.yaml ADDED
@@ -0,0 +1,69 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: inpaint
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: []
+ property_names: []
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   reference_structure_path: "data/template_structures/BINOLCpHHH.xyz"
+   condition_component: xh
+   inpaint_cfgs:
+     mask_node_index:
+       - 5
+       - 30
+       - 31
+       - 6
+       - 7
+       - 45
+       - 8
+       - 32
+       - 9
+       - 10
+       - 33
+       - 11
+       - 34
+       - 12
+       - 35
+       - 13
+       - 36
+       - 14
+       - 15
+       - 16
+       - 17
+       - 18
+       - 37
+       - 19
+       - 38
+       - 20
+       - 39
+       - 21
+       - 40
+       - 22
+       - 23
+       - 41
+       - 24
+       - 44
+       - 25
+       - 26
+       - 43
+       - 42
+     denoising_strength: 0.75
+     t_start: 0.8
+     t_critical_1: 0.8
+     t_critical_2: 1
+     d_threshold_f: 1.5
+     w_b: 10
+     all_frozen: True
+     use_covalent_radii: True
+     scale_factor: 1.2
+     noise_initial_mask: True
+     n_frames: 0
+   n_retrys: 0
+   t_retry: 180
MolecularDiffusion/configs/interference/gen_outpaint.yaml ADDED
@@ -0,0 +1,31 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: outpaint
+ sampling_mode: ddpm
+ num_generate: 100
+ mol_size: [0, 0]
+ max_mol_size: 0
+ target_values: []
+ property_names: []
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+
+ condition_configs:
+   reference_structure_path: data/template_structures/BINOLCp.xyz
+   condition_component: xh
+
+   outpaint_cfgs:
+     t_start: 0.8
+     t_critical_1: 0.7
+     t_critical_2: 0.4
+     d_threshold_f: 2
+     w_b: 0.1
+     all_frozen: false
+     use_covalent_radii: true
+     scale_factor: 1.1
+     noise_initial_mask: false
+     connector_dicts: {} # fill if needed, e.g. {0: [3]}
+
+   n_retrys: 3
+   t_retry: 180
MolecularDiffusion/configs/interference/gen_outpaintft.yaml ADDED
@@ -0,0 +1,18 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: outpaintft
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [76, 76]
+ target_values: []
+ property_names: []
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
+ condition_configs:
+   reference_structure_path: "data/template_structures/INT2_0.xyz"
+   outpaint_cfgs:
+     t_start: 1
+   n_retrys: 0
+   t_retry: 180
MolecularDiffusion/configs/interference/gen_unconditional.yaml ADDED
@@ -0,0 +1,11 @@
+ _target_: MolecularDiffusion.runmodes.generate.GenerativeFactory
+ task_type: unconditional
+ sampling_mode: "ddpm"
+ num_generate: 100
+ mol_size: [16]
+ target_values: []
+ property_names: []
+ batch_size: 1
+ seed: 86
+ n_frames: 0
+ output_path: generated_mol
MolecularDiffusion/configs/interference/prediction.yaml ADDED
@@ -0,0 +1,2 @@
+ prop_names: ["S1_exc", "T1_exc"]
+ hit_criteria: null
MolecularDiffusion/configs/logger/default.yaml ADDED
@@ -0,0 +1,9 @@
+ _target_: MolecularDiffusion.runmodes.train.Logger
+ logger: logging # wandb, logging
+ log_interval: 2
+ name_wandb: MolecularDiffusion
+ project_wandb: MolecularDiffusion
+ dir_wandb: ${trainer.output_path}
MolecularDiffusion/configs/logger/wandb.yaml ADDED
@@ -0,0 +1,9 @@
+ _target_: MolecularDiffusion.runmodes.train.Logger
+ logger: wandb # wandb, logging
+ log_interval: 2
+ name_wandb: MolecularDiffusion
+ project_wandb: MolecularDiffusion
+ dir_wandb: ${trainer.output_path}
MolecularDiffusion/configs/models/tabasco_transformer.yaml ADDED
@@ -0,0 +1,72 @@
+ # TABASCO Transformer model configuration
+ # State-of-the-art non-equivariant flow matching model for molecules
+ _target_: MolecularDiffusion.modules.tasks.diffusion_tabasco.TabascoDiffusionTask
+
+ # Number of atom types from dataset vocabulary
+ num_atom_types: 19 # Will be overridden by ${data.num_atom_types} at runtime
+
+ # Transformer backbone configuration
+ transformer_config:
+   _target_: MolecularDiffusion.modules.layers.tabasco.transformer_module.TransformerModule
+   spatial_dim: 3
+   atom_dim: 19 # Will be overridden by ${data.num_atom_types}
+   hidden_dim: 256
+   num_layers: 16
+   num_heads: 8
+   activation: SiLU
+   implementation: pytorch # or 'reimplemented'
+   cross_attention: true
+   add_sinusoid_posenc: true
+   concat_combine_input: false
+   custom_weight_init: null # or 'xavier', 'kaiming', etc.
+
+ # Continuous coordinate interpolant configuration
+ coords_interpolant_config:
+   _target_: MolecularDiffusion.modules.models.tabasco.flow.interpolate.SDEMetricInterpolant
+   key: coords
+   loss_weight: 1.0
+   centered: true
+   scale_noise_by_log_num_atoms: false
+   noise_scale: 1.0
+   # Langevin sampling schedule for SDE integration
+   langevin_sampling_schedule:
+     _target_: MolecularDiffusion.modules.models.tabasco.sample.noise_schedule.SampleNoiseSchedule
+     cutoff: 0.9
+     white_noise_sampling_scale: 0.01
+   # Time-dependent loss weighting
+   time_factor:
+     _target_: MolecularDiffusion.modules.models.tabasco.flow.time_factor.InverseTimeFactor
+     max_value: 100.0
+     min_value: 0.05
+     zero_before: 0.0
+     eps: 1.0e-6
+
+ # Discrete atom type interpolant configuration
+ atomics_interpolant_config:
+   _target_: MolecularDiffusion.modules.models.tabasco.flow.interpolate.DiscreteInterpolant
+   key: atomics
+   loss_weight: 0.1
+   # Time-dependent loss weighting
+   time_factor:
+     _target_: MolecularDiffusion.modules.models.tabasco.flow.time_factor.InverseTimeFactor
+     max_value: 100.0
+     min_value: 0.05
+     zero_before: 0.0
+     eps: 1.0e-6
+
+ # Flow matching training configuration
+ flow_matching_config:
+   _target_: MolecularDiffusion.modules.models.tabasco.flow_model.FlowMatchingModel
+   time_distribution:
+     _target_: MolecularDiffusion.modules.models.tabasco.flow.utils.HistogramTimeDistribution
+   time_alpha_factor: 1.8
+   num_random_augmentations: 7 # +1 original = 8 total
+   sample_schedule: log # or 'linear', 'power'
+   compile: false
+   interdist_loss: null
+
+ # Dataset statistics (populated at runtime)
+ dataset_stats:
+   max_atoms: 29 # Will be set from data config
+   atom_count_histogram: {} # Computed from dataset
+   all_smiles: [] # Collected from dataset
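The "Will be overridden by `${data.num_atom_types}` at runtime" comments refer to OmegaConf interpolation: instead of a hard-coded `19`, a key can point at the data config and resolve lazily. A minimal sketch of the mechanism (key paths are illustrative):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "data": {"num_atom_types": 19},  # populated from the dataset vocabulary
    "models": {"transformer_config": {"atom_dim": "${data.num_atom_types}"}},
})
assert cfg.models.transformer_config.atom_dim == 19  # resolved on access
cfg.data.num_atom_types = 24                          # changing the source ...
assert cfg.models.transformer_config.atom_dim == 24   # ... propagates automatically
```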
MolecularDiffusion/configs/tasks/diffusion.yaml ADDED
@@ -0,0 +1,48 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: diffusion
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 192
+ act_fn:
+   _target_: torch.nn.SiLU
+ num_layers: 9
+ attention: True
+ tanh: True
+ num_sublayers: 1
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # specific to diffusion
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2 # learned, cosine_x, polynomial_x, issnr_x, smld_x
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1,4,10]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10 # options: null, "maxmin", "mad", or value_<n> (e.g. value_10)
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ use_unknown_fallback: False
+ reference_indices: null # indices of core atoms for the outpainting objective
+ # evaluator parameters
+ use_posebuster: True
+ metrics: valid_posebuster # use_posebuster must be true
+ n_samples: 48
+ batch_size: 4
+ generative_analysis: True
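A rough sketch of what a `polynomial_2` schedule with `diffusion_noise_precision: 1e-5` typically computes, following the convention of E(3)-equivariant diffusion models (the repo's exact implementation may differ, e.g. in clipping details): the squared signal coefficient decays polynomially from ~1 to ~0 over `diffusion_steps`, and the precision keeps it strictly inside (0, 1).

```python
import numpy as np

def polynomial_alpha2(timesteps: int = 900, precision: float = 1e-5, power: float = 2.0):
    """Squared signal coefficient alpha_bar_t^2 for t = 0..timesteps (sketch)."""
    t = np.linspace(0.0, 1.0, timesteps + 1)
    alphas2 = (1.0 - t**power) ** 2                          # 1 at t=0, 0 at t=T
    alphas2 = precision + (1.0 - 2.0 * precision) * alphas2  # clamp into (0, 1)
    sigmas2 = 1.0 - alphas2                                  # variance-preserving noise
    return alphas2, sigmas2

a2, s2 = polynomial_alpha2()  # matches diffusion_steps: 900 above
```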
MolecularDiffusion/configs/tasks/diffusion_egt.yaml ADDED
@@ -0,0 +1,54 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_egt.ModelTaskFactory
+ task_type: diffusion
+ model_class: GraphTransformer
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_dims:
+   dx: 256
+   de: 64
+   dy: 4
+   n_head: 4
+   dim_ffX: 256
+   dim_ffE: 64
+   dim_ffy: 1
+ hidden_mlp_dims:
+   X: 256
+   E: 64
+   y: 256
+   pos: 512
+ act_fn_in:
+   _target_: torch.nn.SiLU
+ act_fn_out:
+   _target_: torch.nn.SiLU
+ num_layers: 6
+ dropout: 0.1
+ chkpt_path: null
+
+
+ # specific to diffusion
+ diffusion_steps: 400
+ diffusion_noise_schedule: polynomial_2 # learned, cosine_x, polynomial_x, issnr_x, smld_x
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1,4,10]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10 # options: null, "maxmin", "mad", or value_<n> (e.g. value_10)
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ reference_indices: null # indices of core atoms for the outpainting objective
+ # evaluator parameters
+ use_posebuster: True
+ metrics: valid_posebuster # use_posebuster must be true
+ n_samples: 24
+ generative_analysis: True
+ batch_size: 4
MolecularDiffusion/configs/tasks/diffusion_extraf.yaml ADDED
@@ -0,0 +1,47 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: diffusion
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 192
+ act_fn:
+   _target_: torch.nn.SiLU
+ num_layers: 1
+ attention: True
+ tanh: True
+ num_sublayers: 12
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # specific to diffusion
+ diffusion_steps: 400
+ diffusion_noise_schedule: polynomial_2 # learned, cosine_x, polynomial_x, issnr_x, smld_x
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1,4,10]
+ extra_norm_values: [10,10]
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10 # options: null, "maxmin", "mad", or value_<n> (e.g. value_10)
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ reference_indices: null # indices of core atoms for the outpainting objective
+ # evaluator parameters
+ use_posebuster: True
+ metrics: valid_posebuster # use_posebuster must be true
+ n_samples: 24
+ generative_analysis: True
+ batch_size: 4
MolecularDiffusion/configs/tasks/diffusion_hybrid.yaml ADDED
@@ -0,0 +1,95 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_esen.ModelTaskFactory
+ task_type: diffusion_hybrid
+
+ # === Atom Vocabulary ===
+ # Specify either atom_vocab directly OR use the one from data config
+ # Available base vocabularies: H, C, N, O, F, P, S, Cl, Br, I (common organic)
+ # The number of classes is automatically determined from vocab length
+ atom_vocab: ${data.atom_vocab}
+ # atom_vocab: ["C", "N", "O", "H", "F", "S", "Cl", "Br", "P", "I"] # Example custom
+
+ condition_names: []
+
+ # eSEN specific parameters
+ hidden_size: 64
+ hidden_channels: 64
+ num_layers: 9
+ lmax: 2
+ mmax: 2
+ grid_resolution: null
+ cutoff: 30
+ edge_channels: 128
+ distance_function: "gaussian"
+ num_distance_basis: 512
+ norm_type: "rms_norm_sh"
+ act_type: "s2"
+ mlp_type: "grid"
+ otf_graph: True
+ use_envelope: False
+ activation_checkpointing: False
+ global_attributes: False
+ sphere_embedding_type: "mixed" # DO NOT CHANGE
+ aggregation_method: "sum"
+
+ chkpt_path: null
+
+ # === Continuous Diffusion Parameters ===
+ diffusion_steps: 450
+ diffusion_noise_schedule: polynomial_2 # Options: cosine, polynomial_2, polynomial_3, learned
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: l2 # Options: vlb, l2
+ normalize_factors: [1, 1]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ reference_indices: null
+
+ # === Discrete Diffusion Parameters (Atom Types) ===
+ # Number of atom classes (automatically set from atom_vocab length if not specified)
+ num_atom_classes: 19
+
+ # Weight for discrete loss in combined loss: L_total = L_continuous + λ * L_discrete
+ discrete_loss_weight: 0.2
+
+ # Discrete masking schedule for absorbing-state diffusion
+ # Each schedule controls how quickly tokens get masked during forward diffusion
+ #
+ # Available schedules:
+ #   - "cosine"      : Smooth cosine decay (default, from improved DDPM)
+ #   - "linear"      : Linear increase in masking probability
+ #   - "sqrt"        : Square root schedule (faster initial masking)
+ #   - "quadratic"   : Quadratic schedule (slower initial, faster later)
+ #   - "cubic"       : Cubic schedule (even slower start than quadratic)
+ #   - "sigmoid"     : S-curve transition (smooth start and end)
+ #   - "exponential" : Exponential decay of survival probability
+ #   - "log"         : Logarithmic schedule (fast early, slow late)
+ #   - "uniform"     : Constant masking rate each step
+ #
+ discrete_schedule: "cosine"
+
+ # MLP layers for atom classification head
+ atom_head_mlp_layers: 2
+
+ # === eSEN Dynamics specific ===
+ use_adapter_module: False
+ tanh: True
+ coords_range: 10
+ normalization_factor: 1.0
+
+ # === Evaluator Parameters ===
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 96
+ batch_size: 8
+ generative_analysis: True
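The schedule names listed in the comment block of diffusion_hybrid.yaml admit a compact reading as "probability that an atom-type token has been masked by normalized time t in [0, 1]". A hedged sketch of the most common parameterizations (the repo's exact formulas may differ):

```python
import math

def mask_prob(t: float, schedule: str = "cosine") -> float:
    """P(token is masked by normalized time t) for absorbing-state diffusion (sketch)."""
    if schedule == "cosine":      # smooth: survival probability decays like cos
        return 1.0 - math.cos(0.5 * math.pi * t)
    if schedule == "linear":
        return t
    if schedule == "sqrt":        # faster initial masking
        return math.sqrt(t)
    if schedule == "quadratic":   # slower initial, faster later
        return t * t
    if schedule == "cubic":
        return t ** 3
    raise ValueError(f"unknown schedule: {schedule!r}")

# e.g. halfway through forward diffusion:
print({s: round(mask_prob(0.5, s), 3) for s in ("cosine", "linear", "sqrt", "quadratic")})
```

The discrete loss computed under this schedule is then folded into the combined objective with the configured weight, i.e. L_total = L_continuous + 0.2 * L_discrete for `discrete_loss_weight: 0.2`.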
MolecularDiffusion/configs/tasks/diffusion_hybrid_egcl.yaml ADDED
@@ -0,0 +1,53 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_egcl.ModelTaskFactory
+ task_type: diffusion_hybrid
+
+ # === Atom Vocabulary ===
+ atom_vocab: ${data.atom_vocab}
+
+ condition_names: []
+
+ # === EGNN Parameters ===
+ hidden_size: 192
+ num_layers: 9
+ attention: True
+ norm_diff: True
+ tanh: True
+ coords_range: 15
+ num_sublayers: 1
+ sin_embedding: True
+ include_cosine: False
+ normalization_factor: 1.0
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+
+ chkpt_path: null
+
+ # === Continuous Diffusion Parameters ===
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1, 4]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.0
+ mask_value: 5.0
+ normalize_condition: value_10
+ sp_regularizer_deploy: False
+
+ # === Discrete Diffusion Parameters (Atom Types) ===
+ num_atom_classes: 19
+ discrete_loss_weight: 0.2
+ discrete_schedule: "cosine"
+
+ # MLP layers for atom classification head
+ atom_head_mlp_layers: 2
+
+ # === Evaluator Parameters ===
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 48
+ batch_size: 8
+ generative_analysis: True
MolecularDiffusion/configs/tasks/diffusion_integer.yaml ADDED
@@ -0,0 +1,62 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_esen.ModelTaskFactory
+ task_type: diffusion
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+
+ # eSEN specific parameters
+ hidden_size: 256
+ hidden_channels: 256
+ num_layers: 4
+ lmax: 2
+ mmax: 2
+ grid_resolution: null
+ cutoff: 5.0
+ edge_channels: 128
+ distance_function: "gaussian"
+ num_distance_basis: 512
+ norm_type: "rms_norm_sh"
+ act_type: "s2"
+ mlp_type: "grid"
+ otf_graph: True #!!
+ use_envelope: False
+ activation_checkpointing: False
+ global_attributes: False
+ sphere_embedding_type: "gaussian" #!!
+ aggregation_method: "sum"
+
+ chkpt_path: null
+
+ # Diffusion kwargs
+ diffusion_steps: 450
+ diffusion_noise_schedule: polynomial_2
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1, 1]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ reference_indices: null
+
+ # eSEN_dynamics specific kwargs
+ use_adapter_module: False
+ tanh: True
+ coords_range: 10
+ normalization_factor: 1.0
+
+ # Evaluator parameters
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 96
+ batch_size: 8
+ generative_analysis: True
MolecularDiffusion/configs/tasks/diffusion_pretrained.yaml ADDED
@@ -0,0 +1,47 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: diffusion
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 192
+ act_fn:
+   _target_: torch.nn.SiLU
+ num_layers: 9
+ attention: True
+ tanh: True
+ num_sublayers: 1
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # specific to diffusion
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2 # learned, cosine_x, polynomial_x, issnr_x, smld_x
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1,4,10]
+ extra_norm_values: []
+ augment_noise: False
+ data_augmentation: False
+ context_mask_rate: 0.2
+ mask_value: 5
+ normalize_condition: value_10 # options: null, "maxmin", "mad", or value_<n> (e.g. value_10)
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+ reference_indices: null # indices of core atoms for the outpainting objective
+ # evaluator parameters
+ use_posebuster: True
+ metrics: valid_posebuster # use_posebuster must be true
+ n_samples: 24
+ generative_analysis: True
+ batch_size: 4
MolecularDiffusion/configs/tasks/diffusion_pyg.yaml ADDED
@@ -0,0 +1,82 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_esen.ModelTaskFactory
+ task_type: diffusion_pyg
+
+ # === Atom Vocabulary ===
+ atom_vocab: ${data.atom_vocab}
+
+ condition_names: []
+
+ # === eSEN Model Parameters ===
+ hidden_size: 256
+ hidden_channels: 32
+ num_layers: 9
+ lmax: 2
+ mmax: 2
+ grid_resolution: null
+ cutoff: 15
+ edge_channels: 128
+ distance_function: "gaussian"
+ num_distance_basis: 10
+ norm_type: "rms_norm_sh"
+ act_type: "s2"
+ mlp_type: "grid"
+ otf_graph: True
+ use_envelope: False
+ activation_checkpointing: False
+ global_attributes: False
+
+ # IMPORTANT: Use "gaussian" for float features during diffusion!
+ # "gaussian" uses Gaussian smearing + MLP, fully float-compatible
+ # Other options ("embedding", "mixed") require integer atomic_numbers
+ sphere_embedding_type: "gaussian"
+
+ aggregation_method: "sum"
+
+ chkpt_path: null
+
+ # === Continuous Diffusion Parameters ===
+ # All features (positions, one-hot, integer) use continuous Gaussian diffusion
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2 # Options: cosine, polynomial_2, polynomial_3, learned
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb # Options: vlb, l2
+
+ # Normalization factors: [positions, categorical (one-hot), integer (atomic_numbers)]
+ normalize_factors: [1.0, 4.0, 10.0]
+ extra_norm_values: []
+
+ # Data augmentation
+ augment_noise: False
+ data_augmentation: False
+
+ # Context masking for classifier-free guidance
+ context_mask_rate: 0.0
+ mask_value: 0.0
+ normalize_condition: null
+
+ # Self-paced learning regularizer
+ sp_regularizer_deploy: False
+ sp_regularizer_regularizer: hard
+ sp_regularizer_lambda_: 0
+ sp_regularizer_lambda_2: 1000
+ sp_regularizer_lambda_update_value: 1
+ sp_regularizer_lambda_update_step: 100
+ sp_regularizer_polynomial_p: 1.1
+ sp_regularizer_warm_up_steps: 100
+
+ # Outpainting/inpainting
+ reference_indices: null
+ use_unknown_fallback: False # Set to True when data.allow_unknown is True
+
+ # === eSEN Dynamics Specific ===
+ use_adapter_module: False
+ tanh: True
+ coords_range: 10
+ normalization_factor: 1.0
+
+ # === Evaluation Parameters ===
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 48
+ batch_size: 8
+ generative_analysis: True
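The `sphere_embedding_type: "gaussian"` comment in diffusion_pyg.yaml is the crux of that config: during diffusion, atom features are noisy floats, so an integer embedding lookup would break. Gaussian smearing sidesteps this by expanding a scalar onto a grid of radial basis functions, which is differentiable and accepts any real value. A minimal SchNet-style sketch (grid bounds and basis count are illustrative):

```python
import torch

def gaussian_smearing(x: torch.Tensor, start: float = 0.0, stop: float = 20.0,
                      num_basis: int = 16) -> torch.Tensor:
    """Expand scalar features onto Gaussian basis functions (sketch)."""
    centers = torch.linspace(start, stop, num_basis)
    width = (stop - start) / (num_basis - 1)
    return torch.exp(-((x.unsqueeze(-1) - centers) ** 2) / (2.0 * width ** 2))

z_noisy = torch.tensor([5.7, 7.2, 8.1])  # diffused (non-integer) atomic numbers
feats = gaussian_smearing(z_noisy)       # shape (3, 16); an MLP then maps this to node features
```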
MolecularDiffusion/configs/tasks/diffusion_pyg_egcl.yaml ADDED
@@ -0,0 +1,55 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_egcl.ModelTaskFactory
+ task_type: diffusion_pyg
+
+ # === Atom Vocabulary ===
+ atom_vocab: ${data.atom_vocab}
+
+ condition_names: []
+
+ # === EGNN Parameters ===
+ hidden_size: 256
+ num_layers: 9
+ attention: True
+ norm_diff: True
+ tanh: True
+ coords_range: 10
+ num_sublayers: 1
+ sin_embedding: False
+ include_cosine: True
+ normalization_factor: 1.0
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+
+ chkpt_path: null
+
+ # === Continuous Diffusion Parameters ===
+ # All features use continuous Gaussian diffusion (same as EnVariationalDiffusion)
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+
+ # Normalization factors: [positions, categorical (one-hot), integer (atomic_numbers)]
+ normalize_factors: [1.0, 4.0, 10.0]
+ extra_norm_values: []
+
+ # Data augmentation
+ augment_noise: False
+ data_augmentation: False
+
+ # Context masking for classifier-free guidance
+ context_mask_rate: 0.0
+ mask_value: 0.0
+ normalize_condition: null
+
+ # Self-paced learning regularizer
+ sp_regularizer_deploy: False
+ use_unknown_fallback: False # Set to True when data.allow_unknown is True
+
+ # === Evaluation Parameters ===
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 48
+ batch_size: 8
+ generative_analysis: True
MolecularDiffusion/configs/tasks/diffusion_pyg_egt.yaml ADDED
@@ -0,0 +1,56 @@
+ _target_: MolecularDiffusion.runmodes.train.tasks_egt.ModelTaskFactory
+ task_type: diffusion_pyg
+
+ # === Atom Vocabulary ===
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+
+ # === Graph Transformer Parameters ===
+ model_class: GraphTransformerPyG
+ hidden_dims:
+   dx: 256
+   de: 1
+   dy: 32
+   n_head: 4
+   dim_ffX: 256
+   dim_ffE: 1
+   dim_ffy: 32
+ hidden_mlp_dims:
+   X: 256
+   E: 1
+   y: 32
+   pos: 512
+ act_fn_in:
+   _target_: torch.nn.SiLU
+ act_fn_out:
+   _target_: torch.nn.SiLU
+ num_layers: 6
+ dropout: 0.1
+ chkpt_path: null
+
+ # === Diffusion Parameters ===
+ diffusion_steps: 900
+ diffusion_noise_schedule: polynomial_2
+ diffusion_noise_precision: 1e-5
+ diffusion_loss_type: vlb
+ normalize_factors: [1.0, 4.0, 10.0]
+ extra_norm_values: []
+
+ # Data augmentation
+ augment_noise: False
+ data_augmentation: False
+
+ # Context masking for CFG
+ context_mask_rate: 0.0
+ mask_value: 0.0
+ normalize_condition: null
+
+ # Self-paced regularizer
+ sp_regularizer_deploy: False
+
+ # === Evaluation ===
+ use_posebuster: True
+ metrics: valid_posebuster
+ n_samples: 48
+ batch_size: 8
+ generative_analysis: True
MolecularDiffusion/configs/tasks/diffusion_tabasco.yaml ADDED
@@ -0,0 +1,66 @@
+ # TABASCO diffusion task configuration
+ # This config is referenced by: defaults: - override tasks: diffusion_tabasco
+
+ _target_: MolecularDiffusion.modules.tasks.diffusion_tabasco.ModelTaskFactory
+ task_type: diffusion_tabasco
+
+ # Automatically populated from dataset
+ num_atom_types: ???
+
+ # Transformer backbone configuration
+ transformer_config:
+   spatial_dim: 3
+   atom_dim: ???
+   hidden_dim: 256
+   num_layers: 16
+   num_heads: 8
+   activation: SiLU
+   implementation: pytorch
+   cross_attention: true
+   add_sinusoid_posenc: true
+   concat_combine_input: false
+   custom_weight_init: null
+
+ # Continuous coordinate interpolant
+ coords_interpolant_config:
+   key: coords
+   loss_weight: 1.0
+   centered: true
+   scale_noise_by_log_num_atoms: false
+   noise_scale: 1.0
+   langevin_sampling_schedule:
+     _target_: MolecularDiffusion.modules.models.tabasco.sample.noise_schedule.SampleNoiseSchedule
+     cutoff: 0.9
+     white_noise_sampling_scale: 0.01
+   time_factor:
+     _target_: MolecularDiffusion.modules.models.tabasco.flow.time_factor.InverseTimeFactor
+     max_value: 100.0
+     min_value: 0.05
+     zero_before: 0.0
+     eps: 1.0e-6
+
+ # Discrete atom type interpolant
+ atomics_interpolant_config:
+   key: atomics
+   loss_weight: 0.1
+   time_factor:
+     _target_: MolecularDiffusion.modules.models.tabasco.flow.time_factor.InverseTimeFactor
+     max_value: 100.0
+     min_value: 0.05
+     zero_before: 0.0
+     eps: 1.0e-6
+
+ # Flow matching configuration
+ flow_matching_config:
+   time_distribution: beta
+   time_alpha_factor: 1.8
+   num_random_augmentations: 7
+   sample_schedule: log
+   compile: false
+   interdist_loss: null
+
+ # Dataset statistics (populated at runtime)
+ dataset_stats:
+   max_atoms: ???
+   atom_count_histogram: {}
+   all_smiles: []
MolecularDiffusion/configs/tasks/guidance.yaml ADDED
@@ -0,0 +1,40 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: guidance
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 512
+ act_fn:
+   _target_: torch.nn.ReLU
+ num_layers: 1
+ attention: True
+ tanh: True
+ num_sublayers: 5
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # specific to guidance
+ task_learn: [S1_exc,T1_exc]
+ criterion: mse
+ metric: [mae]
+ num_mlp_layer: 3
+ mlp_dropout: 0.2
+ mlp_batch_norm: True # True/False for legacy mode, null/'layernorm'/'batchnorm' for new mode
+ prediction_mlp_type: legacy # 'legacy' (backward compat), 'pernode', or 'padded'
+ prediction_activation: relu # 'relu' or 'silu'
+ diffusion_steps: 900
+ diffusion_noise_precision: 1e-5
+ nu_arr: [2,2,2]
+ mapping: ["pos", "categorical", "integer"]
+ weight_classes: null
+ norm_values: [1,4,10]
+ t_max: 0.7
+ loss_weighting: linear
+
+
+
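A guidance model of this kind is a property regressor trained on *noisy* diffusion intermediates so its gradients can steer sampling. One plausible reading of `t_max: 0.7` and `loss_weighting: linear` is sketched below: train only on steps up to 0.7·T and down-weight noisier steps linearly. This is an assumption about the semantics, not a transcription of the repo's code:

```python
import torch

def sample_steps_and_weights(batch_size: int, diffusion_steps: int = 900,
                             t_max: float = 0.7):
    """Sample timesteps t <= t_max * T with linearly decaying loss weights (sketch)."""
    t_cut = int(t_max * diffusion_steps)
    t = torch.randint(0, t_cut + 1, (batch_size,))
    weights = 1.0 - t.float() / t_cut   # 1 at t=0 (clean data), 0 at the cutoff
    return t, weights

t, w = sample_steps_and_weights(8)
# a per-sample regression loss would then be weighted as:
# (w * (pred - target).pow(2).mean(dim=-1)).mean()
```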
MolecularDiffusion/configs/tasks/guidance_esen.yaml ADDED
@@ -0,0 +1,43 @@
+ # Uses the existing ModelTaskFactory from tasks_esen.py with task_type: guidance
+ _target_: MolecularDiffusion.runmodes.train.tasks_esen.ModelTaskFactory
+ task_type: guidance
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+
+ # eSEN Backbone parameters
+ sphere_channels: 128
+ hidden_channels: 128
+ lmax: 2
+ mmax: 2
+ num_layers: 4
+ edge_channels: 128
+ distance_function: "gaussian"
+ num_distance_basis: 512
+ cutoff: 5.0
+ max_neighbors: 300
+ norm_type: "rms_norm_sh"
+ act_type: "s2"
+ mlp_type: "grid"
+
+ # CRITICAL: Use "mlp" or "gaussian" for differentiable gradients
+ sphere_embedding_type: "mlp"
+ # in_node_channels is computed by factory: len(atom_vocab) + n_extra + 1 (charge) + 1 (time)
+
+ # Guidance-specific parameters
+ task_learn: [S1_exc, T1_exc]
+ criterion: mse
+ metric: [mae]
+ num_mlp_layer: 3
+ mlp_dropout: 0.2
+ mlp_batch_norm: True # True/False for legacy mode, null/'layernorm'/'batchnorm' for new mode
+ prediction_mlp_type: legacy # 'legacy' (backward compat), 'pernode', or 'padded'
+ prediction_activation: relu # 'relu' or 'silu'
+ diffusion_steps: 600
+ diffusion_noise_precision: 1e-5
+ nu_arr: [2, 2, 2]
+ mapping: ["pos", "categorical", "integer"]
+ weight_classes: null
+ norm_values: [1, 4, 10]
+ t_max: 0.7
+ loss_weighting: linear
+ normalization: False
MolecularDiffusion/configs/tasks/guidance_pc.yaml ADDED
@@ -0,0 +1,43 @@
+ # Configuration for PointCloud-optimized EGCL Guidance Model
+ # Uses GuidanceModelPredictionPointCloud with dense_mode=True
+
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: guidance
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 512
+ act_fn:
+   _target_: torch.nn.ReLU
+ num_layers: 1
+ attention: True
+ tanh: True
+ num_sublayers: 5
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # Enable dense mode for PointCloud inference
+ dense_mode: True
+
+ # Guidance-specific parameters
+ task_learn: [S1_exc, T1_exc]
+ criterion: mse
+ metric: [mae]
+ num_mlp_layer: 3
+ mlp_dropout: 0.2
+ mlp_batch_norm: True # True/False for legacy mode, null/'layernorm'/'batchnorm' for new mode
+ prediction_mlp_type: legacy # 'legacy' (backward compat), 'pernode', or 'padded'
+ prediction_activation: relu # 'relu' or 'silu'
+ diffusion_steps: 900
+ diffusion_noise_precision: 1e-5
+ nu_arr: [2, 2, 2]
+ mapping: ["pos", "categorical", "integer"]
+ weight_classes: null
+ norm_values: [1, 4, 10]
+ t_max: 0.7
+ loss_weighting: linear
MolecularDiffusion/configs/tasks/ldm_dit.yaml ADDED
@@ -0,0 +1,24 @@
+ # Latent Diffusion with DiT denoiser
+ _target_: MolecularDiffusion.modules.tasks.diffusion_ldm.LDMTaskFactory
+ task_type: ldm_dit
+ _recursive_: False
+ autoencoder_ckpt: ??? # Required: path to pre-trained VAE
+
+ denoiser:
+   _target_: MolecularDiffusion.modules.models.ldm.denoisers.dit.DiT
+   # d_x is auto-inferred from VAE latent_dim
+   d_model: 384
+   num_layers: 12
+   nhead: 6
+   class_dropout_prob: 0.1
+
+ interpolant:
+   type: flow_matching
+   min_t: 0.01
+   corrupt: true
+   num_timesteps: 100
+   self_condition: false
+   self_condition_prob: 0.5
+
+ # Data augmentation
+ augment_rotation: true
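The `interpolant` block configures flow matching in latent space. A minimal sketch of what `type: flow_matching` with `min_t: 0.01` and `corrupt: true` typically means: mix the clean latent with Gaussian noise at a random time t >= min_t and regress the constant velocity. The details (direction of t, conditioning, self-conditioning) are assumptions here:

```python
import torch

def flow_matching_corrupt(x1: torch.Tensor, min_t: float = 0.01):
    """Linear interpolant x_t = (1 - t) * noise + t * x1 with velocity target x1 - noise."""
    t = min_t + (1.0 - min_t) * torch.rand(x1.shape[0], 1)  # avoid the degenerate t = 0
    x0 = torch.randn_like(x1)
    xt = (1.0 - t) * x0 + t * x1
    v_target = x1 - x0        # the denoiser (here, the DiT) is trained to predict this
    return xt, t, v_target

latents = torch.randn(4, 32)  # batch of VAE latents (dimension illustrative)
xt, t, v = flow_matching_corrupt(latents)
```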
MolecularDiffusion/configs/tasks/regression.yaml ADDED
@@ -0,0 +1,30 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_EGCL
+ task_type: regression
+ atom_vocab: ${data.atom_vocab}
+ condition_names: []
+ hidden_size: 512
+ act_fn:
+   _target_: torch.nn.ReLU
+ num_layers: 1
+ attention: True
+ tanh: True
+ num_sublayers: 5
+ sin_embedding: False
+ aggregation_method: "sum"
+ dropout: 0.0
+ normalization: False # For EGNN backbone layer norm
+ include_cosine: True
+ norm_constant: 1.0
+ normalization_factor: 1.0
+ chkpt_path: null
+
+ # specific to regression
+ task_learn: [S1_exc,T1_exc]
+ criterion: mse
+ metric: [mae]
+ num_mlp_layer: 3
+ mlp_batch_norm: batchnorm # Options: null, layernorm, batchnorm
+ target_normalization: True # Normalize targets by mean/std in loss
+ mlp_dropout: 0.2
+ prediction_mlp_type: "pernode"
+ prediction_activation: "relu"
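`target_normalization: True` means the regression loss is computed on standardized targets; a sketch of the usual pattern (statistics from the training split), with dummy tensors standing in for real `[S1_exc, T1_exc]` labels:

```python
import torch
import torch.nn.functional as F

targets = torch.randn(32, 2) * 0.5 + 3.0  # dummy [S1_exc, T1_exc] labels
pred = torch.randn(32, 2)                 # raw network output (normalized scale)

mean, std = targets.mean(dim=0), targets.std(dim=0)  # training-set statistics
loss = F.mse_loss(pred, (targets - mean) / std)      # optimize in normalized space
physical = pred * std + mean                         # un-normalize for reporting (e.g. eV)
```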
MolecularDiffusion/configs/tasks/regression_esen.yaml ADDED
@@ -0,0 +1,34 @@
+ _target_: MolecularDiffusion.runmodes.train.ModelTaskFactory_ESEN
+ task_type: regression
+ atom_vocab: ${data.atom_vocab}
+ # condition_names: []
+ hidden_size: 256
+ hidden_channels: 256
+ num_layers: 4
+ lmax: 2
+ mmax: 2
+ grid_resolution: null
+ cutoff: 5.0
+ edge_channels: 128
+ distance_function: "gaussian"
+ num_distance_basis: 512
+ norm_type: "rms_norm_sh"
+ act_type: "s2"
+ mlp_type: "grid"
+ use_envelope: False
+ activation_checkpointing: False
+ global_attributes: False
+ sphere_embedding_type: "mixed"
+ aggregation_method: mean
+ chkpt_path: null
+
+ # specific to regression
+ task_learn: [S1_exc,T1_exc]
+ criterion: mse
+ metric: [mae]
+ num_mlp_layer: 3
+ mlp_dropout: 0.2
+ mlp_batch_norm: batchnorm # Options: null, layernorm, batchnorm
+ target_normalization: True # Normalize targets by mean/std in loss
+ prediction_mlp_type: "pernode"
+ prediction_activation: "relu"