# PolyFusionAgent / Downstream Tasks / Polymer_Generation.py
# (Hugging Face Hub page residue removed; last update commit 662687e by kaurm43.)
import os
import re
import sys
import csv
import json
import math
import time
import copy
import random
import shutil
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Any
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# Increase CSV field size limit safely (helps when JSON blobs are stored in CSV cells)
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    # Some platforms (e.g. 32-bit builds / Windows) reject sys.maxsize because it
    # overflows the underlying C long; fall back to the largest 32-bit value.
    csv.field_size_limit(2**31 - 1)
# HF Transformers (SELFIES-TED)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.modeling_outputs import BaseModelOutput
# Shared encoders/helpers from PolyFusion
from PolyFusion.GINE import GineEncoder
from PolyFusion.SchNet import NodeSchNetWrapper
from PolyFusion.Transformer import PooledFingerprintEncoder as FingerprintEncoder
from PolyFusion.DeBERTav2 import PSMILESDebertaEncoder, build_psmiles_tokenizer
# Optional chemistry dependencies (recommended)
# RDKit powers SMILES parsing/canonicalization; selfies powers SELFIES encode/decode.
# Both are optional: when absent, the corresponding helpers below degrade gracefully
# (they return None / False instead of raising).
RDKit_AVAILABLE = False
SELFIES_AVAILABLE = False
try:
    from rdkit import Chem
    from rdkit.Chem import AllChem, DataStructs
    RDKit_AVAILABLE = True
except Exception:
    RDKit_AVAILABLE = False
try:
    import selfies as sf
    SELFIES_AVAILABLE = True
except Exception:
    SELFIES_AVAILABLE = False
# =============================================================================
# Configuration (paths are placeholders; replace with your actual filesystem paths)
# =============================================================================
@dataclass(frozen=True)
class Config:
    """Immutable filesystem configuration; all paths are placeholders to replace per deployment."""
    # -------------------------------------------------------------------------
    # Input data and pretrained artifacts (placeholders)
    # -------------------------------------------------------------------------
    BASE_DIR: str = "/path/to/Polymer_Foundational_Model"
    POLYINFO_PATH: str = "/path/to/polyinfo_with_modalities.csv"
    # Multimodal CL checkpoint (for the fused encoder)
    PRETRAINED_MULTIMODAL_DIR: str = "/path/to/multimodal_output/best"
    # Unimodal encoder checkpoints
    BEST_GINE_DIR: str = "/path/to/gin_output/best"
    BEST_SCHNET_DIR: str = "/path/to/schnet_output/best"
    BEST_FP_DIR: str = "/path/to/fingerprint_mlm_output/best"
    BEST_PSMILES_DIR: str = "/path/to/polybert_output/best"
    # SentencePiece model for PSMILES tokenizer (placeholder)
    SPM_MODEL_PATH: str = "/path/to/spm.model"
    # -------------------------------------------------------------------------
    # Output folders
    # -------------------------------------------------------------------------
    OUTPUT_DIR: str = "/path/to/multimodal_inverse_design_output"

    @property
    def OUTPUT_RESULTS(self) -> str:
        """Text file collecting inverse-design results."""
        return os.path.join(self.OUTPUT_DIR, "inverse_design_results.txt")

    @property
    def OUTPUT_MODELS_DIR(self) -> str:
        """Directory holding the best per-fold model checkpoints."""
        return os.path.join(self.OUTPUT_DIR, "best_models")

    @property
    def OUTPUT_GENERATIONS_DIR(self) -> str:
        """Directory holding generations from the best CV fold."""
        return os.path.join(self.OUTPUT_DIR, "best_fold_generations")
# Module-level singleton configuration used throughout this script.
CFG = Config()
# Properties to run
# (matched heuristically against dataframe columns by find_property_columns)
REQUESTED_PROPERTIES = [
    "density",
    "glass transition",
    "melting",
    "thermal decomposition",
]
# -------------------------------------------------------------------------
# Model sizes / dims (match CL encoder + pretraining)
# -------------------------------------------------------------------------
CL_EMB_DIM = 600
MAX_ATOMIC_Z = 85  # covers At (Z=85), which this file uses as the polymer endpoint marker
MASK_ATOM_ID = MAX_ATOMIC_Z + 1
# GINE params
NODE_EMB_DIM = 300
EDGE_EMB_DIM = 300
NUM_GNN_LAYERS = 5
# SchNet params
SCHNET_NUM_GAUSSIANS = 50
SCHNET_NUM_INTERACTIONS = 6
SCHNET_CUTOFF = 10.0
SCHNET_MAX_NEIGHBORS = 64
SCHNET_HIDDEN = 600
# Fingerprint params
FP_LENGTH = 2048  # Morgan r3 bit-vector length (see _parse_fingerprints)
MASK_TOKEN_ID_FP = 2
VOCAB_SIZE_FP = 3
# DeBERTa params
DEBERTA_HIDDEN = 600
PSMILES_MAX_LEN = 128
# SELFIES-TED generation limits
GEN_MAX_LEN = 256
GEN_MIN_LEN = 10
# -------------------------------------------------------------------------
# Decoder fine-tuning schedule (single head)
# -------------------------------------------------------------------------
BATCH_SIZE = 32
NUM_EPOCHS = 100
PATIENCE = 10  # early-stopping patience, in epochs
WEIGHT_DECAY = 0.0
LEARNING_RATE = 1e-4
COSINE_ETA_MIN = 1e-6
# Noise injection (latent space)
LATENT_NOISE_STD_TRAIN = 0.10  # training-time denoising std
LATENT_NOISE_STD_GEN = 0.15  # generation-time exploration std
N_FOLD_NOISE_SAMPLING = 16  # sampling multiplicity around each seed embedding
# Sampling config (decoder)
GEN_TOP_P = 0.92
GEN_TEMPERATURE = 1.0
GEN_REPETITION_PENALTY = 1.05
# Cross-validation
NUM_FOLDS = 5
# Property guidance tolerance (scaled space)
PROP_TOL_SCALED = 0.5
PROP_TOL_UNSCALED_ABS = None
# GPR settings (PSMILES latent)
USE_PCA_BEFORE_GPR = True
PCA_DIM = 64
GPR_ALPHA = 1e-6
# Verification (optional auxiliary predictor trained per fold)
VERIFY_GENERATED_PROPERTIES = True
PROP_PRED_EPOCHS = 20
PROP_PRED_PATIENCE = 5
PROP_PRED_BATCH_SIZE = 32
PROP_PRED_LR = 3e-4
PROP_PRED_WEIGHT_DECAY = 0.0
# Runtime / hardware defaults
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_AMP = bool(torch.cuda.is_available())  # mixed precision only when CUDA is present
AMP_DTYPE = torch.float16
NUM_WORKERS = 0 if os.name == "nt" else 1  # Windows: avoid DataLoader worker processes
warnings.filterwarnings("ignore", category=UserWarning)
def ensure_output_dirs(cfg: Config) -> None:
    """Create the results/models/generations output directories if absent."""
    for directory in (cfg.OUTPUT_DIR, cfg.OUTPUT_MODELS_DIR, cfg.OUTPUT_GENERATIONS_DIR):
        os.makedirs(directory, exist_ok=True)
# =============================================================================
# Utilities
# =============================================================================
def _safe_json_load(x):
"""Robust JSON parsing for CSV cells that may contain dict/list JSON (or slightly malformed strings)."""
if x is None:
return None
if isinstance(x, (dict, list)):
return x
s = str(x).strip()
if not s:
return None
try:
return json.loads(s)
except Exception:
try:
return json.loads(s.replace("'", '"'))
except Exception:
return None
def set_seed(seed: int):
    """Seed Python, NumPy and PyTorch RNGs (best-effort reproducibility)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    try:
        # Keep cuDNN autotuning on: speed is preferred over strict cross-GPU determinism.
        torch.backends.cudnn.benchmark = True
    except Exception:
        pass
def make_json_serializable(obj):
    """
    Recursively coerce common scientific objects into JSON-serializable types.

    dicts/lists/tuples/sets recurse; numpy arrays/scalars and torch tensors
    become Python lists/scalars; pandas timestamps become strings; anything
    unrecognized falls back to str(obj).
    """
    coerce = make_json_serializable
    if isinstance(obj, dict):
        return {coerce(key): coerce(val) for key, val in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [coerce(element) for element in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        try:
            return obj.item()
        except Exception:
            return float(obj)
    if isinstance(obj, torch.Tensor):
        try:
            return obj.detach().cpu().tolist()
        except Exception:
            return None
    if isinstance(obj, (pd.Timestamp, pd.Timedelta)):
        return str(obj)
    try:
        if isinstance(obj, (float, int, str, bool, type(None))):
            return obj
    except Exception:
        pass
    return str(obj)
def find_property_columns(columns, requested: Optional[List[str]] = None):
    """
    Heuristically map requested property names to dataframe columns.

    Args:
        columns: iterable of dataframe column names.
        requested: property names to resolve; defaults to the module-level
            REQUESTED_PROPERTIES list (keeps the original call signature working).

    Returns:
        Dict mapping each requested name to its matched column name, or None
        when no column matches (a warning is printed in that case).

    Notes:
        - Matching is case-insensitive; token-level matches are preferred,
          with a substring match as fallback.
        - Special-case: "density" never matches cohesive-energy columns
          (the "cohesive" substring test covers "cohesive energy" too).
    """
    if requested is None:
        requested = REQUESTED_PROPERTIES
    lowered = {c.lower(): c for c in columns}
    found = {}
    for req in requested:
        req_low = req.lower().strip()
        exact = None
        # Pass 1: token-level or exact lowercase match.
        for c_low, c_orig in lowered.items():
            tokens = set(c_low.replace('_', ' ').split())
            if req_low in tokens or c_low == req_low:
                if req_low == "density" and "cohesive" in c_low:
                    continue
                exact = c_orig
                break
        if exact is not None:
            found[req] = exact
            continue
        # Pass 2: substring fallback (handles multi-word properties like "glass transition").
        candidates = [c_orig for c_low, c_orig in lowered.items() if req_low in c_low]
        if req_low == "density":
            candidates = [c for c in candidates if "cohesive" not in c.lower()]
        chosen = candidates[0] if candidates else None
        found[req] = chosen
        if chosen is None:
            print(f"[WARN] Could not match requested property '{req}' to any column.")
        else:
            print(f"[INFO] Mapped requested property '{req}' -> column '{chosen}'")
    return found
# =============================================================================
# Graph / geometry / fingerprint parsing for multimodal CL encoding
# =============================================================================
def _parse_graph_for_gine(graph_field):
    """
    Convert a stored 'graph' JSON blob into the tensor inputs expected by GineEncoder.

    Returns:
        Dict with keys 'z' (long), 'chirality' (float), 'formal_charge' (float),
        'edge_index' ([2, E] long), 'edge_attr' ([E, 3] float), or None if the
        graph is missing or malformed.
    """
    gf = _safe_json_load(graph_field)
    if not isinstance(gf, dict):
        return None
    node_features = gf.get("node_features", None)
    if not node_features or not isinstance(node_features, list):
        return None
    # Per-node attributes; malformed entries fall back to 0 / 0.0 rather than failing.
    atomic_nums, chirality_vals, formal_charges = [], [], []
    for nf in node_features:
        if not isinstance(nf, dict):
            continue
        # Accept both 'atomic_num' and 'atomic_number' spellings.
        an = nf.get("atomic_num", nf.get("atomic_number", 0))
        ch = nf.get("chirality", 0)
        fc = nf.get("formal_charge", 0)
        try:
            atomic_nums.append(int(an))
        except Exception:
            atomic_nums.append(0)
        try:
            chirality_vals.append(float(ch))
        except Exception:
            chirality_vals.append(0.0)
        try:
            formal_charges.append(float(fc))
        except Exception:
            formal_charges.append(0.0)
    if len(atomic_nums) == 0:
        return None
    edge_indices_raw = gf.get("edge_indices", None)
    edge_features_raw = gf.get("edge_features", None)
    srcs, dsts = [], []
    # Handle two common representations:
    # (a) edge_indices = [[u,v], [u,v], ...]
    # (b) edge_indices = [[srcs...], [dsts...]]
    if edge_indices_raw is None:
        # Fallback: derive edges from a dense adjacency matrix, if provided.
        adj = gf.get("adjacency_matrix", None)
        if isinstance(adj, list):
            for i_r, row_adj in enumerate(adj):
                if not isinstance(row_adj, list):
                    continue
                for j, val in enumerate(row_adj):
                    if val:
                        srcs.append(i_r)
                        dsts.append(j)
    else:
        try:
            if isinstance(edge_indices_raw, list) and len(edge_indices_raw) > 0:
                if isinstance(edge_indices_raw[0], list) and len(edge_indices_raw[0]) == 2:
                    # Representation (a): list of [u, v] pairs.
                    srcs = [int(p[0]) for p in edge_indices_raw]
                    dsts = [int(p[1]) for p in edge_indices_raw]
                elif len(edge_indices_raw) == 2:
                    # Representation (b): two parallel index lists.
                    srcs = [int(x) for x in edge_indices_raw[0]]
                    dsts = [int(x) for x in edge_indices_raw[1]]
        except Exception:
            srcs, dsts = [], []
    if len(srcs) == 0:
        # Edge-free graph: return empty edge tensors of the expected shapes.
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.zeros((0, 3), dtype=torch.float)
        return {
            "z": torch.tensor(atomic_nums, dtype=torch.long),
            "chirality": torch.tensor(chirality_vals, dtype=torch.float),
            "formal_charge": torch.tensor(formal_charges, dtype=torch.float),
            "edge_index": edge_index,
            "edge_attr": edge_attr,
        }
    edge_index = torch.tensor([srcs, dsts], dtype=torch.long)
    # Edge attributes (bond_type, stereo, is_conjugated) if present; else zeros
    if isinstance(edge_features_raw, list) and len(edge_features_raw) == len(srcs):
        bt, st, ic = [], [], []
        for ef in edge_features_raw:
            if isinstance(ef, dict):
                bt.append(float(ef.get("bond_type", 0)))
                st.append(float(ef.get("stereo", 0)))
                ic.append(float(1.0 if ef.get("is_conjugated", False) else 0.0))
            else:
                bt.append(0.0)
                st.append(0.0)
                ic.append(0.0)
        edge_attr = torch.tensor(list(zip(bt, st, ic)), dtype=torch.float)
    else:
        edge_attr = torch.zeros((len(srcs), 3), dtype=torch.float)
    return {
        "z": torch.tensor(atomic_nums, dtype=torch.long),
        "chirality": torch.tensor(chirality_vals, dtype=torch.float),
        "formal_charge": torch.tensor(formal_charges, dtype=torch.float),
        "edge_index": edge_index,
        "edge_attr": edge_attr,
    }
def _parse_geometry_for_schnet(geom_field):
    """
    Extract SchNet inputs from a stored 'geometry' JSON blob.

    Reads the 'best_conformer' entry and returns a dict with 'z' (LongTensor of
    atomic numbers) and 'pos' (FloatTensor of coordinates), or None when the
    blob is missing, malformed, empty, or the lengths disagree.
    """
    payload = _safe_json_load(geom_field)
    if not isinstance(payload, dict):
        return None
    conformer = payload.get("best_conformer")
    if not isinstance(conformer, dict):
        return None
    z_list = conformer.get("atomic_numbers", [])
    xyz = conformer.get("coordinates", [])
    if not (isinstance(z_list, list) and isinstance(xyz, list)):
        return None
    if not z_list or len(z_list) != len(xyz):
        return None
    return {
        "z": torch.tensor(z_list, dtype=torch.long),
        "pos": torch.tensor(xyz, dtype=torch.float),
    }
def _parse_fingerprints(fp_field, fp_len: int = 2048):
    """
    Parse a fingerprint field (a list, or a dict carrying 'morgan_r3_bits') into:
      - input_ids: LongTensor [fp_len] of 0/1 bits
      - attention_mask: BoolTensor [fp_len], all True
    Missing or unrecognized input yields an all-zero bit vector.
    """
    parsed = _safe_json_load(fp_field)
    if isinstance(parsed, dict):
        raw_bits = parsed.get("morgan_r3_bits", None)
    elif isinstance(parsed, list):
        raw_bits = parsed
    else:
        raw_bits = None
    if raw_bits is None:
        bits = [0] * fp_len
    else:
        bits = []
        for b in raw_bits[:fp_len]:
            if isinstance(b, str):
                stripped = b.strip().strip('"').strip("'")
                bits.append(1 if stripped in ("1", "True", "true") else 0)
            elif isinstance(b, (int, np.integer, float, np.floating)):
                bits.append(1 if int(b) != 0 else 0)
            else:
                bits.append(0)
        # Right-pad with zeros up to the fixed fingerprint length.
        bits.extend([0] * (fp_len - len(bits)))
    return {
        "input_ids": torch.tensor(bits, dtype=torch.long),
        "attention_mask": torch.ones(fp_len, dtype=torch.bool),
    }
# =============================================================================
# PSELFIES utilities (polymer-safe SELFIES encoding with endpoint markers)
# =============================================================================
# Fallback tokenizer: bracketed SELFIES tokens like [C], [=O], [At].
_SELFIES_TOKEN_RE = re.compile(r"\[[^\[\]]+\]")
def _split_selfies_tokens(selfies_str: str) -> List[str]:
    """Tokenize a SELFIES string; prefers selfies.split_selfies when available."""
    if not isinstance(selfies_str, str) or not selfies_str:
        return []
    if SELFIES_AVAILABLE:
        try:
            pieces = list(sf.split_selfies(selfies_str.replace(" ", "")))
            return [p for p in pieces if isinstance(p, str) and p]
        except Exception:
            pass
    return _SELFIES_TOKEN_RE.findall(selfies_str)
def _selfies_for_tokenizer(selfies_str: str) -> str:
"""Normalize SELFIES formatting so the HF tokenizer sees token boundaries."""
s = str(selfies_str).strip()
if not s:
return ""
s = s.replace(" ", "")
s = s.replace("][", "] [")
return s
def _selfies_compact(selfies_str: str) -> str:
"""Remove spaces and trim."""
return str(selfies_str).replace(" ", "").strip()
def _ensure_two_at_endpoints(selfies_str: str) -> str:
    """
    Ensure polymer endpoint markers exist: enforce exactly two [At] tokens.

    NOTE(review): this normalizes only the *count* of [At] tokens — it appends
    to the ends when fewer than two exist and drops interior occurrences when
    there are more than two — it does not relocate existing markers to the
    string ends. Confirm callers only rely on the two-marker invariant.
    """
    s = _selfies_compact(selfies_str)
    toks = _split_selfies_tokens(s)
    if not toks:
        # Untokenizable input is returned compacted but otherwise unchanged.
        return s
    at = "[At]"
    at_pos = [i for i, t in enumerate(toks) if t == at]
    if len(at_pos) == 0:
        # No markers: wrap the whole token sequence.
        toks = [at] + toks + [at]
    elif len(at_pos) == 1:
        # One marker: append the missing partner at the tail.
        toks = toks + [at]
    elif len(at_pos) > 2:
        # Too many markers: keep only the first and last occurrences.
        first = at_pos[0]
        last = at_pos[-1]
        new = []
        for i, t in enumerate(toks):
            if t == at and i not in (first, last):
                continue
            new.append(t)
        toks = new
    return "".join(toks)
def psmiles_to_at_smiles(psmiles: str, root_at: bool = True) -> Optional[str]:
    """
    Convert polymer PSMILES (two [*]) into RDKit SMILES where [*] is represented as element At (Z=85).
    This allows SELFIES encoding/decoding while preserving polymer endpoints.

    Args:
        psmiles: polymer SMILES with wildcard ([*]/ *) endpoint atoms.
        root_at: root the canonical SMILES at the first converted endpoint atom.

    Returns:
        Canonical At-marked SMILES, or None when RDKit is unavailable or any
        parse/sanitize step fails.
    """
    if not RDKit_AVAILABLE:
        return None
    try:
        mol = Chem.MolFromSmiles(psmiles)
        if mol is None:
            return None
        mol = Chem.RWMol(mol)
        at_indices = []
        for atom in mol.GetAtoms():
            # Wildcard atoms (Z=0) become At (Z=85) so SELFIES can encode them.
            if atom.GetAtomicNum() == 0:
                atom.SetAtomicNum(85)
                # Scrub implicit-H / charge state so the substitution stays inert;
                # each setter is individually best-effort.
                try:
                    atom.SetNoImplicit(True)
                except Exception:
                    pass
                try:
                    atom.SetNumExplicitHs(0)
                except Exception:
                    pass
                try:
                    atom.SetFormalCharge(0)
                except Exception:
                    pass
                at_indices.append(int(atom.GetIdx()))
        mol = mol.GetMol()
        try:
            Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return None
        if root_at and len(at_indices) > 0:
            try:
                # Rooting at the endpoint yields a stable endpoint-first canonical form.
                can = Chem.MolToSmiles(mol, canonical=True, rootedAtAtom=at_indices[0])
            except Exception:
                can = Chem.MolToSmiles(mol, canonical=True)
        else:
            can = Chem.MolToSmiles(mol, canonical=True)
        return can
    except Exception:
        return None
def at_smiles_to_psmiles(at_smiles: str) -> Optional[str]:
    """
    Inverse of psmiles_to_at_smiles: convert At (Z=85) back to polymer [*] endpoints.

    Returns canonical PSMILES with bare '*' endpoints, or None when RDKit is
    unavailable or any parse/sanitize step fails.
    """
    if not RDKit_AVAILABLE:
        return None
    try:
        mol = Chem.MolFromSmiles(at_smiles)
        if mol is None:
            return None
        rw = Chem.RWMol(mol)
        for atom in rw.GetAtoms():
            # Restore the wildcard endpoint (Z=0) and scrub At-specific state.
            if atom.GetAtomicNum() == 85:
                atom.SetAtomicNum(0)
                try:
                    atom.SetNoImplicit(True)
                except Exception:
                    pass
                try:
                    atom.SetNumExplicitHs(0)
                except Exception:
                    pass
                try:
                    atom.SetFormalCharge(0)
                except Exception:
                    pass
        mol2 = rw.GetMol()
        try:
            Chem.SanitizeMol(mol2, catchErrors=True)
        except Exception:
            return None
        can = Chem.MolToSmiles(mol2, canonical=True)
        # Normalize bracketed wildcards to the bare '*' PSMILES convention.
        can = can.replace("[*]", "*")
        return can
    except Exception:
        return None
def smiles_to_pselfies(smiles: str) -> Optional[str]:
    """Canonicalize a SMILES with RDKit, then encode it as SELFIES (None on failure)."""
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        canonical = Chem.MolToSmiles(mol, canonical=True)
        encoded = sf.encoder(canonical)
        if isinstance(encoded, str) and encoded:
            return encoded
        return None
    except Exception:
        return None
def psmiles_to_pselfies(psmiles: str) -> Optional[str]:
    """Convert polymer PSMILES -> At-marked SMILES -> PSELFIES with [At] endpoint markers."""
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        return None
    at_form = psmiles_to_at_smiles(psmiles, root_at=True)
    if at_form is None:
        return None
    encoded = smiles_to_pselfies(at_form)
    return None if encoded is None else _ensure_two_at_endpoints(encoded)
def selfies_to_smiles(selfies_str: str) -> Optional[str]:
    """Decode SELFIES to SMILES and canonicalize with RDKit (None on any failure)."""
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        return None
    try:
        decoded = sf.decoder(_selfies_compact(selfies_str))
        if not isinstance(decoded, str) or not decoded:
            return None
        mol = Chem.MolFromSmiles(decoded)
        if mol is None:
            return None
        try:
            Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return None
def pselfies_to_psmiles(selfies_str: str) -> Optional[str]:
    """Decode PSELFIES back to polymer PSMILES via the At-SMILES intermediate."""
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        return None
    intermediate = selfies_to_smiles(selfies_str)
    return None if intermediate is None else at_smiles_to_psmiles(intermediate)
def canonicalize_psmiles(psmiles: str) -> Optional[str]:
    """
    RDKit-canonicalize a PSMILES string (best effort).

    Without RDKit the trimmed input is returned unchanged; with RDKit any
    parse/sanitize failure yields None.
    """
    text = str(psmiles).strip()
    if not text:
        return None
    if not RDKit_AVAILABLE:
        return text
    try:
        mol = Chem.MolFromSmiles(text)
        if mol is None:
            return None
        try:
            Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return None
        # Normalize bracketed wildcards to the bare '*' convention.
        return Chem.MolToSmiles(mol, canonical=True).replace("[*]", "*")
    except Exception:
        return None
def chem_validity_psmiles(psmiles: str) -> bool:
    """Return True when RDKit can parse and sanitize the given PSMILES string."""
    if not RDKit_AVAILABLE:
        return False
    try:
        text = str(psmiles).strip()
        if not text:
            return False
        mol = Chem.MolFromSmiles(text)
        if mol is None:
            return False
        try:
            Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return False
        return True
    except Exception:
        return False
def polymer_validity_psmiles_strict(psmiles: str) -> bool:
    """
    Strict polymer validity check:
      - string parses and sanitizes under RDKit
      - exactly two [*] endpoint atoms
      - each endpoint has degree 1 (a single attachment)
    """
    if not RDKit_AVAILABLE:
        return False
    try:
        text = str(psmiles).strip()
        if not text:
            return False
        mol = Chem.MolFromSmiles(text)
        if mol is None:
            return False
        try:
            Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return False
        endpoints = [a for a in mol.GetAtoms() if a.GetAtomicNum() == 0]
        if len(endpoints) != 2:
            return False
        return all(a.GetTotalDegree() == 1 for a in endpoints)
    except Exception:
        return False
# =============================================================================
# CL encoder (multimodal) + fusion pooling
# =============================================================================
def resolve_cl_checkpoint_path(cl_weights_dir: str) -> Optional[str]:
    """
    Resolve a checkpoint file inside a directory (a file path is passed through).

    Looks for conventional filenames first, then falls back to the first
    *.bin / *.pt file (sorted for determinism). Returns None when nothing
    usable is found.
    """
    if cl_weights_dir is None:
        return None
    if os.path.isfile(cl_weights_dir):
        return cl_weights_dir
    if not os.path.isdir(cl_weights_dir):
        return None
    # Conventional filenames, in priority order.
    for fname in ("pytorch_model.bin", "model.pt", "best.pt", "state_dict.pt"):
        candidate = os.path.join(cl_weights_dir, fname)
        if os.path.isfile(candidate):
            return candidate
    # Fallback: any *.bin, then any *.pt.
    for pattern in ("*.bin", "*.pt"):
        matches = sorted(Path(cl_weights_dir).glob(pattern))
        if matches:
            return str(matches[0])
    return None
def load_state_dict_any(ckpt_path: str) -> Dict[str, torch.Tensor]:
    """
    Load a checkpoint whose state dict may be nested under a wrapper key.

    Accepts either a bare state dict or a dict wrapping it under 'state_dict'
    or 'model_state_dict'. Raises RuntimeError for non-dict payloads.
    """
    payload = torch.load(ckpt_path, map_location="cpu")
    if not isinstance(payload, dict):
        raise RuntimeError(f"Checkpoint at {ckpt_path} did not contain a state_dict-like dict.")
    for wrapper_key in ("state_dict", "model_state_dict"):
        nested = payload.get(wrapper_key)
        if isinstance(nested, dict):
            return nested
    return payload
def safe_load_into_module(module: nn.Module, sd: Dict[str, torch.Tensor], strict: bool = False) -> Tuple[int, int]:
    """Load a (possibly partial) state dict; return (#missing, #unexpected) key counts."""
    result = module.load_state_dict(sd, strict=strict)
    n_missing = len(getattr(result, "missing_keys", []))
    n_unexpected = len(getattr(result, "unexpected_keys", []))
    return n_missing, n_unexpected
class PolyFusionModule(nn.Module):
    """
    Tiny fusion transformer:
      - self-attention over modality tokens
      - learned query pooling (attention weights -> pooled representation)
    """
    def __init__(self, d_model: int, nhead: int = 8, ffn_mult: int = 4, dropout: float = 0.1):
        super().__init__()
        # Pre-norm self-attention sublayer.
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        # Pre-norm feed-forward sublayer.
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ffn_mult * d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ffn_mult * d_model, d_model),
            nn.Dropout(dropout),
        )
        # Learned query vector used to score tokens for pooling.
        self.pool_ln = nn.LayerNorm(d_model)
        self.pool_q = nn.Parameter(torch.randn(d_model))

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, T, d] modality token embeddings.
            mask: [B, T] bool, True marks valid tokens.

        Returns:
            [B, d] pooled representation.
        """
        # mask: True for valid tokens; MultiheadAttention uses key_padding_mask where True means "ignore"
        key_padding = ~mask
        h = self.ln1(x)
        attn_out, _ = self.attn(h, h, h, key_padding_mask=key_padding)
        x = x + attn_out
        x = x + self.ffn(self.ln2(x))
        # query pooling
        x = self.pool_ln(x)
        q = self.pool_q.unsqueeze(0).unsqueeze(-1)  # [1, d, 1]
        scores = torch.matmul(x, q).squeeze(-1)  # [B, T]
        # Large negative fill so masked tokens receive ~zero softmax weight.
        scores = scores.masked_fill(~mask, -1e9)
        w = torch.softmax(scores, dim=-1).unsqueeze(-1)
        pooled = (x * w).sum(dim=1)  # [B, d]
        return pooled
class MultiModalCLPolymerEncoder(nn.Module):
"""
Frozen multimodal encoder used as the conditioning interface:
- encodes any subset of modalities (graph/geometry/fingerprint/psmiles)
- projects each modality into a shared CL embedding space
- fuses available modality tokens into a single normalized vector
"""
    def __init__(
        self,
        psmiles_tokenizer,
        emb_dim: int = CL_EMB_DIM,
        cl_weights_dir: Optional[str] = CFG.PRETRAINED_MULTIMODAL_DIR,
        use_gine: bool = True,
        use_schnet: bool = True,
        use_fp: bool = True,
        use_psmiles: bool = True,
    ):
        """
        Build the (optionally partial) multimodal encoder.

        Args:
            psmiles_tokenizer: HF-style tokenizer for PSMILES strings.
            emb_dim: shared CL embedding dimension.
            cl_weights_dir: directory (or file) with a multimodal CL checkpoint; optional.
            use_gine / use_schnet / use_fp / use_psmiles: toggle individual modality encoders.
        """
        super().__init__()
        self.psm_tok = psmiles_tokenizer
        self.emb_dim = int(emb_dim)
        # Modality encoders default to None; construction failure disables a modality
        # with a warning rather than crashing.
        self.gine = None
        self.schnet = None
        self.fp = None
        self.psmiles = None
        if use_gine:
            try:
                self.gine = GineEncoder(NODE_EMB_DIM, EDGE_EMB_DIM, NUM_GNN_LAYERS, MAX_ATOMIC_Z)
            except Exception as e:
                print(f"[CL][WARN] Disabling GINE encoder: {e}")
                self.gine = None
        if use_schnet:
            try:
                self.schnet = NodeSchNetWrapper(
                    SCHNET_HIDDEN, SCHNET_NUM_INTERACTIONS, SCHNET_NUM_GAUSSIANS, SCHNET_CUTOFF, SCHNET_MAX_NEIGHBORS
                )
            except Exception as e:
                print(f"[CL][WARN] Disabling SchNet encoder: {e}")
                self.schnet = None
        if use_fp:
            try:
                self.fp = FingerprintEncoder(VOCAB_SIZE_FP, 256, FP_LENGTH, 4, 8, 1024, 0.1)
            except Exception as e:
                print(f"[CL][WARN] Disabling fingerprint encoder: {e}")
                self.fp = None
        if use_psmiles:
            # Prefer the local fine-tuned DeBERTa checkpoint when its directory exists.
            enc_src = CFG.BEST_PSMILES_DIR if (CFG.BEST_PSMILES_DIR and os.path.isdir(CFG.BEST_PSMILES_DIR)) else None
            self.psmiles = PSMILESDebertaEncoder(
                model_dir_or_name=enc_src,
                vocab_fallback=int(getattr(psmiles_tokenizer, "vocab_size", 300)),
            )
        # Projection layers into shared CL space
        # (one per enabled modality; disabled modalities get no projection).
        self.proj_gine = nn.Linear(NODE_EMB_DIM, self.emb_dim) if self.gine is not None else None
        self.proj_schnet = nn.Linear(SCHNET_HIDDEN, self.emb_dim) if self.schnet is not None else None
        self.proj_fp = nn.Linear(256, self.emb_dim) if self.fp is not None else None
        self.proj_psmiles = nn.Linear(DEBERTA_HIDDEN, self.emb_dim) if self.psmiles is not None else None
        self.dropout = nn.Dropout(0.1)
        self.out_dim = self.emb_dim
        self.fusion = PolyFusionModule(d_model=self.emb_dim, nhead=8, ffn_mult=4, dropout=0.1)
        # Optionally load a trained multimodal CL checkpoint
        self._load_multimodal_cl_checkpoint(cl_weights_dir)
    def _load_multimodal_cl_checkpoint(self, cl_weights_dir: Optional[str]):
        """Best-effort load of a multimodal CL checkpoint; only shape-compatible keys are applied."""
        ckpt_path = resolve_cl_checkpoint_path(cl_weights_dir) if cl_weights_dir else None
        if ckpt_path is None:
            print(f"[CL][INFO] No multimodal CL checkpoint found at '{cl_weights_dir}'. Using initialized weights.")
            return
        sd = load_state_dict_any(ckpt_path)
        model_sd = self.state_dict()
        # Load only compatible keys (shape match) to be robust across versions
        filtered = {}
        for k, v in sd.items():
            if k not in model_sd:
                continue
            if hasattr(v, "shape") and hasattr(model_sd[k], "shape") and tuple(v.shape) != tuple(model_sd[k].shape):
                continue
            filtered[k] = v
        missing, unexpected = safe_load_into_module(self, filtered, strict=False)
        print(
            f"[CL][INFO] Loaded multimodal CL checkpoint '{ckpt_path}'. "
            f"loaded_keys={len(filtered)} missing={missing} unexpected={unexpected}"
        )
def freeze_cl_encoders(self):
"""Freeze encoders and fusion module (decoder training should not update them)."""
for name, enc in [("gine", self.gine), ("schnet", self.schnet), ("fp", self.fp), ("psmiles", self.psmiles)]:
if enc is None:
continue
enc.eval()
for p in enc.parameters():
p.requires_grad = False
print(f"[CL][INFO] Froze {name} encoder parameters.")
self.fusion.eval()
for p in self.fusion.parameters():
p.requires_grad = False
print("[CL][INFO] Froze fusion module parameters.")
    def forward_multimodal(self, batch_mods: dict) -> torch.Tensor:
        """
        Encode a batch containing any subset of modalities and return normalized CL embeddings.

        Args:
            batch_mods: dict with optional keys 'gine', 'schnet', 'fp', 'psmiles',
                each mapping to a dict of tensors for that modality.

        Returns:
            [B, emb_dim] L2-normalized embeddings (zeros when no modality is present).
        """
        device = next(self.parameters()).device
        # Infer batch size from whichever modality is present
        if batch_mods.get("fp", None) is not None and isinstance(batch_mods["fp"].get("input_ids", None), torch.Tensor):
            B = int(batch_mods["fp"]["input_ids"].size(0))
        elif batch_mods.get("psmiles", None) is not None and isinstance(batch_mods["psmiles"].get("input_ids", None), torch.Tensor):
            B = int(batch_mods["psmiles"]["input_ids"].size(0))
        else:
            # Graph modalities: batch size is the highest per-node batch index + 1.
            if batch_mods.get("gine", None) is not None and isinstance(batch_mods["gine"].get("batch", None), torch.Tensor):
                B = int(batch_mods["gine"]["batch"].max().item() + 1) if batch_mods["gine"]["batch"].numel() > 0 else 1
            elif batch_mods.get("schnet", None) is not None and isinstance(batch_mods["schnet"].get("batch", None), torch.Tensor):
                B = int(batch_mods["schnet"]["batch"].max().item() + 1) if batch_mods["schnet"]["batch"].numel() > 0 else 1
            else:
                B = 1
        tokens: List[torch.Tensor] = []
        def _append_token(z_token: torch.Tensor):
            tokens.append(z_token)
        # GINE token
        if self.gine is not None and batch_mods.get("gine", None) is not None:
            g = batch_mods["gine"]
            if isinstance(g.get("z", None), torch.Tensor) and g["z"].numel() > 0:
                # Missing per-node features fall back to zeros of matching length.
                emb_g = self.gine(
                    g["z"].to(device),
                    g.get("chirality", torch.zeros_like(g["z"], dtype=torch.float)).to(device) if isinstance(g.get("chirality", None), torch.Tensor) else None,
                    g.get("formal_charge", torch.zeros_like(g["z"], dtype=torch.float)).to(device) if isinstance(g.get("formal_charge", None), torch.Tensor) else None,
                    g.get("edge_index", torch.empty((2, 0), dtype=torch.long)).to(device),
                    g.get("edge_attr", torch.zeros((0, 3), dtype=torch.float)).to(device),
                    g.get("batch", None).to(device) if isinstance(g.get("batch", None), torch.Tensor) else None,
                )
                zg = self.proj_gine(emb_g)
                zg = self.dropout(zg)
                _append_token(zg)
        # SchNet token
        if self.schnet is not None and batch_mods.get("schnet", None) is not None:
            s = batch_mods["schnet"]
            if isinstance(s.get("z", None), torch.Tensor) and s["z"].numel() > 0:
                emb_s = self.schnet(
                    s["z"].to(device),
                    s["pos"].to(device),
                    s.get("batch", None).to(device) if isinstance(s.get("batch", None), torch.Tensor) else None,
                )
                zs = self.proj_schnet(emb_s)
                zs = self.dropout(zs)
                _append_token(zs)
        # Fingerprint token
        if self.fp is not None and batch_mods.get("fp", None) is not None:
            f = batch_mods["fp"]
            if isinstance(f.get("input_ids", None), torch.Tensor) and f["input_ids"].numel() > 0:
                emb_f = self.fp(
                    f["input_ids"].to(device),
                    f.get("attention_mask", None).to(device) if isinstance(f.get("attention_mask", None), torch.Tensor) else None,
                )
                zf = self.proj_fp(emb_f)
                zf = self.dropout(zf)
                _append_token(zf)
        # PSMILES token
        if self.psmiles is not None and batch_mods.get("psmiles", None) is not None:
            p = batch_mods["psmiles"]
            if isinstance(p.get("input_ids", None), torch.Tensor) and p["input_ids"].numel() > 0:
                emb_p = self.psmiles(
                    p["input_ids"].to(device),
                    p.get("attention_mask", None).to(device) if isinstance(p.get("attention_mask", None), torch.Tensor) else None,
                )
                zp = self.proj_psmiles(emb_p)
                zp = self.dropout(zp)
                _append_token(zp)
        if not tokens:
            # No modalities present; return a safe zero vector
            z = torch.zeros((B, self.emb_dim), device=device)
            return F.normalize(z, dim=-1)
        # Fuse one token per available modality through the attention pooler.
        X = torch.stack(tokens, dim=1)  # [B, T, d]
        mask = torch.ones((B, X.size(1)), dtype=torch.bool, device=device)
        pooled = self.fusion(X, mask)
        pooled = F.normalize(pooled, dim=-1)
        return pooled
    @torch.no_grad()
    def encode_psmiles(
        self,
        psmiles_list: List[str],
        max_len: int = PSMILES_MAX_LEN,
        batch_size: int = 64,
        device: str = DEVICE,
    ) -> np.ndarray:
        """
        Encode PSMILES strings using the text encoder alone (no fusion).

        Returns an [N, emb_dim] array of L2-normalized embeddings.
        Raises RuntimeError when tokenizer/encoder/projection are unavailable.
        """
        self.eval()
        if self.psm_tok is None or self.psmiles is None or self.proj_psmiles is None:
            raise RuntimeError("PSMILES tokenizer/encoder/projection not available.")
        dev = torch.device(device)
        self.to(dev)
        outs = []
        for i in range(0, len(psmiles_list), batch_size):
            chunk = [str(x) for x in psmiles_list[i : i + batch_size]]
            enc = self.psm_tok(chunk, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
            input_ids = enc["input_ids"].to(dev)
            attn = enc["attention_mask"].to(dev).bool()
            emb_p = self.psmiles(input_ids, attn)
            z = self.proj_psmiles(emb_p)
            # Normalize so downstream cosine similarity / GPR operate on the unit sphere.
            z = F.normalize(z, dim=-1)
            outs.append(z.detach().cpu().numpy())
        return np.concatenate(outs, axis=0) if outs else np.zeros((0, self.emb_dim), dtype=np.float32)
@torch.no_grad()
def encode_multimodal(self, records: List[dict], batch_size: int = 32, device: str = DEVICE) -> np.ndarray:
    """Encode a list of records that may contain any subset of modalities.

    Each record is a dict that may hold "psmiles", "fingerprints", "graph"
    and "geometry" payloads; absent modalities are passed as None/empty to
    forward_multimodal, which skips them. Returns a float32 [N, emb_dim]
    array (embeddings are L2-normalized inside forward_multimodal).
    """
    self.eval()
    dev = torch.device(device)
    self.to(dev)
    outs = []
    for i in range(0, len(records), batch_size):
        chunk = records[i : i + batch_size]
        # PSMILES tokenization (only when a tokenizer is available).
        psmiles_texts = [str(r.get("psmiles", "")) for r in chunk]
        p_enc = None
        if self.psm_tok is not None:
            p_enc = self.psm_tok(
                psmiles_texts,
                truncation=True,
                padding="max_length",
                max_length=PSMILES_MAX_LEN,
                return_tensors="pt",
            )
        # Fingerprints: the parser yields per-record input_ids/attention_mask
        # tensors, so stacking below always succeeds.
        fp_ids, fp_attn = [], []
        for r in chunk:
            f = _parse_fingerprints(r.get("fingerprints", None), fp_len=FP_LENGTH)
            fp_ids.append(f["input_ids"])
            fp_attn.append(f["attention_mask"])
        fp_ids = torch.stack(fp_ids, dim=0)
        fp_attn = torch.stack(fp_attn, dim=0)
        # GINE batch assembly (concat nodes; keep per-graph batch indices)
        gine_all = {"z": [], "chirality": [], "formal_charge": [], "edge_index": [], "edge_attr": [], "batch": []}
        node_offset = 0
        for bi, r in enumerate(chunk):
            g = _parse_graph_for_gine(r.get("graph", None))
            if g is None or g["z"].numel() == 0:
                continue  # record contributes no graph nodes
            n = g["z"].size(0)
            gine_all["z"].append(g["z"])
            gine_all["chirality"].append(g["chirality"])
            gine_all["formal_charge"].append(g["formal_charge"])
            # Per-node graph index so pooling can separate graphs later.
            gine_all["batch"].append(torch.full((n,), bi, dtype=torch.long))
            ei = g["edge_index"]
            ea = g["edge_attr"]
            if ei is not None and ei.numel() > 0:
                # Shift edge endpoints by the running node offset so indices
                # stay valid in the concatenated node tensor.
                gine_all["edge_index"].append(ei + node_offset)
                gine_all["edge_attr"].append(ea)
            node_offset += n
        gine_batch = None
        if len(gine_all["z"]) > 0:
            z_b = torch.cat(gine_all["z"], dim=0)
            ch_b = torch.cat(gine_all["chirality"], dim=0)
            fc_b = torch.cat(gine_all["formal_charge"], dim=0)
            b_b = torch.cat(gine_all["batch"], dim=0)
            if len(gine_all["edge_index"]) > 0:
                ei_b = torch.cat(gine_all["edge_index"], dim=1)
                ea_b = torch.cat(gine_all["edge_attr"], dim=0)
            else:
                # No edges at all: empty tensors with the expected shapes
                # (edge_attr width 3 -- presumably the GINE edge-feature
                # dimension; TODO confirm against _parse_graph_for_gine).
                ei_b = torch.empty((2, 0), dtype=torch.long)
                ea_b = torch.zeros((0, 3), dtype=torch.float)
            gine_batch = {
                "z": z_b,
                "chirality": ch_b,
                "formal_charge": fc_b,
                "edge_index": ei_b,
                "edge_attr": ea_b,
                "batch": b_b,
            }
        # SchNet batch assembly (concat atoms; keep per-structure batch indices)
        sch_all_z, sch_all_pos, sch_all_batch = [], [], []
        for bi, r in enumerate(chunk):
            s = _parse_geometry_for_schnet(r.get("geometry", None))
            if s is None or s["z"].numel() == 0:
                continue
            n = s["z"].size(0)
            sch_all_z.append(s["z"])
            sch_all_pos.append(s["pos"])
            sch_all_batch.append(torch.full((n,), bi, dtype=torch.long))
        schnet_batch = None
        if len(sch_all_z) > 0:
            schnet_batch = {
                "z": torch.cat(sch_all_z, dim=0),
                "pos": torch.cat(sch_all_pos, dim=0),
                "batch": torch.cat(sch_all_batch, dim=0),
            }
        batch_mods = {
            "gine": gine_batch,
            "schnet": schnet_batch,
            "fp": {"input_ids": fp_ids, "attention_mask": fp_attn},
            "psmiles": {"input_ids": p_enc["input_ids"], "attention_mask": p_enc["attention_mask"]} if p_enc is not None else None,
        }
        z = self.forward_multimodal(batch_mods)
        outs.append(z.detach().cpu().numpy())
    return np.concatenate(outs, axis=0) if outs else np.zeros((0, self.emb_dim), dtype=np.float32)
# =============================================================================
# SELFIES-TED decoder conditioned on CL embeddings
# =============================================================================
# Hugging Face model id for the SELFIES-TED seq2seq backbone (env-overridable).
SELFIES_TED_MODEL_NAME = os.environ.get("SELFIES_TED_MODEL_NAME", "ibm-research/materials.selfies-ted")
# Optional HF auth token for gated or rate-limited hub downloads.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
def _hf_load_with_retries(load_fn, max_tries: int = 5, base_sleep: float = 2.0):
"""Retry wrapper for HF downloads (useful when the hub is flaky or rate-limited)."""
last_err = None
for t in range(max_tries):
try:
return load_fn()
except Exception as e:
last_err = e
sleep_s = base_sleep * (1.6 ** t) + random.random()
print(f"[HF][WARN] Load attempt {t+1}/{max_tries} failed: {e} | retrying in {sleep_s:.1f}s")
time.sleep(sleep_s)
raise RuntimeError(f"Failed to load model from HF after {max_tries} attempts. Last error: {last_err}")
def load_selfies_ted_and_tokenizer(model_name: str = SELFIES_TED_MODEL_NAME):
    """Fetch the SELFIES-TED tokenizer and seq2seq model from the HF hub (with retries)."""
    tok = _hf_load_with_retries(
        lambda: AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN, use_fast=True),
        max_tries=5,
    )
    model = _hf_load_with_retries(
        lambda: AutoModelForSeq2SeqLM.from_pretrained(model_name, token=HF_TOKEN),
        max_tries=5,
    )
    return tok, model
class CLConditionedSelfiesTEDGenerator(nn.Module):
    """
    Condition SELFIES-TED on CL embeddings by:
      - mapping CL vector -> d_model
      - expanding into a short "memory" sequence (mem_len)
      - passing this as encoder_outputs to the seq2seq model

    The seq2seq backbone's own encoder is bypassed entirely: the synthetic
    memory built from the CL latent plays the role of encoder hidden states
    for cross-attention.
    """
    def __init__(self, tok, seq2seq_model, cl_emb_dim: int = CL_EMB_DIM, mem_len: int = 4):
        super().__init__()
        self.tok = tok
        self.model = seq2seq_model
        self.mem_len = int(mem_len)
        # Hidden width of the seq2seq backbone; fall back to 1024 if the
        # config has no d_model attribute.
        d_model = int(getattr(self.model.config, "d_model", 1024))
        # Two-layer MLP projecting the CL latent into the decoder hidden space.
        self.cl_to_d = nn.Sequential(
            nn.Linear(cl_emb_dim, d_model),
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(d_model, d_model),
        )
        # Learned positional embeddings distinguishing the memory slots.
        self.mem_pos = nn.Embedding(self.mem_len, d_model)
    def build_encoder_outputs(self, z: torch.Tensor) -> Tuple[BaseModelOutput, torch.Tensor]:
        """Build synthetic encoder outputs (plus attention mask) from CL latents.

        The latent is projected to d_model, repeated mem_len times, and offset
        with learned positional embeddings so each memory slot differs.
        Returns (BaseModelOutput[B, mem_len, d_model], mask[B, mem_len]).
        """
        device = z.device
        B = z.size(0)
        d = self.cl_to_d(z)  # [B, d_model]
        d = d.unsqueeze(1).expand(B, self.mem_len, d.size(-1)).contiguous()  # [B, mem_len, d_model]
        pos = torch.arange(self.mem_len, device=device).unsqueeze(0).expand(B, -1)
        d = d + self.mem_pos(pos)
        # All memory slots are valid, so the mask is all-ones.
        attn = torch.ones((B, self.mem_len), dtype=torch.long, device=device)
        return BaseModelOutput(last_hidden_state=d), attn
    def forward_train(self, z: torch.Tensor, labels: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Teacher-forced training step (labels are decoder targets).

        Returns {"loss": differentiable CE loss, "ce": detached copy for logging}.
        """
        enc_out, attn = self.build_encoder_outputs(z)
        out = self.model(encoder_outputs=enc_out, attention_mask=attn, labels=labels)
        loss = out.loss
        return {"loss": loss, "ce": loss.detach()}
    @torch.no_grad()
    def generate(
        self,
        z: torch.Tensor,
        num_return_sequences: int = 1,
        max_len: int = GEN_MAX_LEN,
        top_p: float = GEN_TOP_P,
        temperature: float = GEN_TEMPERATURE,
        repetition_penalty: float = GEN_REPETITION_PENALTY,
    ) -> List[str]:
        """Stochastic (nucleus-sampled) decoding from a batch of CL latents.

        Returns decoded SELFIES strings, compacted and forced to carry the
        two "[*]" endpoint attachment tokens.
        """
        self.eval()
        # Ensure latents sit on the same device as the model parameters.
        z = z.to(next(self.parameters()).device)
        enc_out, attn = self.build_encoder_outputs(z)
        gen = self.model.generate(
            encoder_outputs=enc_out,
            attention_mask=attn,
            do_sample=True,
            top_p=float(top_p),
            temperature=float(temperature),
            repetition_penalty=float(repetition_penalty),
            num_return_sequences=int(num_return_sequences),
            max_length=int(max_len),
            min_length=int(GEN_MIN_LEN),
            pad_token_id=int(self.tok.pad_token_id) if self.tok.pad_token_id is not None else None,
            eos_token_id=int(self.tok.eos_token_id) if self.tok.eos_token_id is not None else None,
        )
        outs = self.tok.batch_decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Normalize whitespace and enforce polymer endpoint markers.
        outs = [_ensure_two_at_endpoints(_selfies_compact(o)) for o in outs]
        return outs
def create_optimizer_and_scheduler_decoder(model: CLConditionedSelfiesTEDGenerator):
    """Build an AdamW optimizer plus cosine-annealing LR schedule for decoder fine-tuning."""
    # Make every parameter trainable (conditioning MLP and seq2seq backbone alike).
    for param in model.parameters():
        param.requires_grad = True
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=COSINE_ETA_MIN)
    return optimizer, scheduler
# =============================================================================
# Datasets for latent-to-SELFIES training
# =============================================================================
class LatentToPSELFIESDataset(Dataset):
    """
    Each sample:
      - z: frozen CL embedding (optionally with Gaussian noise added for denoising)
      - labels: tokenized PSELFIES target sequence (pad tokens masked as -100)

    Args:
        records: dicts with "pselfies" (required) plus optional modalities.
        cl_encoder: frozen multimodal encoder used to produce latents.
        selfies_tok: tokenizer for the PSELFIES decoder targets.
        max_len: label sequence length (padded/truncated).
        latent_noise_std: std of Gaussian noise added to z at __getitem__ time.
        cache_embeddings: precompute all latents once up front.
        renormalize_after_noise: re-apply L2 normalization after noising.
        use_multimodal: encode via all modalities vs. PSMILES text only.
    """
    def __init__(
        self,
        records: List[dict],
        cl_encoder: MultiModalCLPolymerEncoder,
        selfies_tok,
        max_len: int = GEN_MAX_LEN,
        latent_noise_std: float = 0.0,
        cache_embeddings: bool = True,
        renormalize_after_noise: bool = True,
        use_multimodal: bool = True,
    ):
        self.records = records
        self.cl_encoder = cl_encoder
        self.tok = selfies_tok
        self.max_len = int(max_len)
        self.latent_noise_std = float(latent_noise_std)
        self.renorm = bool(renormalize_after_noise)
        self.use_multimodal = bool(use_multimodal)
        # Fallback pad id of 1 when the tokenizer exposes none -- presumably
        # matching the SELFIES-TED tokenizer's pad token; TODO confirm.
        self.pad_id = int(self.tok.pad_token_id) if getattr(self.tok, "pad_token_id", None) is not None else 1
        self._cache = None
        # Optionally precompute latents (saves a lot of time during decoder training)
        if cache_embeddings:
            if self.use_multimodal:
                emb = self.cl_encoder.encode_multimodal(self.records, batch_size=32, device=DEVICE)
            else:
                psm = [str(r.get("psmiles", "")) for r in self.records]
                emb = self.cl_encoder.encode_psmiles(psm, max_len=PSMILES_MAX_LEN, batch_size=64, device=DEVICE)
            self._cache = emb.astype(np.float32)
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        r = self.records[idx]
        tgt = str(r["pselfies"]).strip()
        tgt = _selfies_for_tokenizer(tgt)
        # Get latent z (cached or computed on the fly)
        if self._cache is not None:
            z = torch.tensor(self._cache[idx], dtype=torch.float32)
        else:
            if self.use_multimodal:
                z_np = self.cl_encoder.encode_multimodal([r], batch_size=1, device=DEVICE)
                z = torch.tensor(z_np[0], dtype=torch.float32)
            else:
                psm = str(r.get("psmiles", "")).strip()
                z_np = self.cl_encoder.encode_psmiles([psm], max_len=PSMILES_MAX_LEN, batch_size=1, device=DEVICE)
                z = torch.tensor(z_np[0], dtype=torch.float32)
        # Denoising noise: fresh per access, so each epoch sees a new sample.
        if self.latent_noise_std > 0:
            z = z + torch.randn_like(z) * self.latent_noise_std
            # Keep latents on the unit sphere, matching the encoder's output.
            if self.renorm:
                z = F.normalize(z, dim=-1)
        # Tokenize target SELFIES; mask padding to -100 for CE
        enc = self.tok(tgt, truncation=True, padding="max_length", max_length=self.max_len, return_tensors=None)
        labels = torch.tensor(enc["input_ids"], dtype=torch.long)
        labels = labels.masked_fill(labels == self.pad_id, -100)
        return {
            "z": z,
            "labels": labels,
            "psmiles": str(r.get("psmiles", "")).strip(),
            "pselfies_raw": _selfies_compact(r["pselfies"]),
        }
def latent_collate(batch: List[dict]) -> dict:
    """Stack per-sample latents and labels into batch tensors; keep strings as lists."""
    return {
        "z": torch.stack([item["z"] for item in batch], dim=0),
        "labels": torch.stack([item["labels"] for item in batch], dim=0),
        "psmiles": [item["psmiles"] for item in batch],
        "pselfies_raw": [item["pselfies_raw"] for item in batch],
    }
def move_latent_batch_to_device(batch: dict, device: str):
    """Move the tensor fields of a latent batch onto *device*, in place."""
    for key in ("z", "labels"):
        batch[key] = batch[key].to(device)
# =============================================================================
# Aux PSMILES property oracle (optional)
# =============================================================================
class PSMILESPropertyDataset(Dataset):
    """Text-regression dataset: one PSMILES string mapped to a single scalar target."""
    def __init__(self, samples: List[dict], psmiles_tokenizer, max_len: int = PSMILES_MAX_LEN):
        self.samples = samples
        self.tok = psmiles_tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        sample = self.samples[idx]
        text = str(sample.get("psmiles", "")).strip()
        # Prefer the pre-scaled target; fall back to the raw one, then 0.0.
        target = float(sample.get("target_scaled", sample.get("target", 0.0)))
        enc = self.tok(text, truncation=True, padding="max_length", max_length=self.max_len)
        return {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.bool),
            "y": torch.tensor([target], dtype=torch.float32),
        }
def psmiles_prop_collate_fn(batch: List[dict]):
    """Stack tokenized PSMILES samples into batch tensors keyed input_ids/attention_mask/y."""
    stacked = {key: torch.stack([item[key] for item in batch], dim=0) for key in ("input_ids", "attention_mask", "y")}
    return stacked
class TextPropertyOracle(nn.Module):
    """
    Lightweight regressor for verification:
      - Frozen PSMILES encoder (DeBERTa variant)
      - Trainable MLP head
    """
    def __init__(self, encoder_dir: Optional[str], vocab_size: Optional[int] = None, y_dim: int = 1):
        super().__init__()
        # Resolve the encoder source: explicit dir -> project best dir -> HF fallback.
        if encoder_dir is not None and os.path.isdir(encoder_dir):
            source = encoder_dir
        elif os.path.isdir(CFG.BEST_PSMILES_DIR):
            source = CFG.BEST_PSMILES_DIR
        else:
            source = "microsoft/deberta-v2-xlarge"
        self.encoder = PSMILESDebertaEncoder(
            model_dir_or_name=source,
            vocab_fallback=int(vocab_size) if vocab_size is not None else 300,
        )
        hidden = getattr(self.encoder, "out_dim", DEBERTA_HIDDEN)
        self.head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, y_dim),
        )
    def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        pooled = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        return self.head(pooled)
def move_prop_batch_to_device(batch: dict, device: str):
    """Move the tensor fields of a property batch onto *device*, in place."""
    for key in ("input_ids", "attention_mask", "y"):
        batch[key] = batch[key].to(device)
def train_prop_oracle_one_epoch(model: TextPropertyOracle, dl: DataLoader, opt, scaler_amp, device: str):
    """One training epoch of the aux oracle; returns sample-weighted mean smooth-L1 loss."""
    model.train()
    total = 0.0
    n = 0
    for batch in dl:
        move_prop_batch_to_device(batch, device)
        y = batch["y"]
        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=USE_AMP, dtype=AMP_DTYPE):
            y_hat = model(batch["input_ids"], batch["attention_mask"])
            loss = F.smooth_l1_loss(y_hat, y, beta=1.0)
        if USE_AMP:
            # Unscale before clipping so the threshold applies to true gradients.
            scaler_amp.scale(loss).backward()
            scaler_amp.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler_amp.step(opt)
            scaler_amp.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
        # Accumulate per-sample so the final mean is unbiased across
        # unevenly sized batches.
        bs = y.size(0)
        total += float(loss.item()) * bs
        n += bs
    return total / max(1, n)
@torch.no_grad()
def eval_prop_oracle(model: TextPropertyOracle, dl: DataLoader, device: str):
    """Sample-weighted mean smooth-L1 validation loss of the aux oracle over *dl*."""
    model.eval()
    loss_sum = 0.0
    count = 0
    for batch in dl:
        move_prop_batch_to_device(batch, device)
        targets = batch["y"]
        with torch.cuda.amp.autocast(enabled=USE_AMP, dtype=AMP_DTYPE):
            preds = model(batch["input_ids"], batch["attention_mask"])
            batch_loss = F.smooth_l1_loss(preds, targets, beta=1.0)
        batch_size = targets.size(0)
        loss_sum += float(batch_loss.item()) * batch_size
        count += batch_size
    return loss_sum / max(1, count)
def train_property_oracle_per_fold(
    train_samples: List[dict],
    val_samples: List[dict],
    psmiles_tokenizer,
    device: str,
    max_len: int = PSMILES_MAX_LEN,
) -> Optional[TextPropertyOracle]:
    """Train a per-fold auxiliary oracle for scaled property prediction (verification only).

    Returns the model restored to its best-validation weights, with
    ``aux_val_loss`` attached for logging, or None when no tokenizer is
    supplied or initialization fails.
    """
    if psmiles_tokenizer is None:
        return None
    try:
        model = TextPropertyOracle(
            encoder_dir=CFG.BEST_PSMILES_DIR if os.path.isdir(CFG.BEST_PSMILES_DIR) else None,
            vocab_size=getattr(psmiles_tokenizer, "vocab_size", None),
            y_dim=1,
        ).to(device)
    except Exception as e:
        print(f"[ORACLE][WARN] Could not initialize auxiliary property predictor: {e}")
        return None
    # Freeze encoder; train only head (fast + stable)
    for p in model.encoder.parameters():
        p.requires_grad = False
    for p in model.head.parameters():
        p.requires_grad = True
    ds_tr = PSMILESPropertyDataset(train_samples, psmiles_tokenizer, max_len=max_len)
    ds_va = PSMILESPropertyDataset(val_samples, psmiles_tokenizer, max_len=max_len)
    dl_tr = DataLoader(ds_tr, batch_size=PROP_PRED_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=psmiles_prop_collate_fn)
    dl_va = DataLoader(ds_va, batch_size=PROP_PRED_BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=psmiles_prop_collate_fn)
    # Optimize only the unfrozen (head) parameters.
    opt = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=PROP_PRED_LR, weight_decay=PROP_PRED_WEIGHT_DECAY)
    scaler_amp = torch.cuda.amp.GradScaler(enabled=USE_AMP)
    best_val = float("inf")
    best_state = None
    no_imp = 0
    for epoch in range(1, PROP_PRED_EPOCHS + 1):
        tr = train_prop_oracle_one_epoch(model, dl_tr, opt, scaler_amp, device)
        va = eval_prop_oracle(model, dl_va, device)
        # Early stopping on validation loss (epsilon guards float noise).
        if va < best_val - 1e-8:
            best_val = va
            no_imp = 0
            # Snapshot best weights on CPU so they can be restored afterwards.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            no_imp += 1
            if no_imp >= PROP_PRED_PATIENCE:
                break
    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_state.items()}, strict=False)
    try:
        # Expose the best validation loss to callers for logging.
        model.aux_val_loss = float(best_val)
    except Exception:
        pass
    return model
@torch.no_grad()
def oracle_predict_scaled(
    oracle: Optional[TextPropertyOracle],
    psmiles_tokenizer,
    psmiles_list: List[str],
    device: str,
    max_len: int = PSMILES_MAX_LEN,
) -> Optional[np.ndarray]:
    """Predict scaled properties for *psmiles_list* with the aux oracle.

    Returns None when the oracle or tokenizer is unavailable, an empty
    float32 array for empty input, otherwise a flat [N] prediction array.
    """
    if oracle is None or psmiles_tokenizer is None:
        return None
    if not psmiles_list:
        return np.array([], dtype=np.float32)
    oracle.eval()
    preds = []
    chunk_size = 32
    for start in range(0, len(psmiles_list), chunk_size):
        texts = psmiles_list[start : start + chunk_size]
        enc = psmiles_tokenizer(texts, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
        ids = enc["input_ids"].to(device)
        mask = enc["attention_mask"].to(device).bool()
        with torch.cuda.amp.autocast(enabled=USE_AMP, dtype=AMP_DTYPE):
            out = oracle(ids, mask)
        preds.append(out.detach().cpu().numpy().reshape(-1))
    return np.concatenate(preds, axis=0) if preds else np.array([], dtype=np.float32)
# =============================================================================
# Latent property model (per property)
# =============================================================================
@dataclass
class LatentPropertyModel:
    """Fitted components mapping a PSMILES latent to a (scaled) property value."""
    # Scaler fitted on the raw property targets (train split only).
    y_scaler: StandardScaler
    # Optional dimensionality reduction applied to latents before the GPR.
    pca: Optional[PCA]
    # Gaussian process regressor trained on (possibly PCA-reduced) latents.
    gpr: GaussianProcessRegressor
def fit_latent_property_model(z_train: np.ndarray, y_train: np.ndarray, y_scaler: StandardScaler) -> LatentPropertyModel:
    """
    Fit a GPR mapping (PSMILES latent) -> (scaled property).
    Uses optional PCA for stability when latent dim is large.

    Args:
        z_train: [N, D] latent matrix.
        y_train: raw (unscaled) property targets; scaled here via *y_scaler*.
        y_scaler: StandardScaler already fitted on the training targets.
    """
    y_train = np.array(y_train, dtype=np.float32).reshape(-1, 1)
    # The GPR is trained on scaled targets; predictions must be inverse-transformed.
    y_s = y_scaler.transform(y_train).reshape(-1).astype(np.float32)
    z_use = z_train.astype(np.float32)
    pca = None
    if USE_PCA_BEFORE_GPR:
        ncomp = int(min(PCA_DIM, z_use.shape[0] - 1, z_use.shape[1]))
        # NOTE(review): this clamp can push ncomp above n_samples-1 when fewer
        # than 3 training rows are supplied, which would make PCA raise -- confirm
        # callers always provide >= 3 samples.
        ncomp = max(2, ncomp)
        pca = PCA(n_components=ncomp, random_state=0)
        z_use = pca.fit_transform(z_use)
    # Constant * RBF + white noise: standard smooth-regression kernel with
    # bounded hyperparameters for stable optimization.
    kernel = (
        C(1.0, (1e-3, 1e3))
        * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
        + WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-6, 1e-1))
    )
    gpr = GaussianProcessRegressor(kernel=kernel, alpha=GPR_ALPHA, normalize_y=True, random_state=0, n_restarts_optimizer=2)
    gpr.fit(z_use, y_s)
    return LatentPropertyModel(y_scaler=y_scaler, pca=pca, gpr=gpr)
def predict_latent_property(model: LatentPropertyModel, z: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return (scaled, unscaled) property predictions for candidate latents *z*."""
    feats = z.astype(np.float32)
    if model.pca is not None:
        feats = model.pca.transform(feats)
    scaled = np.array(model.gpr.predict(feats, return_std=False), dtype=np.float32).reshape(-1)
    unscaled = model.y_scaler.inverse_transform(scaled.reshape(-1, 1)).reshape(-1)
    return scaled, unscaled
# =============================================================================
# Train / eval loops (decoder)
# =============================================================================
def train_one_epoch_decoder(model: CLConditionedSelfiesTEDGenerator, dl: DataLoader, optimizer, scaler_amp, device: str):
    """One epoch of teacher-forced decoder fine-tuning.

    Returns a dict of sample-weighted means {"loss": ..., "ce": ...}; the two
    track the same CE value, since forward_train reports CE as its loss.
    """
    model.train()
    total = 0.0
    n = 0
    ce_sum = 0.0
    for batch in dl:
        move_latent_batch_to_device(batch, device)
        z = batch["z"]
        labels = batch["labels"]
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=USE_AMP, dtype=AMP_DTYPE):
            out = model.forward_train(z, labels)
            loss = out["loss"]
        if USE_AMP:
            # Unscale before clipping so the 1.0 threshold applies to true gradients.
            scaler_amp.scale(loss).backward()
            scaler_amp.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler_amp.step(optimizer)
            scaler_amp.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        # Per-sample accumulation keeps the mean unbiased across batch sizes.
        bs = z.size(0)
        total += float(loss.item()) * bs
        ce_sum += float(out["ce"].item()) * bs
        n += bs
    return {"loss": total / max(1, n), "ce": ce_sum / max(1, n)}
@torch.no_grad()
def evaluate_decoder(model: CLConditionedSelfiesTEDGenerator, dl: DataLoader, device: str):
    """Teacher-forced validation metrics (sample-weighted mean loss / CE) for early stopping."""
    model.eval()
    loss_sum = 0.0
    ce_sum = 0.0
    count = 0
    for batch in dl:
        move_latent_batch_to_device(batch, device)
        latents, labels = batch["z"], batch["labels"]
        with torch.cuda.amp.autocast(enabled=USE_AMP, dtype=AMP_DTYPE):
            out = model.forward_train(latents, labels)
        batch_size = latents.size(0)
        loss_sum += float(out["loss"].item()) * batch_size
        ce_sum += float(out["ce"].item()) * batch_size
        count += batch_size
    return {"loss": loss_sum / max(1, count), "ce": ce_sum / max(1, count)}
# =============================================================================
# Generation / filtering (per target value, per property)
# =============================================================================
def compute_diversity_morgan(smiles_list: List[str], radius: int = 2, nbits: int = 2048, p: float = 1.0) -> Optional[float]:
    """
    Diversity = 1 - mean(Tanimoto), computed on Morgan fingerprints of unique valid SMILES.
    Returns None if RDKit unavailable or insufficient valid molecules.

    The mean is a power mean with exponent *p* (p=1 is the arithmetic mean);
    non-finite or non-positive p falls back to 1.0.
    """
    if not RDKit_AVAILABLE:
        return None
    # Sanitize the power-mean exponent.
    try:
        p = float(p)
        if not np.isfinite(p) or p <= 0:
            p = 1.0
    except Exception:
        p = 1.0
    # Deduplicate while preserving first-seen order.
    uniq = []
    seen = set()
    for smi in smiles_list:
        smi = str(smi).strip()
        if not smi or smi in seen:
            continue
        seen.add(smi)
        uniq.append(smi)
    fps = []
    for smi in uniq:
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue
            try:
                # NOTE(review): with catchErrors=True, SanitizeMol reports
                # problems via its return value rather than raising, so this
                # except branch likely never fires -- confirm intended behavior.
                Chem.SanitizeMol(mol, catchErrors=True)
            except Exception:
                continue
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
            fps.append(fp)
        except Exception:
            continue
    if len(fps) < 2:
        # One molecule: zero diversity. Zero molecules: undefined.
        return 0.0 if len(fps) == 1 else None
    # Accumulate pairwise similarities raised to p (power-mean numerator).
    sims_p = []
    for i in range(len(fps)):
        for j in range(i + 1, len(fps)):
            try:
                s = float(DataStructs.TanimotoSimilarity(fps[i], fps[j]))
                sims_p.append(s ** p)
            except Exception:
                continue
    if not sims_p:
        return None
    mean_sim_p = float(np.mean(sims_p))
    try:
        mean_sim = mean_sim_p ** (1.0 / p)
    except Exception:
        # Fallback: plain arithmetic mean of raw pairwise similarities.
        mean_sim = float(
            np.mean([float(DataStructs.TanimotoSimilarity(fps[i], fps[j])) for i in range(len(fps)) for j in range(i + 1, len(fps))])
        )
    return float(1.0 - mean_sim)
@torch.no_grad()
def decode_from_latents(generator: CLConditionedSelfiesTEDGenerator, z: torch.Tensor, n_samples: int = 1) -> List[str]:
    """Sample PSELFIES strings from CL latents using the module-level decoding settings."""
    sampling_cfg = dict(
        max_len=GEN_MAX_LEN,
        top_p=GEN_TOP_P,
        temperature=GEN_TEMPERATURE,
        repetition_penalty=GEN_REPETITION_PENALTY,
    )
    return generator.generate(z=z, num_return_sequences=int(n_samples), **sampling_cfg)
def generate_for_target(
    target_y_scaled: float,
    prop_model: LatentPropertyModel,
    cl_encoder: MultiModalCLPolymerEncoder,
    generator: CLConditionedSelfiesTEDGenerator,
    train_seed_pool: List[dict],
    train_targets_set: set,
    n_seeds: int = 8,
    n_noise: int = N_FOLD_NOISE_SAMPLING,
    noise_std: float = LATENT_NOISE_STD_GEN,
    prop_tol_scaled: float = PROP_TOL_SCALED,
    oracle: Optional[TextPropertyOracle] = None,
    psmiles_tokenizer=None,
) -> Dict[str, Any]:
    """
    Core generation routine for a single target property value (scaled):
      1) Pick seed polymers close to target (in scaled property space).
      2) Encode seeds (multimodal) -> latent vectors.
      3) Add Gaussian noise to latents (exploration), renormalize.
      4) Decode to PSELFIES -> convert to polymer PSMILES.
      5) Filter by polymer/chem validity and property closeness (via GPR on PSMILES latents).
      6) Compute novelty/uniqueness/diversity metrics; optionally score with aux oracle.

    Returns a dict with "generated" (kept PSMILES), kept GPR predictions
    (scaled/unscaled), optional aux-oracle predictions, and a "metrics" dict.
    """
    def _l2_normalize_np(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
        # Row-wise L2 normalization with a floor to avoid division by zero.
        n = np.linalg.norm(x, axis=-1, keepdims=True)
        return x / np.clip(n, eps, None)
    # Choose nearest seeds by property distance (scaled)
    ys = np.array([float(d["y_scaled"]) for d in train_seed_pool], dtype=np.float32)
    diffs = np.abs(ys - float(target_y_scaled))
    order = np.argsort(diffs)
    chosen = [train_seed_pool[i] for i in order[: max(1, int(n_seeds))]]
    # Encode chosen seeds using multimodal encoder
    z_seed = cl_encoder.encode_multimodal(chosen, batch_size=32, device=DEVICE)
    if z_seed.shape[0] == 0:
        return {"generated": [], "metrics": {}}
    # Sample noise around each seed latent (n_noise perturbations per seed),
    # re-projecting onto the unit sphere after each perturbation.
    z_list = []
    for i in range(z_seed.shape[0]):
        z0 = z_seed[i].astype(np.float32)
        for _ in range(int(n_noise)):
            z = z0 + np.random.randn(z0.shape[0]).astype(np.float32) * float(noise_std)
            z = _l2_normalize_np(z.reshape(1, -1)).reshape(-1)
            z_list.append(z)
    z_all = np.stack(z_list, axis=0).astype(np.float32)
    z_t = torch.tensor(z_all, dtype=torch.float32, device=DEVICE)
    # Decode to PSELFIES
    pselfies = decode_from_latents(generator, z_t, n_samples=1)
    # Convert to polymer PSMILES; record validity flags
    valid_psmiles = []
    valid_flags, poly_flags = [], []
    for s in pselfies:
        s = _ensure_two_at_endpoints(_selfies_compact(s))
        psm = pselfies_to_psmiles(s) if (RDKit_AVAILABLE and SELFIES_AVAILABLE) else None
        if psm is None:
            valid_flags.append(False)
            poly_flags.append(False)
            continue
        psm_can = canonicalize_psmiles(psm)
        ok = chem_validity_psmiles(psm_can) if psm_can else False
        poly_ok = polymer_validity_psmiles_strict(psm_can) if psm_can else False
        valid_flags.append(bool(ok))
        poly_flags.append(bool(poly_ok))
        # Keep only candidates passing both chemical and polymer validity.
        if ok and poly_ok and psm_can:
            valid_psmiles.append(psm_can)
    uniq_valid = sorted(set(valid_psmiles))
    # Novelty = fraction of unique valid candidates absent from the train set.
    novelty_valid = [1.0 if s not in train_targets_set else 0.0 for s in uniq_valid] if uniq_valid else []
    n_valid_poly = int(len(valid_psmiles))
    uniqueness_valid_unique = float(len(uniq_valid)) / float(max(1, n_valid_poly)) if n_valid_poly > 0 else 0.0
    # Property prediction via GPR on PSMILES latents (for filtering)
    if uniq_valid:
        z_cand = cl_encoder.encode_psmiles(uniq_valid, max_len=PSMILES_MAX_LEN, batch_size=64, device=DEVICE)
    else:
        z_cand = np.zeros((0, cl_encoder.out_dim), dtype=np.float32)
    yhat_s, yhat_u = (np.array([], dtype=np.float32), np.array([], dtype=np.float32))
    if z_cand.shape[0] > 0:
        yhat_s, yhat_u = predict_latent_property(prop_model, z_cand)
    # Keep candidates whose predicted (scaled) property is within tolerance.
    keep, keep_pred_scaled, keep_pred_unscaled = [], [], []
    for i, psm in enumerate(uniq_valid):
        if abs(float(yhat_s[i]) - float(target_y_scaled)) <= float(prop_tol_scaled):
            keep.append(psm)
            keep_pred_scaled.append(float(yhat_s[i]))
            keep_pred_unscaled.append(float(yhat_u[i]))
    novelty_keep = [1.0 if s not in train_targets_set else 0.0 for s in keep] if keep else []
    # Optional aux oracle prediction for additional sanity checking
    aux_pred_scaled = None
    if VERIFY_GENERATED_PROPERTIES and oracle is not None and psmiles_tokenizer is not None and keep:
        aux = oracle_predict_scaled(oracle, psmiles_tokenizer, keep, DEVICE, PSMILES_MAX_LEN)
        aux_pred_scaled = aux.tolist() if aux is not None else None
    # Diversity computed on At-SMILES (to avoid polymer "*" parsing issues)
    at_smiles = []
    if RDKit_AVAILABLE and keep:
        for psm in keep:
            at_smi = psmiles_to_at_smiles(psm, root_at=False)
            if at_smi is not None:
                at_smiles.append(at_smi)
    div = compute_diversity_morgan(at_smiles) if at_smiles else None
    metrics = {
        "n_total": int(len(pselfies)),
        "validity": float(np.mean(valid_flags)) if valid_flags else 0.0,
        "polymer_validity": float(np.mean(poly_flags)) if poly_flags else 0.0,
        "n_valid_unique": int(len(uniq_valid)),
        "novelty_valid_unique": float(np.mean(novelty_valid)) if novelty_valid else 0.0,
        "uniqueness_valid_unique": float(uniqueness_valid_unique),
        "n_kept_property_filtered": int(len(keep)),
        "novelty_kept": float(np.mean(novelty_keep)) if novelty_keep else 0.0,
        "diversity": float(div) if div is not None else 0.0,
    }
    return {
        "generated": keep,
        "pred_scaled_kept": keep_pred_scaled,
        "pred_unscaled_kept": keep_pred_unscaled,
        "aux_pred_scaled": aux_pred_scaled,
        "metrics": metrics,
    }
# =============================================================================
# Data assembly (per property)
# =============================================================================
def build_polymer_records(df: pd.DataFrame, prop_col: str) -> List[dict]:
    """
    Build records for a single property:
      - require chemically valid + strictly polymer-valid PSMILES
      - require finite property value
      - generate PSELFIES for decoder targets
      - preserve optional modalities for multimodal seed encoding
    """
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        raise RuntimeError("RDKit + selfies are required for this pipeline.")

    def _valid_canonical(raw: str):
        # Canonicalize and require both chemical and strict polymer validity.
        if not raw:
            return None
        canon = canonicalize_psmiles(raw)
        if not canon:
            return None
        if not chem_validity_psmiles(canon):
            return None
        if not polymer_validity_psmiles_strict(canon):
            return None
        return canon

    def _finite_float(value):
        # Coerce to float; reject None, non-numeric, NaN and infinities.
        if value is None:
            return None
        try:
            num = float(value)
        except Exception:
            return None
        return num if np.isfinite(num) else None

    records: List[dict] = []
    for _, row in df.iterrows():
        canon = _valid_canonical(str(row.get("psmiles", "")).strip())
        if canon is None:
            continue
        y = _finite_float(row.get(prop_col, None))
        if y is None:
            continue
        pself = psmiles_to_pselfies(canon)
        if pself is None:
            continue
        records.append(
            {
                "psmiles": canon,
                "pselfies": pself,
                "y": y,
                "graph": row.get("graph", None),
                "geometry": row.get("geometry", None),
                "fingerprints": row.get("fingerprints", None),
            }
        )
    return records
# =============================================================================
# Best-fold artifact saving (per property)
# =============================================================================
def save_best_fold_artifacts_for_property(
    property_name: str,
    fold_idx: int,
    decoder_state: Dict[str, torch.Tensor],
    prop_model: Optional[LatentPropertyModel],
    scaler: Optional[StandardScaler],
    best_val_loss: float,
    generations_payload: List[dict],
):
    """
    Persist the best fold for a property:
      - decoder state_dict
      - scaler + GPR (joblib, if available)
      - meta.json describing hyperparams
      - jsonl generations payload for traceability

    All writes are best-effort: missing joblib and JSON write failures are
    tolerated (logged/skipped) so training is never interrupted by I/O.
    """
    # Property name doubles as a directory/file stem, so spaces are replaced.
    safe_prop = property_name.replace(" ", "_")
    prop_dir = os.path.join(CFG.OUTPUT_MODELS_DIR, safe_prop)
    os.makedirs(prop_dir, exist_ok=True)
    decoder_path = os.path.join(prop_dir, f"decoder_best_fold{fold_idx+1}.pt")
    torch.save(decoder_state, decoder_path)
    # joblib is optional; without it the sklearn artifacts are simply skipped.
    try:
        import joblib
    except Exception:
        joblib = None
    if joblib is not None:
        if scaler is not None:
            joblib.dump(scaler, os.path.join(prop_dir, f"standardscaler_{safe_prop}.joblib"))
        if prop_model is not None:
            joblib.dump(prop_model, os.path.join(prop_dir, f"gpr_psmiles_{safe_prop}.joblib"))
    # Record the hyperparameters this fold was trained with.
    meta = {
        "property": property_name,
        "best_fold": int(fold_idx + 1),
        "best_val_loss": float(best_val_loss),
        "selfies_ted_model": str(SELFIES_TED_MODEL_NAME),
        "cl_emb_dim": int(CL_EMB_DIM),
        "mem_len": 4,
        "tol_scaled": float(PROP_TOL_SCALED),
        "tol_unscaled_abs": float(PROP_TOL_UNSCALED_ABS) if PROP_TOL_UNSCALED_ABS is not None else None,
        "optimizer": "AdamW",
        "lr": float(LEARNING_RATE),
        "weight_decay": float(WEIGHT_DECAY),
        "lr_scheduler": "CosineAnnealingLR",
        "epochs": int(NUM_EPOCHS),
        "batch_size": int(BATCH_SIZE),
        "patience": int(PATIENCE),
    }
    try:
        with open(os.path.join(prop_dir, "meta.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
    except Exception:
        pass
    # One JSON object per line, each tagged with property and fold.
    out_path = os.path.join(CFG.OUTPUT_GENERATIONS_DIR, f"{safe_prop}_best_fold{fold_idx+1}_generated_psmiles.jsonl")
    try:
        with open(out_path, "w", encoding="utf-8") as fh:
            for r in generations_payload:
                fh.write(json.dumps(make_json_serializable({"property": property_name, "best_fold": fold_idx + 1, **r})) + "\n")
    except Exception as e:
        print(f"[SAVE][WARN] Could not write generations for '{property_name}': {e}")
# =============================================================================
# Main per-property CV loop (single-task)
# =============================================================================
def run_inverse_design_single_property(
    df: pd.DataFrame,
    property_name: str,
    prop_col: str,
    cl_encoder: MultiModalCLPolymerEncoder,
    selfies_tok,
    selfies_model,
) -> Dict[str, Any]:
    """
    Run fivefold CV for a single property and log fold-level metrics.

    Per fold the pipeline is:
      1. split train/val/test; fit a StandardScaler on TRAIN targets only;
      2. fit a GPR surrogate on frozen CL PSMILES latents (train only);
      3. optionally train an auxiliary property oracle for verification;
      4. fine-tune a fresh CL-conditioned SELFIES-TED decoder with early
         stopping on validation loss;
      5. generate candidates for (up to 64 subsampled) scaled test targets
         and record validity/diversity/novelty/uniqueness/error metrics.

    Best fold is tracked by decoder validation loss and saved to disk
    immediately when it improves.

    Args:
        df: Source dataframe with PSMILES and property columns.
        property_name: Human-readable property label used in logs/paths.
        prop_col: Column of ``df`` holding the numeric property values.
        cl_encoder: Frozen multimodal CL encoder (conditioning + latents).
        selfies_tok: SELFIES-TED tokenizer. NOTE(review): unused here — a
            fresh tokenizer/backbone pair is loaded per fold below; kept
            for interface compatibility with the caller.
        selfies_model: SELFIES-TED backbone. Unused for the same reason.

    Returns:
        Dict with keys ``property``, ``runs`` (per-fold records), ``agg``
        (cross-fold mean/std per metric, or None if no fold completed)
        and ``n_samples`` (count of usable polymer records).
    """
    polymers = build_polymer_records(df, prop_col)
    # Soft warning below 200 samples; hard skip below 50 to avoid degenerate folds.
    if len(polymers) < 200:
        print(f"[{property_name}][WARN] Only {len(polymers)} usable samples; results may be noisy.")
    if len(polymers) < 50:
        print(f"[{property_name}][WARN] Skipping due to insufficient usable samples (<50).")
        return {"property": property_name, "runs": [], "agg": None, "n_samples": len(polymers)}
    indices = np.arange(len(polymers))
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
    runs = []
    best_overall_val = float("inf")
    best_bundle = None  # kept for completeness; artifacts saved immediately when best improves
    for fold_idx, (trainval_idx, test_idx) in enumerate(kf.split(indices)):
        # Deterministic but fold-distinct seeding (affects torch/np/random and
        # the np.random.choice subsampling of test targets further below).
        seed = 42 + fold_idx
        set_seed(seed)
        print(f"\n[{property_name}] Fold {fold_idx+1}/{NUM_FOLDS} | seed={seed}")
        trainval_polys = [polymers[i] for i in trainval_idx]
        test_polys = [polymers[i] for i in test_idx]
        # Train/val split within trainval (10% validation, fold-seeded).
        tr_idx, va_idx = train_test_split(np.arange(len(trainval_polys)), test_size=0.10, random_state=seed, shuffle=True)
        # Deep copies so per-fold mutation of records cannot leak across folds.
        train_polys = [copy.deepcopy(trainval_polys[i]) for i in tr_idx]
        val_polys = [copy.deepcopy(trainval_polys[i]) for i in va_idx]
        # Scale property targets using TRAIN only (prevents val/test leakage).
        sc = StandardScaler()
        sc.fit(np.array([p["y"] for p in train_polys], dtype=np.float32).reshape(-1, 1))
        # Helper to format records for latent dataset (drops the target; the
        # decoder is trained purely as latent -> PSELFIES reconstruction).
        def _to_rec(p):
            return {
                "psmiles": p["psmiles"],
                "pselfies": p["pselfies"],
                "graph": p.get("graph", None),
                "geometry": p.get("geometry", None),
                "fingerprints": p.get("fingerprints", None),
            }
        # Decoder training datasets (cache CL embeddings for speed). Latent
        # noise is injected on TRAIN only; validation uses clean latents.
        ds_train = LatentToPSELFIESDataset(
            [_to_rec(p) for p in train_polys],
            cl_encoder,
            selfies_tok,
            max_len=GEN_MAX_LEN,
            latent_noise_std=LATENT_NOISE_STD_TRAIN,
            cache_embeddings=True,
            use_multimodal=True,
        )
        ds_val = LatentToPSELFIESDataset(
            [_to_rec(p) for p in val_polys],
            cl_encoder,
            selfies_tok,
            max_len=GEN_MAX_LEN,
            latent_noise_std=0.0,
            cache_embeddings=True,
            use_multimodal=True,
        )
        dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=latent_collate)
        dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=latent_collate)
        # Fit GPR on PSMILES latents for this property (train only). The GPR
        # predicts SCALED targets (fit_latent_property_model receives y_scaler).
        y_tr = [float(p["y"]) for p in train_polys]
        psm_tr = [p["psmiles"] for p in train_polys]
        z_tr = cl_encoder.encode_psmiles(psm_tr, max_len=PSMILES_MAX_LEN, batch_size=64, device=DEVICE)
        prop_model = fit_latent_property_model(z_tr, np.array(y_tr, dtype=np.float32), y_scaler=sc)
        print(f"[{property_name}] Fit PSMILES-latent GPR (n_train={len(y_tr)})")
        # Optional aux oracle (scaled targets). Only attempted when splits are
        # large enough to train it reliably; failure degrades to oracle=None.
        oracle = None
        if VERIFY_GENERATED_PROPERTIES and len(train_polys) >= 200 and len(val_polys) >= 50:
            tr_s, va_s = [], []
            for p in train_polys:
                y_s = float(sc.transform(np.array([[p["y"]]], dtype=np.float32))[0, 0])
                tr_s.append({"psmiles": p["psmiles"], "target": p["y"], "target_scaled": y_s})
            for p in val_polys:
                y_s = float(sc.transform(np.array([[p["y"]]], dtype=np.float32))[0, 0])
                va_s.append({"psmiles": p["psmiles"], "target": p["y"], "target_scaled": y_s})
            try:
                oracle = train_property_oracle_per_fold(tr_s, va_s, cl_encoder.psm_tok, DEVICE, PSMILES_MAX_LEN)
                print(f"[{property_name}] Trained aux oracle (val_loss={getattr(oracle, 'aux_val_loss', None)})")
            except Exception as e:
                print(f"[{property_name}][WARN] Oracle training failed: {e}")
                oracle = None
        # Fresh decoder per fold + optimizer (avoids cross-fold weight leakage).
        # Note: a new tokenizer/backbone pair is loaded here rather than reusing
        # the selfies_tok/selfies_model arguments.
        selfies_tok_f, selfies_model_f = load_selfies_ted_and_tokenizer(SELFIES_TED_MODEL_NAME)
        decoder = CLConditionedSelfiesTEDGenerator(selfies_tok_f, selfies_model_f, cl_emb_dim=CL_EMB_DIM, mem_len=4).to(DEVICE)
        optimizer, scheduler = create_optimizer_and_scheduler_decoder(decoder)
        scaler_amp = torch.cuda.amp.GradScaler(enabled=USE_AMP)
        # Early-stopping bookkeeping: best val loss, CPU snapshot of weights,
        # and a patience counter.
        best_val = float("inf")
        best_state = None
        no_improve = 0
        for epoch in range(1, NUM_EPOCHS + 1):
            tr = train_one_epoch_decoder(decoder, dl_train, optimizer, scaler_amp, DEVICE)
            va = evaluate_decoder(decoder, dl_val, DEVICE)
            # Epoch-wise scheduler step; swallowed exception keeps training
            # alive if the scheduler type does not support a bare step().
            try:
                scheduler.step()
            except Exception:
                pass
            # Logging: include LR when readable, fall back to a shorter line.
            try:
                lr = float(optimizer.param_groups[0]["lr"])
                print(
                    f"[{property_name}] fold {fold_idx+1}/{NUM_FOLDS} | epoch {epoch:03d} | "
                    f"lr={lr:.2e} | train_loss={tr['loss']:.6f} | val_loss={va['loss']:.6f}"
                )
            except Exception:
                print(
                    f"[{property_name}] fold {fold_idx+1}/{NUM_FOLDS} | epoch {epoch:03d} | "
                    f"train_loss={tr['loss']:.6f} | val_loss={va['loss']:.6f}"
                )
            # Strict improvement (epsilon-guarded) resets patience and snapshots
            # weights to CPU so GPU memory is not held across epochs.
            if va["loss"] < best_val - 1e-8:
                best_val = va["loss"]
                no_improve = 0
                best_state = {k: v.detach().cpu().clone() for k, v in decoder.state_dict().items()}
            else:
                no_improve += 1
                if no_improve >= PATIENCE:
                    print(f"[{property_name}] Early stopping (no val improvement for {PATIENCE} epochs).")
                    break
        if best_state is None:
            # No epoch ever improved on +inf (e.g. NaN losses); skip fold.
            print(f"[{property_name}][WARN] No best state captured; skipping this fold.")
            continue
        # Restore best weights; strict=False tolerates key mismatches.
        decoder.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()}, strict=False)
        # Seed pool for generation (scaled property values, plus modalities for encoding).
        seed_pool = []
        for p in train_polys:
            y_s = float(sc.transform(np.array([[p["y"]]], dtype=np.float32))[0, 0])
            seed_pool.append(
                {
                    "psmiles": p["psmiles"],
                    "y_scaled": y_s,
                    "graph": p.get("graph", None),
                    "geometry": p.get("geometry", None),
                    "fingerprints": p.get("fingerprints", None),
                }
            )
        # Train-set PSMILES strings; used downstream for novelty accounting.
        train_targets_set = set(ps["psmiles"] for ps in train_polys)
        # Compose test targets (scaled); subsample for runtime control.
        ys_test_scaled = []
        for p in test_polys:
            ys_test_scaled.append(float(sc.transform(np.array([[p["y"]]], dtype=np.float32))[0, 0]))
        ys_test_scaled = np.array(ys_test_scaled, dtype=np.float32)
        if len(ys_test_scaled) > 64:
            # Uses the global NumPy RNG (seeded via set_seed above).
            ys_test_scaled = np.random.choice(ys_test_scaled, size=64, replace=False)
        # Generate per target, accumulating per-target metric lists.
        all_valid, all_poly, all_kept, success_scaled, mae_best, diversity_vals = [], [], [], [], [], []
        novelty_vals, uniqueness_vals = [], []
        per_target_records = []
        for y_t in ys_test_scaled:
            out = generate_for_target(
                target_y_scaled=float(y_t),
                prop_model=prop_model,
                cl_encoder=cl_encoder,
                generator=decoder,
                train_seed_pool=seed_pool,
                train_targets_set=train_targets_set,
                n_seeds=8,
                n_noise=min(N_FOLD_NOISE_SAMPLING, 16),
                noise_std=LATENT_NOISE_STD_GEN,
                prop_tol_scaled=PROP_TOL_SCALED,
                oracle=oracle,
                psmiles_tokenizer=cl_encoder.psm_tok,
            )
            m = out["metrics"]
            all_valid.append(float(m.get("validity", 0.0)))
            all_poly.append(float(m.get("polymer_validity", 0.0)))
            all_kept.append(int(m.get("n_kept_property_filtered", 0)))
            diversity_vals.append(float(m.get("diversity", 0.0)))
            # Success = at least one candidate survived the property filter.
            success_scaled.append(1.0 if int(m.get("n_kept_property_filtered", 0)) > 0 else 0.0)
            novelty_vals.append(float(m.get("novelty_kept", 0.0)))
            uniqueness_vals.append(float(m.get("uniqueness_valid_unique", 0.0)))
            # Best error among kept candidates (GPR-predicted, scaled space).
            # inf sentinel when nothing was kept; filtered out in metrics_fold.
            if out["generated"]:
                z_keep = cl_encoder.encode_psmiles(out["generated"], max_len=PSMILES_MAX_LEN, batch_size=64, device=DEVICE)
                y_pred_s, _ = predict_latent_property(prop_model, z_keep)
                if len(y_pred_s):
                    err = np.abs(y_pred_s - float(y_t))
                    mae_best.append(float(np.min(err)))
                else:
                    mae_best.append(float("inf"))
            else:
                mae_best.append(float("inf"))
            target_y_unscaled = float(sc.inverse_transform(np.array([[float(y_t)]], dtype=np.float32))[0, 0])
            # Aux oracle predictions are only trusted when returned as a list.
            aux_list = out.get("aux_pred_scaled", None)
            if aux_list is not None and not isinstance(aux_list, list):
                aux_list = None
            # Assemble per-candidate records; index-guarded because prediction
            # lists may be shorter than the generated-string list.
            candidates = []
            gen_list = out.get("generated", []) or []
            pred_s_list = out.get("pred_scaled_kept", []) or []
            pred_u_list = out.get("pred_unscaled_kept", []) or []
            for i_c, psm in enumerate(gen_list):
                cand = {
                    "psmiles": str(psm),
                    "pred_scaled": float(pred_s_list[i_c]) if i_c < len(pred_s_list) else None,
                    "pred_unscaled": float(pred_u_list[i_c]) if i_c < len(pred_u_list) else None,
                    "aux_pred_scaled": float(aux_list[i_c]) if (aux_list is not None and i_c < len(aux_list)) else None,
                }
                candidates.append(cand)
            # Persist enough scaler metadata to unscale predictions offline.
            scaler_meta = {
                "scaler_type": "StandardScaler",
                "mean_": getattr(sc, "mean_", None),
                "scale_": getattr(sc, "scale_", None),
                "with_mean": bool(getattr(sc, "with_mean", True)),
                "with_std": bool(getattr(sc, "with_std", True)),
            }
            per_target_records.append(
                {
                    "target_y_scaled": float(y_t),
                    "target_y_unscaled": float(target_y_unscaled),
                    "tol_scaled": float(PROP_TOL_SCALED),
                    "tol_unscaled_abs": float(PROP_TOL_UNSCALED_ABS) if PROP_TOL_UNSCALED_ABS is not None else None,
                    "scaler_meta": scaler_meta,
                    "candidates": candidates,
                    "metrics": m,
                }
            )
        # Drop the inf sentinels before averaging best-candidate errors.
        def _finite(xs):
            return [x for x in xs if np.isfinite(x)]
        metrics_fold = {
            "validity_mean": float(np.mean(all_valid)) if all_valid else 0.0,
            "polymer_validity_mean": float(np.mean(all_poly)) if all_poly else 0.0,
            "avg_n_kept": float(np.mean(all_kept)) if all_kept else 0.0,
            "success_at_k_scaled": float(np.mean(success_scaled)) if success_scaled else 0.0,
            "mae_best_scaled": float(np.mean(_finite(mae_best))) if _finite(mae_best) else 0.0,
            "diversity_mean": float(np.mean(diversity_vals)) if diversity_vals else 0.0,
            "novelty_mean": float(np.mean(novelty_vals)) if novelty_vals else 0.0,
            "uniqueness_mean": float(np.mean(uniqueness_vals)) if uniqueness_vals else 0.0,
            "tol_scaled": float(PROP_TOL_SCALED),
            "tol_unscaled_abs": float(PROP_TOL_UNSCALED_ABS) if PROP_TOL_UNSCALED_ABS is not None else None,
        }
        run_record = {
            "property": property_name,
            "fold": int(fold_idx + 1),
            "seed": int(seed),
            "n_train": int(len(train_polys)),
            "n_val": int(len(val_polys)),
            "n_test": int(len(test_polys)),
            "best_val_loss": float(best_val),
            "gen_metrics": metrics_fold,
        }
        runs.append(run_record)
        # Append fold record as JSONL so partial runs are still recoverable.
        with open(CFG.OUTPUT_RESULTS, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(make_json_serializable(run_record)) + "\n")
        # Save best fold artifacts by lowest validation loss (saved eagerly;
        # best_bundle itself is never read afterwards — dead store kept for
        # completeness, as noted at the top of the loop).
        if best_val < best_overall_val - 1e-8:
            best_overall_val = best_val
            best_bundle = {
                "fold": int(fold_idx + 1),
                "decoder_state": best_state,
                "prop_model": prop_model,
                "scaler": sc,
                "best_val_loss": float(best_val),
                "generations": per_target_records,
            }
            save_best_fold_artifacts_for_property(
                property_name=property_name,
                fold_idx=fold_idx,
                decoder_state=best_state,
                prop_model=prop_model,
                scaler=sc,
                best_val_loss=best_val,
                generations_payload=per_target_records,
            )
            print(f"[{property_name}] Saved best-fold artifacts (fold {fold_idx+1}, val_loss={best_val:.6f}).")
    # Aggregate across folds (mean/std per metric key).
    if not runs:
        return {"property": property_name, "runs": [], "agg": None, "n_samples": len(polymers)}
    def _collect(key):
        xs = [float(r["gen_metrics"].get(key, 0.0)) for r in runs if r.get("gen_metrics", None) is not None]
        return (float(np.mean(xs)) if xs else 0.0, float(np.std(xs)) if xs else 0.0)
    agg = {}
    for k in [
        "validity_mean",
        "polymer_validity_mean",
        "avg_n_kept",
        "success_at_k_scaled",
        "mae_best_scaled",
        "diversity_mean",
        "novelty_mean",
        "uniqueness_mean",
    ]:
        m, s = _collect(k)
        agg[k] = {"mean": m, "std": s}
    agg["tol_scaled"] = float(PROP_TOL_SCALED)
    agg["tol_unscaled_abs"] = float(PROP_TOL_UNSCALED_ABS) if PROP_TOL_UNSCALED_ABS is not None else None
    with open(CFG.OUTPUT_RESULTS, "a", encoding="utf-8") as fh:
        fh.write("AGG_PROPERTY: " + json.dumps(make_json_serializable({property_name: agg})) + "\n")
    return {"property": property_name, "runs": runs, "agg": agg, "n_samples": len(polymers)}
# =============================================================================
# Entrypoint (single-task per property)
# =============================================================================
def main():
    """
    Entry point: prepare output dirs, load the PolyInfo dataset, build the
    frozen multimodal CL encoder plus the SELFIES-TED backbone, then run the
    single-task inverse-design CV loop for each requested property and append
    a final aggregated summary to the results file.

    Raises:
        RuntimeError: if RDKit/selfies are unavailable or the PSMILES
            tokenizer cannot be built.
        FileNotFoundError: if the PolyInfo CSV is missing.
    """
    ensure_output_dirs(CFG)
    if not (RDKit_AVAILABLE and SELFIES_AVAILABLE):
        raise RuntimeError("This script requires RDKit and selfies. Install them before running.")

    # Preserve any previous results file as a .bak copy, then truncate it.
    results_path = CFG.OUTPUT_RESULTS
    if os.path.exists(results_path):
        backup = results_path + ".bak"
        shutil.copy(results_path, backup)
        print(f"[IO][INFO] Backed up existing results file to: {backup}")
    with open(results_path, "w", encoding="utf-8"):
        pass

    # Load the dataset and resolve which column serves each requested property.
    if not os.path.isfile(CFG.POLYINFO_PATH):
        raise FileNotFoundError(f"PolyInfo CSV not found: {CFG.POLYINFO_PATH}")
    df = pd.read_csv(CFG.POLYINFO_PATH, engine="python")
    matched_cols = find_property_columns(df.columns)
    prop_map = {req: matched_cols.get(req) for req in REQUESTED_PROPERTIES}

    # Run banner with the effective environment / hyper-parameter settings.
    print("\n" + "=" * 80)
    print("[RUN] Inverse design (single-task per property)")
    print("=" * 80)
    print(f"[ENV] RDKit_AVAILABLE={RDKit_AVAILABLE} | SELFIES_AVAILABLE={SELFIES_AVAILABLE}")
    print(f"[ENV] DEVICE={DEVICE} | USE_AMP={USE_AMP} | NUM_WORKERS={NUM_WORKERS}")
    print(f"[DATA] POLYINFO_PATH={CFG.POLYINFO_PATH}")
    print(f"[DATA] Property map: {prop_map}")
    print(f"[CL] CL checkpoint dir: {CFG.PRETRAINED_MULTIMODAL_DIR}")
    print(f"[DEC] SELFIES_TED_MODEL_NAME={SELFIES_TED_MODEL_NAME}")
    print(
        f"[DEC] FT params: batch={BATCH_SIZE}, epochs={NUM_EPOCHS}, patience={PATIENCE}, "
        f"lr={LEARNING_RATE}, wd={WEIGHT_DECAY}, sched=CosineAnnealingLR(eta_min={COSINE_ETA_MIN})"
    )
    print(f"[GEN] Latent noise: train_std={LATENT_NOISE_STD_TRAIN}, gen_std={LATENT_NOISE_STD_GEN}, n_noise={N_FOLD_NOISE_SAMPLING}")
    print(f"[GEN] Filter tol: scaled={PROP_TOL_SCALED}, abs={PROP_TOL_UNSCALED_ABS}")
    print(f"[AUX] VERIFY_GENERATED_PROPERTIES={VERIFY_GENERATED_PROPERTIES}")
    print("=" * 80 + "\n")

    # PSMILES tokenizer feeding the CL text encoder.
    tokenizer = build_psmiles_tokenizer(spm_path=CFG.SPM_MODEL_PATH, max_len=PSMILES_MAX_LEN)
    if tokenizer is None:
        raise RuntimeError("Failed to build PSMILES tokenizer (check SPM_MODEL_PATH).")

    # Multimodal CL encoder, frozen — serves purely as a conditioning interface.
    cl_encoder = MultiModalCLPolymerEncoder(
        psmiles_tokenizer=tokenizer,
        emb_dim=CL_EMB_DIM,
        cl_weights_dir=CFG.PRETRAINED_MULTIMODAL_DIR,
        use_gine=True,
        use_schnet=True,
        use_fp=True,
        use_psmiles=True,
    ).to(DEVICE)
    cl_encoder.freeze_cl_encoders()

    # SELFIES-TED decoder backbone (re-loaded fresh per fold downstream).
    selfies_tok, selfies_model = load_selfies_ted_and_tokenizer(SELFIES_TED_MODEL_NAME)
    print(f"[HF][INFO] Loaded SELFIES-TED backbone: {SELFIES_TED_MODEL_NAME}")

    # Single-task loop per property.
    overall = {"per_property": {}}
    for pname in REQUESTED_PROPERTIES:
        pcol = prop_map.get(pname, None)
        if pcol is None:
            print(f"[{pname}][WARN] No column match found; skipping.")
            continue
        print(f"\n>>> Property: '{pname}' | column='{pcol}'")
        overall["per_property"][pname] = run_inverse_design_single_property(
            df, pname, pcol, cl_encoder, selfies_tok, selfies_model
        )

    # Final summary (aggregated per property), appended to the results file.
    final_agg = {pname: info.get("agg", None) for pname, info in overall["per_property"].items()}
    with open(results_path, "a", encoding="utf-8") as fh:
        fh.write("\nFINAL_SUMMARY\n")
        fh.write(json.dumps(make_json_serializable(final_agg), indent=2))
        fh.write("\n")

    print("\n" + "=" * 80)
    print("Finished inverse design runs.")
    print(f"Results file: {CFG.OUTPUT_RESULTS}")
    print(f"Best models dir: {CFG.OUTPUT_MODELS_DIR}")
    print(f"Best-fold generations dir: {CFG.OUTPUT_GENERATIONS_DIR}")
    print("=" * 80)
# Run the full inverse-design pipeline only when executed as a script.
if __name__ == "__main__":
    main()