Enhance inference: tokenizer, classifier, scorer

Refactors and extends the METANO inference stack: adds a rule-based MolecularClassifier for coarse molecule typing, hardens CharacterLevelChemicalTokenizer (marker handling, normalization, vocab markers), and introduces a SymbolicScorer with heuristic penalties and balanced-bracket checks. Implements BalancedBracketsLogitsProcessor to constrain generation, updates predict_neurosymbolic to a hybrid decode/repair flow (rescoring neural candidates with symbolic heuristics, then running constrained repair rounds), normalizes candidate scoring and deduplication, and adjusts defaults (e.g. sym_lambda, generation modes). MetanoModel now accepts a pretrained T5 instance loaded from the Hugging Face Hub, and load_model_from_hf wraps the loaded model accordingly. Also updates the README formatting and the test_run.py output to show richer test diagnostics.

Files changed (3) hide show

README.md +37 -27
metano_inference.py +576 -106
test_run.py +4 -3

README.md CHANGED Viewed

@@ -56,16 +56,16 @@ strings into human‑readable IUPAC names**.
 It is intended for:
--   Cheminformatics researchers\
--   Computational chemists\
--   Chemical database maintainers\
 -   AI-driven chemistry pipelines
 The model is particularly useful for molecules containing:
--   Transition metals\
--   Alkali metals\
--   Lanthanides\
 -   Actinides
 ------------------------------------------------------------------------
@@ -74,9 +74,9 @@ The model is particularly useful for molecules containing:
 The model is **not intended for:**
--   Generating molecular 3D structures\
--   Predicting chemical properties\
--   Reaction prediction\
 -   Translating formats other than InChI (e.g., SMILES) directly to
     IUPAC without conversion
@@ -98,13 +98,13 @@ bracket balancing and basic chemical syntax constraints, it remains a
 Potential issues include:
--   Hallucinated nomenclature for unseen structures\
--   Reduced accuracy for extremely large molecules\
 -   Errors for polymeric or highly unusual compounds
 Training limits:
--   **Maximum InChI length:** 400 characters\
 -   **Maximum IUPAC length:** 150 characters
 ------------------------------------------------------------------------
@@ -147,13 +147,23 @@ out = predict_neurosymbolic(
     inchi=test_inchi,
     scorer=scorer,
     num_candidates=5,
-    sym_lambda=1.0,
     repair_num_candidates=5,
     max_repair_rounds=1
 )
-print("Predicted IUPAC:", out["predicted_iupac"])
-print("Combined Score:", out["combined_score"])
 ```
 ------------------------------------------------------------------------
@@ -167,8 +177,8 @@ covering diverse chemical classes.
 Training subsets include:
--   \~294K inorganic combinations\
--   \~123K organometallic compounds\
 -   \~82K coordination complexes
 Both **standard and reconnected (/r) InChI strings** were included.
@@ -206,13 +216,13 @@ were added using a custom **CharacterLevelChemicalTokenizer**.
 ## Training Hyperparameters
--   **Training regime:** fp16 mixed precision (AMP)\
--   **Optimizer:** AdamW\
--   **Learning Rate:** 3e‑4 with 10% linear warmup and linear decay\
--   **Weight Decay:** 0.01\
--   **Batch Size:** 128 (effective via gradient accumulation = 2)\
--   **Max Input Length:** 410 tokens\
--   **Max Output Length:** 160 tokens\
 -   **Gradient Clipping:** 1.0
 ------------------------------------------------------------------------
@@ -224,9 +234,9 @@ were added using a custom **CharacterLevelChemicalTokenizer**.
 Evaluation was conducted on a **held‑out test split** containing a
 balanced distribution of:
--   **Inorganic Compounds:** METANO achieves a Top-1 accuracy of 0.378, outperforming previously reported results of 0.14.\
--   **Organometallic Compounds:** METANO achieves a Top-1 accuracy of 0.364, outperforming previously reported results of 0.20.\
--   **Co-ordination Compounds:** METANO achieves a Top-1 accuracy of 0.394.\
 -   **Top-K Decoding:** Additional gains are observed using Top-K decoding, reaching Top-5 accuracies of 0.481 (inorganic), 0.488 (organometallic) and 0.521 (co-ordination).
 ------------------------------------------------------------------------

 It is intended for:
+-   Cheminformatics researchers
+-   Computational chemists
+-   Chemical database maintainers
 -   AI-driven chemistry pipelines
 The model is particularly useful for molecules containing:
+-   Transition metals
+-   Alkali metals
+-   Lanthanides
 -   Actinides
 ------------------------------------------------------------------------
 The model is **not intended for:**
+-   Generating molecular 3D structures
+-   Predicting chemical properties
+-   Reaction prediction
 -   Translating formats other than InChI (e.g., SMILES) directly to
     IUPAC without conversion
 Potential issues include:
+-   Hallucinated nomenclature for unseen structures
+-   Reduced accuracy for extremely large molecules
 -   Errors for polymeric or highly unusual compounds
 Training limits:
+-   **Maximum InChI length:** 400 characters
 -   **Maximum IUPAC length:** 150 characters
 ------------------------------------------------------------------------
     inchi=test_inchi,
     scorer=scorer,
     num_candidates=5,
     repair_num_candidates=5,
     max_repair_rounds=1
 )
+print("=== TEST RESULTS ===")
+print(f"Predicted IUPAC: {out['predicted_iupac']}")
+print(f"Hard Fail Triggered: {out['hard_fail']}")
+print(f"Combined Score: {out['combined_score']:.3f}")
+print(f"Symbolic Score: {out['symbolic_score']:.3f}")
+print(f"Neural Score: {out['neural_score']:.3f}")
+if out['reasons']:
+    print(f"Penalty Reasons: {out['reasons']}")
+print("\nTop Candidates:")
+for cand in out["candidates"][1:]:
+    print(f"  [{cand['combined']:.3f}] {cand['text']}")
 ```
 ------------------------------------------------------------------------
 Training subsets include:
+-   \~294K inorganic combinations
+-   \~123K organometallic compounds
 -   \~82K coordination complexes
 Both **standard and reconnected (/r) InChI strings** were included.
 ## Training Hyperparameters
+-   **Training regime:** fp16 mixed precision (AMP)
+-   **Optimizer:** AdamW
+-   **Learning Rate:** 3e‑4 with 10% linear warmup and linear decay
+-   **Weight Decay:** 0.01
+-   **Batch Size:** 128 (effective via gradient accumulation = 2)
+-   **Max Input Length:** 410 tokens
+-   **Max Output Length:** 160 tokens
 -   **Gradient Clipping:** 1.0
 ------------------------------------------------------------------------
 Evaluation was conducted on a **held‑out test split** containing a
 balanced distribution of:
+-   **Inorganic Compounds:** METANO achieves a Top-1 accuracy of 0.378, outperforming previously reported results of 0.14.
+-   **Organometallic Compounds:** METANO achieves a Top-1 accuracy of 0.364, outperforming previously reported results of 0.20.
+-   **Co-ordination Compounds:** METANO achieves a Top-1 accuracy of 0.394.
 -   **Top-K Decoding:** Additional gains are observed using Top-K decoding, reaching Top-5 accuracies of 0.481 (inorganic), 0.488 (organometallic) and 0.521 (co-ordination).
 ------------------------------------------------------------------------

metano_inference.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import re
 import unicodedata
 import numpy as np
 import torch
@@ -13,29 +14,130 @@ from transformers.generation.logits_process import LogitsProcessor
 # Define device globally for inference
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 @dataclass
 class ModelConfig:
     """Configuration for METANO Model"""
     model_name: str = "t5-small"
     max_input_length: int = 410
     max_output_length: int = 160
     metal_elements: List[str] = field(
         default_factory=lambda: [
-            "Li", "Na", "K", "Rb", "Cs", "Be", "Mg", "Ca", "Sr", "Ba", "Sc", "Ti",
-            "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Y", "Zr", "Nb", "Mo",
-            "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "Hf", "Ta", "W", "Re", "Os", "Ir",
-            "Pt", "Au", "Hg", "Al", "Ga", "In", "Tl", "Sn", "Pb", "Bi",
         ]
     )
 class MolecularClassifier:
     def __init__(self):
-        self.transition_metals = {22, 23, 24, 25, 26, 27, 28, 29, 30, 40, 41, 42, 43, 44, 45, 46, 47, 48, 72, 73, 74, 75, 76, 77, 78, 79, 80}
-        self.main_group_metals = {3, 4, 11, 12, 13, 19, 20, 31, 37, 38, 49, 50, 55, 56, 81, 82, 83}
         self.lanthanides = set(range(57, 72))
         self.actinides = set(range(89, 104))
-        self.all_metals = self.transition_metals | self.main_group_metals | self.lanthanides | self.actinides
         self.organometallic_patterns = [
             "[Fe,Co,Ni,Cr,Mn,Mo,W,Ru,Os,Rh,Ir]-C=O",
             "[Fe,Co,Ni,Cr,Mn,Mo,W,Ru,Os,Rh,Ir]-[C-]#[O+]",
@@ -43,6 +145,7 @@ class MolecularClassifier:
             "[Fe,Co,Ni,Ru,Rh,Os,Ir,Ti,V,Cr,Mn,Zr,Mo,W]~c1ccccc1",
         ]
         self.compiled_organometallic = []
         for pattern in self.organometallic_patterns:
             try:
@@ -50,16 +153,37 @@ class MolecularClassifier:
                 if mol is not None:
                     self.compiled_organometallic.append(mol)
             except Exception:
                 pass
     def classify_molecule(self, mol: Chem.Mol) -> Dict[str, any]:
         try:
             has_carbon = self._has_element(mol, 6)
             has_metal = self._has_metals(mol)
             classification = self._classify_by_composition(mol, has_carbon, has_metal)
-            metal_info = self._extract_metals(mol) if has_metal else {"metal_atomic_nums": set(), "metal_symbols": [], "primary_metal": None}
-            return {"classification": classification, "has_metal": has_metal, "primary_metal": metal_info["primary_metal"]}
         except Exception as e:
             return {"classification": "error", "error": str(e)}
     def _has_element(self, mol: Chem.Mol, atomic_num: int) -> bool:
@@ -76,26 +200,42 @@ class MolecularClassifier:
             if z in self.all_metals:
                 metal_atomic_nums.add(z)
                 metal_symbols.append(atom.GetSymbol())
         seen = set()
         metal_symbols = [m for m in metal_symbols if not (m in seen or seen.add(m))]
-        return {"metal_atomic_nums": metal_atomic_nums, "metal_symbols": metal_symbols, "primary_metal": metal_symbols[0] if metal_symbols else None}
     def _has_metal_carbon_bond(self, mol: Chem.Mol) -> bool:
         for bond in mol.GetBonds():
-            a1_num, a2_num = bond.GetBeginAtom().GetAtomicNum(), bond.GetEndAtom().GetAtomicNum()
-            if (a1_num in self.all_metals and a2_num == 6) or (a1_num == 6 and a2_num in self.all_metals):
                 return True
         return False
     def _recover_organometallic_by_smarts(self, mol: Chem.Mol) -> bool:
-        return any(mol.HasSubstructMatch(pattern) for pattern in self.compiled_organometallic)
     def _has_metal_heteroatom_bond(self, mol: Chem.Mol) -> bool:
         donor_atoms = {7, 8, 9, 15, 16, 17, 35, 53}
         for bond in mol.GetBonds():
-            z1, z2 = bond.GetBeginAtom().GetAtomicNum(), bond.GetEndAtom().GetAtomicNum()
-            if (z1 in self.all_metals and z2 in donor_atoms) or (z2 in self.all_metals and z1 in donor_atoms):
                 return True
         return False
@@ -109,12 +249,20 @@ class MolecularClassifier:
                         return True
         return False
-    def _classify_by_composition(self, mol: Chem.Mol, has_carbon: bool, has_metal: bool) -> str:
         if has_metal and has_carbon:
-            if self._has_metal_carbon_bond(mol) or self._recover_organometallic_by_smarts(mol):
                 return "organometallic"
             elif self._has_metal_heteroatom_bond(mol):
-                return "inorganic" if self._is_simple_inorganic_salt(mol) else "coordination"
             return "inorganic"
         elif (has_metal and not has_carbon) or (not has_carbon and not has_metal):
             return "inorganic"
@@ -122,20 +270,96 @@ class MolecularClassifier:
             return "organic"
         return "unclassified"
 class CharacterLevelChemicalTokenizer:
     def __init__(self, config):
         self.config = config
         self.metals = set(self.config.metal_elements)
-        self.control_tokens = ["<ORGANIC>", "<ORGANOMETALLIC>", "<INORGANIC>", "<COORDINATION>", "<STANDARD_INCHI>", "<RECONNECTED_INCHI>"]
-        self.structural_markers = ["<METAL>"] + [f"<METAL_{metal.upper()}>" for metal in sorted(self.metals)]
         self.specials = ["<PAD>", "<UNK>", "<START>", "<END>"]
         base_chars = [
-            " ", "-", "=", "#", "+", "(", ")", "[", "]", "{", "}", "/", "\\", ",", ".", ":", ";", "@", "*", "&", "|", "'", '"',
-            *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", *"0123456789",
-            "α", "β", "γ", "δ", "ε", "ζ", "η", "θ", "κ", "λ", "μ", "ν", "ξ", "π", "ρ", "σ", "τ", "φ", "χ", "ψ", "ω", "Δ", "Λ",
-            "⁰", "¹", "²", "³", "⁴", "⁵", "⁶", "⁷", "⁸", "⁹", "⁺", "⁻", "₀", "₁", "₂", "₃", "₄", "₅", "₆", "₇", "₈", "₉",
         ]
         all_markers = self.control_tokens + self.structural_markers
@@ -151,23 +375,31 @@ class CharacterLevelChemicalTokenizer:
         self.bos_token_id = self.token2idx["<START>"]
         self.eos_token_id = self.token2idx["<END>"]
-        self.marker_pattern = re.compile("|".join(map(re.escape, self.sorted_markers))) if self.sorted_markers else None
     def _normalize(self, text: str) -> str:
-        if text is None: return ""
         text = unicodedata.normalize("NFKC", str(text)).replace("\u00a0", " ").strip()
         return " ".join(text.split())
     def tokenize(self, text: str) -> List[str]:
         text = self._normalize(text)
-        if not text: return []
         tokens, pos = [], 0
         if self.marker_pattern:
             for m in self.marker_pattern.finditer(text):
-                if m.start() > pos: tokens.extend(list(text[pos : m.start()]))
                 tokens.append(m.group())
                 pos = m.end()
-        if pos < len(text): tokens.extend(list(text[pos:]))
         return tokens
     def encode(self, text: str, max_length: int, is_target: bool = False) -> Dict:
@@ -185,21 +417,32 @@ class CharacterLevelChemicalTokenizer:
             padded_ids = input_ids + [-100] * pad_len
         else:
             padded_ids = input_ids + [self.pad_token_id] * pad_len
         attention_mask = [1] * len(input_ids) + [0] * pad_len
         return {
             "input_ids": torch.tensor(padded_ids, dtype=torch.long),
             "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
         }
-    def decode(self, token_ids: Union[torch.Tensor, List[int]], skip_special_tokens: bool = True) -> str:
-        if isinstance(token_ids, torch.Tensor): token_ids = token_ids.tolist()
         out_tokens = []
         for idx in token_ids:
-            if idx == self.eos_token_id or idx == -100: break
-            if idx == self.pad_token_id: continue
             tok = self.idx2token.get(idx, "<UNK>")
-            if skip_special_tokens and (tok in self.specials or tok in self.control_tokens or tok in self.structural_markers):
                 continue
             out_tokens.append(tok)
         return "".join(out_tokens).strip()
@@ -207,155 +450,376 @@ class CharacterLevelChemicalTokenizer:
     def get_vocab_size(self) -> int:
         return self.vocab_size
-    def preprocess_inchi(self, inchi: str, category: Optional[str] = None, has_metal: bool = False, primary_metal: Optional[str] = None) -> str:
-        if not inchi: return ""
         control_prefix = []
         category_lower = category.lower() if category else "organic"
-        if "organometallic" in category_lower: control_prefix.append("<ORGANOMETALLIC>")
-        elif "coordination" in category_lower: control_prefix.append("<COORDINATION>")
-        elif "inorganic" in category_lower: control_prefix.append("<INORGANIC>")
-        else: control_prefix.append("<ORGANIC>")
-        control_prefix.append("<RECONNECTED_INCHI>" if "/r" in inchi else "<STANDARD_INCHI>")
         if has_metal:
-            metal_tok = f"<METAL_{primary_metal.upper()}>" if primary_metal else "<METAL>"
-            control_prefix.append(metal_tok if metal_tok in self.token2idx else "<METAL>")
         return "".join(control_prefix) + inchi
     def preprocess_iupac(self, iupac: str) -> str:
         return self._normalize(iupac)
 class MetanoModel(nn.Module):
-    def __init__(self, config: ModelConfig, classifier: MolecularClassifier, pretrained_t5: Optional[T5ForConditionalGeneration] = None):
         super().__init__()
         self.config = config
         self.classifier = classifier
         self.tokenizer = CharacterLevelChemicalTokenizer(config)
-        if pretrained_t5 is not None:
-            self.model = pretrained_t5
-        else:
-            t5_config = T5Config.from_pretrained(config.model_name)
-            t5_config.vocab_size = self.tokenizer.get_vocab_size()
-            t5_config.pad_token_id = self.tokenizer.pad_token_id
-            t5_config.eos_token_id = self.tokenizer.eos_token_id
-            t5_config.decoder_start_token_id = self.tokenizer.pad_token_id
-            self.model = T5ForConditionalGeneration(config=t5_config)
-            self.model.resize_token_embeddings(self.tokenizer.get_vocab_size())
         self.model.config.use_cache = True
 @dataclass
 class SymbolicResult:
     score: float
     hard_fail: bool
     reasons: List[str]
 class SymbolicScorer:
     def __init__(self, metals: List[str]):
         self.metals = [m.lower() for m in metals]
     def _balanced(self, s: str) -> Tuple[bool, List[str]]:
         stack, pairs = [], {")": "(", "]": "[", "}": "{"}
         opens, closes = set(pairs.values()), set(pairs.keys())
         for ch in s:
-            if ch in opens: stack.append(ch)
             elif ch in closes:
-                if not stack or stack[-1] != pairs[ch]: return False, [f"Unbalanced bracket: found '{ch}' without matching '{pairs[ch]}'"]
                 stack.pop()
-        if stack: return False, [f"Unbalanced bracket: missing closers for {stack}"]
         return True, []
     def score(self, src: str, pred: str) -> SymbolicResult:
         reasons = []
         ok_balance, balance_reasons = self._balanced(pred)
         reasons.extend(balance_reasons)
-        if len(pred.strip()) == 0: reasons.append("Empty prediction")
-        if re.search(r"[,\\.\\-]{3,}", pred): reasons.append("Repeated punctuation")
-        if "  " in pred: reasons.append("Double spaces")
         hard_fail = (not ok_balance) or ("Empty prediction" in reasons)
-        score = 0.5 if not reasons else sum([-2.0 if "Unbalanced" in r else -3.0 if "Empty" in r else -0.5 for r in reasons])
         return SymbolicResult(score=score, hard_fail=hard_fail, reasons=reasons)
 class BalancedBracketsLogitsProcessor(LogitsProcessor):
     def __init__(self, tok2id: Dict[str, int]):
-        self.ids = {ch: tok2id[ch] for ch in ["(", ")", "[", "]", "{", "}"] if ch in tok2id}
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if not self.ids: return scores
         for b in range(input_ids.size(0)):
             seq = input_ids[b].tolist()
-            open_par, close_par = seq.count(self.ids.get("(", -1)), seq.count(self.ids.get(")", -1))
-            open_sq, close_sq = seq.count(self.ids.get("[", -1)), seq.count(self.ids.get("]", -1))
-            if close_par >= open_par and ")" in self.ids: scores[b, self.ids[")"]] = -float("inf")
-            if close_sq >= open_sq and "]" in self.ids: scores[b, self.ids["]"]] = -float("inf")
         return scores
 @torch.no_grad()
 def predict_neurosymbolic(
-    model: MetanoModel, inchi: str, scorer: SymbolicScorer,
-    category: Optional[str] = None, has_metal: Optional[bool] = None, primary_metal: Optional[str] = None,
-    num_candidates: int = 8, sym_lambda: float = 1.0, repair_num_candidates: int = 16, max_repair_rounds: int = 3
 ):
     model.model.eval()
     if category is None or has_metal is None:
         mol = Chem.MolFromInchi(inchi)
         if mol is not None:
             classification = model.classifier.classify_molecule(mol)
-            if category is None: category = classification.get("classification", "organic")
-            if has_metal is None: has_metal = classification.get("has_metal", False)
-            if primary_metal is None and has_metal: primary_metal = classification.get("primary_metal")
         else:
-            if category is None: category = "organic"
-            if has_metal is None: has_metal = False
     category = category or "organic"
-    src = model.tokenizer.preprocess_inchi(inchi, category=category, has_metal=has_metal, primary_metal=primary_metal)
     enc = model.tokenizer.encode(src, model.config.max_input_length)
     input_ids = enc["input_ids"].unsqueeze(0).to(device)
     attention_mask = enc["attention_mask"].unsqueeze(0).to(device)
     def _dedup_key(s: str) -> str:
-        if not s: return ""
         s = " ".join(s.strip().lower().split())
         return re.sub(r"\s*([(),\[\]{}\-+/=.:;·])\s*", r"\1", s)
     def _generate(ncand: int, use_constraints: bool, mode: str = "beam"):
         kwargs = dict(
-            input_ids=input_ids, attention_mask=attention_mask, max_length=model.config.max_output_length,
-            num_beams=ncand, num_return_sequences=ncand, early_stopping=True,
-            pad_token_id=model.tokenizer.pad_token_id, eos_token_id=model.tokenizer.eos_token_id,
-            return_dict_in_generate=True, output_scores=True,
         )
-        if use_constraints: kwargs["logits_processor"] = [BalancedBracketsLogitsProcessor(model.tokenizer.token2idx)]
-        if mode == "sample": kwargs.update(do_sample=True, top_p=0.92, temperature=0.8, num_beams=1)
-        out = model.model.generate(**kwargs) if device.type != "cuda" else \
-              torch.autocast(device_type="cuda", dtype=torch.float16)(model.model.generate)(**kwargs)
-        preds = [model.tokenizer.decode(seq, skip_special_tokens=True) for seq in out.sequences]
-        neural_scores = out.sequences_scores.detach().float().cpu().numpy() if hasattr(out, "sequences_scores") else np.zeros(ncand)
         return preds, neural_scores
     pool = {}
     def _add_to_pool(preds, nscores):
         for p, ns in zip(preds, nscores):
             sym = scorer.score(src, p)
-            combined = float(ns) + sym_lambda * float(sym.score)
             key = _dedup_key(p)
             if key not in pool or combined > pool[key][0]:
-                pool[key] = (combined, float(ns), float(sym.score), sym.hard_fail, sym.reasons, p)
     preds1, ns1 = _generate(num_candidates, use_constraints=False, mode="beam")
     _add_to_pool(preds1, ns1)
     best = sorted(pool.values(), key=lambda x: x[0], reverse=True)[0]
     repair_modes, repair_round = ["beam", "diverse", "sample"], 0
@@ -363,16 +827,22 @@ def predict_neurosymbolic(
         mode = repair_modes[min(repair_round, len(repair_modes) - 1)]
         preds2, ns2 = _generate(repair_num_candidates, use_constraints=True, mode=mode)
         _add_to_pool(preds2, ns2)
-        best = sorted(pool.values(), key=lambda x: x[0], reverse=True)[0]
         repair_round += 1
     ranked_all = sorted(pool.values(), key=lambda x: x[0], reverse=True)
     return {
-        "inchi": inchi, "predicted_iupac": best[5], "neural_score": best[1], "symbolic_score": best[2],
-        "combined_score": best[0], "hard_fail": best[3], "reasons": best[4],
         "candidates": [{"text": r[5], "combined": r[0]} for r in ranked_all[:10]],
     }
 def load_model_from_hf(repo_id: str) -> MetanoModel:
     """
     Downloads and loads the METANO T5 model directly from the Hugging Face Hub.
@@ -380,13 +850,13 @@ def load_model_from_hf(repo_id: str) -> MetanoModel:
     print(f"Loading METANO model from Hugging Face Hub: {repo_id}")
     config = ModelConfig()
     classifier = MolecularClassifier()
     # Load the underlying T5 model weights and config from the Hub
     t5_model = T5ForConditionalGeneration.from_pretrained(repo_id)
     # Wrap it in the custom MetanoModel architecture
     model = MetanoModel(config, classifier, pretrained_t5=t5_model)
     model.to(device)
     print("Model successfully loaded to device:", device)
-    return model

 import os
 import re
+import math
 import unicodedata
 import numpy as np
 import torch
 # Define device globally for inference
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 @dataclass
 class ModelConfig:
     """Configuration for METANO Model"""
     model_name: str = "t5-small"
     max_input_length: int = 410
     max_output_length: int = 160
     metal_elements: List[str] = field(
         default_factory=lambda: [
+            "Li",
+            "Na",
+            "K",
+            "Rb",
+            "Cs",
+            "Be",
+            "Mg",
+            "Ca",
+            "Sr",
+            "Ba",
+            "Sc",
+            "Ti",
+            "V",
+            "Cr",
+            "Mn",
+            "Fe",
+            "Co",
+            "Ni",
+            "Cu",
+            "Zn",
+            "Y",
+            "Zr",
+            "Nb",
+            "Mo",
+            "Tc",
+            "Ru",
+            "Rh",
+            "Pd",
+            "Ag",
+            "Cd",
+            "Hf",
+            "Ta",
+            "W",
+            "Re",
+            "Os",
+            "Ir",
+            "Pt",
+            "Au",
+            "Hg",
+            "Al",
+            "Ga",
+            "In",
+            "Tl",
+            "Sn",
+            "Pb",
+            "Bi",
         ]
     )
 class MolecularClassifier:
+    """Rule-based molecular category classifier used to condition generation."""
     def __init__(self):
+        # Atomic-number groups used by classifier heuristics.
+        self.transition_metals = {
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            72,
+            73,
+            74,
+            75,
+            76,
+            77,
+            78,
+            79,
+            80,
+        }
+        self.main_group_metals = {
+            3,
+            4,
+            11,
+            12,
+            13,
+            19,
+            20,
+            31,
+            37,
+            38,
+            49,
+            50,
+            55,
+            56,
+            81,
+            82,
+            83,
+        }
         self.lanthanides = set(range(57, 72))
         self.actinides = set(range(89, 104))
+        self.all_metals = (
+            self.transition_metals
+            | self.main_group_metals
+            | self.lanthanides
+            | self.actinides
+        )
+        # SMARTS recovery patterns for common organometallic motifs that may not
+        # be captured by simple direct metal–carbon checks.
         self.organometallic_patterns = [
             "[Fe,Co,Ni,Cr,Mn,Mo,W,Ru,Os,Rh,Ir]-C=O",
             "[Fe,Co,Ni,Cr,Mn,Mo,W,Ru,Os,Rh,Ir]-[C-]#[O+]",
             "[Fe,Co,Ni,Ru,Rh,Os,Ir,Ti,V,Cr,Mn,Zr,Mo,W]~c1ccccc1",
         ]
+        # Compile once for faster repeated substructure checks during inference.
         self.compiled_organometallic = []
         for pattern in self.organometallic_patterns:
             try:
                 if mol is not None:
                     self.compiled_organometallic.append(mol)
             except Exception:
+                # Ignore malformed SMARTS entries instead of failing startup.
                 pass
     def classify_molecule(self, mol: Chem.Mol) -> Dict[str, any]:
+        """
+        Return coarse category metadata for prompt conditioning:
+        - classification (organic / inorganic / organometallic / coordination)
+        - has_metal (bool)
+        - primary_metal (first detected metal symbol, if any)
+        """
         try:
             has_carbon = self._has_element(mol, 6)
             has_metal = self._has_metals(mol)
             classification = self._classify_by_composition(mol, has_carbon, has_metal)
+            metal_info = (
+                self._extract_metals(mol)
+                if has_metal
+                else {
+                    "metal_atomic_nums": set(),
+                    "metal_symbols": [],
+                    "primary_metal": None,
+                }
+            )
+            return {
+                "classification": classification,
+                "has_metal": has_metal,
+                "primary_metal": metal_info["primary_metal"],
+            }
         except Exception as e:
+            # Keep inference robust if RDKit parsing/classification fails
             return {"classification": "error", "error": str(e)}
     def _has_element(self, mol: Chem.Mol, atomic_num: int) -> bool:
             if z in self.all_metals:
                 metal_atomic_nums.add(z)
                 metal_symbols.append(atom.GetSymbol())
         seen = set()
         metal_symbols = [m for m in metal_symbols if not (m in seen or seen.add(m))]
+        return {
+            "metal_atomic_nums": metal_atomic_nums,
+            "metal_symbols": metal_symbols,
+            "primary_metal": metal_symbols[0] if metal_symbols else None,
+        }
     def _has_metal_carbon_bond(self, mol: Chem.Mol) -> bool:
         for bond in mol.GetBonds():
+            a1_num, a2_num = (
+                bond.GetBeginAtom().GetAtomicNum(),
+                bond.GetEndAtom().GetAtomicNum(),
+            )
+            if (a1_num in self.all_metals and a2_num == 6) or (
+                a1_num == 6 and a2_num in self.all_metals
+            ):
                 return True
         return False
     def _recover_organometallic_by_smarts(self, mol: Chem.Mol) -> bool:
+        return any(
+            mol.HasSubstructMatch(pattern) for pattern in self.compiled_organometallic
+        )
     def _has_metal_heteroatom_bond(self, mol: Chem.Mol) -> bool:
         donor_atoms = {7, 8, 9, 15, 16, 17, 35, 53}
         for bond in mol.GetBonds():
+            z1, z2 = (
+                bond.GetBeginAtom().GetAtomicNum(),
+                bond.GetEndAtom().GetAtomicNum(),
+            )
+            if (z1 in self.all_metals and z2 in donor_atoms) or (
+                z2 in self.all_metals and z1 in donor_atoms
+            ):
                 return True
         return False
                         return True
         return False
+    def _classify_by_composition(
+        self, mol: Chem.Mol, has_carbon: bool, has_metal: bool
+    ) -> str:
         if has_metal and has_carbon:
+            if self._has_metal_carbon_bond(
+                mol
+            ) or self._recover_organometallic_by_smarts(mol):
                 return "organometallic"
             elif self._has_metal_heteroatom_bond(mol):
+                return (
+                    "inorganic"
+                    if self._is_simple_inorganic_salt(mol)
+                    else "coordination"
+                )
             return "inorganic"
         elif (has_metal and not has_carbon) or (not has_carbon and not has_metal):
             return "inorganic"
             return "organic"
         return "unclassified"
 class CharacterLevelChemicalTokenizer:
     def __init__(self, config):
         self.config = config
         self.metals = set(self.config.metal_elements)
+        self.control_tokens = [
+            "<ORGANIC>",
+            "<ORGANOMETALLIC>",
+            "<INORGANIC>",
+            "<COORDINATION>",
+            "<STANDARD_INCHI>",
+            "<RECONNECTED_INCHI>",
+        ]
+        self.structural_markers = ["<METAL>"] + [
+            f"<METAL_{metal.upper()}>" for metal in sorted(self.metals)
+        ]
         self.specials = ["<PAD>", "<UNK>", "<START>", "<END>"]
         base_chars = [
+            " ",
+            "-",
+            "=",
+            "#",
+            "+",
+            "(",
+            ")",
+            "[",
+            "]",
+            "{",
+            "}",
+            "/",
+            "\\",
+            ",",
+            ".",
+            ":",
+            ";",
+            "@",
+            "*",
+            "&",
+            "|",
+            "'",
+            '"',
+            *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+            *"0123456789",
+            "α",
+            "β",
+            "γ",
+            "δ",
+            "ε",
+            "ζ",
+            "η",
+            "θ",
+            "κ",
+            "λ",
+            "μ",
+            "ν",
+            "ξ",
+            "π",
+            "ρ",
+            "σ",
+            "τ",
+            "φ",
+            "χ",
+            "ψ",
+            "ω",
+            "Δ",
+            "Λ",
+            "⁰",
+            "¹",
+            "²",
+            "³",
+            "⁴",
+            "⁵",
+            "⁶",
+            "⁷",
+            "⁸",
+            "⁹",
+            "⁺",
+            "⁻",
+            "₀",
+            "₁",
+            "₂",
+            "₃",
+            "₄",
+            "₅",
+            "₆",
+            "₇",
+            "₈",
+            "₉",
         ]
         all_markers = self.control_tokens + self.structural_markers
         self.bos_token_id = self.token2idx["<START>"]
         self.eos_token_id = self.token2idx["<END>"]
+        self.marker_pattern = (
+            re.compile("|".join(map(re.escape, self.sorted_markers)))
+            if self.sorted_markers
+            else None
+        )
     def _normalize(self, text: str) -> str:
         """Canonicalise raw text before tokenization.

         ``None`` becomes the empty string. Anything else is converted with
         ``str()``, NFKC-normalized, has non-breaking spaces replaced by plain
         spaces, and has every run of whitespace collapsed to a single space
         with no leading or trailing whitespace.

         NOTE: NFKC is a compatibility folding, so characters such as
         superscript/subscript digits are rewritten to their plain forms.
         """
         if text is None:
             return ""
         folded = unicodedata.normalize("NFKC", str(text))
         folded = folded.replace("\u00a0", " ").strip()
         words = folded.split()
         return " ".join(words)
     def tokenize(self, text: str) -> List[str]:
         """Split text into marker tokens and single characters.

         The text is first passed through ``self._normalize``. Every match of
         ``self.marker_pattern`` is kept as one token; all remaining text is
         emitted one character per token, in order. With no pattern configured
         the result is simply the list of characters.

         Returns:
             The token list, empty when the normalized text is empty.
         """
         cleaned = self._normalize(text)
         pieces: List[str] = []
         if not cleaned:
             return pieces

         matcher = self.marker_pattern
         cursor = 0
         for hit in matcher.finditer(cleaned) if matcher else ():
             # Characters between the previous marker and this one.
             pieces.extend(cleaned[cursor : hit.start()])
             pieces.append(hit.group())
             cursor = hit.end()
         # Whatever follows the last marker (or the whole text if none matched).
         pieces.extend(cleaned[cursor:])
         return pieces
     def encode(self, text: str, max_length: int, is_target: bool = False) -> Dict:
             padded_ids = input_ids + [-100] * pad_len
         else:
             padded_ids = input_ids + [self.pad_token_id] * pad_len
         attention_mask = [1] * len(input_ids) + [0] * pad_len
         return {
             "input_ids": torch.tensor(padded_ids, dtype=torch.long),
             "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
         }
+    def decode(
+        self,
+        token_ids: Union[torch.Tensor, List[int]],
+        skip_special_tokens: bool = True,
+    ) -> str:
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.tolist()
         out_tokens = []
         for idx in token_ids:
+            if idx == self.eos_token_id or idx == -100:
+                break
+            if idx == self.pad_token_id:
+                continue
             tok = self.idx2token.get(idx, "<UNK>")
+            if skip_special_tokens and (
+                tok in self.specials
+                or tok in self.control_tokens
+                or tok in self.structural_markers
+            ):
                 continue
             out_tokens.append(tok)
         return "".join(out_tokens).strip()
     def get_vocab_size(self) -> int:
         """Return the size of the tokenizer vocabulary (``self.vocab_size``)."""
         size = self.vocab_size
         return size
+    def preprocess_inchi(
+        self,
+        inchi: str,
+        category: str,
+        has_metal: bool = False,
+        primary_metal: Optional[str] = None,
+    ) -> str:
+        if not inchi:
+            return ""
         control_prefix = []
         category_lower = category.lower() if category else "organic"
+        if "organometallic" in category_lower:
+            control_prefix.append("<ORGANOMETALLIC>")
+        elif "coordination" in category_lower:
+            control_prefix.append("<COORDINATION>")
+        elif "inorganic" in category_lower:
+            control_prefix.append("<INORGANIC>")
+        else:
+            control_prefix.append("<ORGANIC>")
+        control_prefix.append(
+            "<RECONNECTED_INCHI>" if "/r" in inchi else "<STANDARD_INCHI>"
+        )
         if has_metal:
+            metal_tok = (
+                f"<METAL_{primary_metal.upper()}>" if primary_metal else "<METAL>"
+            )
+            control_prefix.append(
+                metal_tok if metal_tok in self.token2idx else "<METAL>"
+            )
         return "".join(control_prefix) + inchi
     def preprocess_iupac(self, iupac: str) -> str:
         """Return the IUPAC name after the tokenizer's text normalization."""
         cleaned = self._normalize(iupac)
         return cleaned
 class MetanoModel(nn.Module):
     """Bundle a pretrained T5 network with the METANO classifier and tokenizer."""

     def __init__(
         self,
         config: ModelConfig,
         classifier: MolecularClassifier,
         pretrained_t5: T5ForConditionalGeneration,
     ):
         """Wrap an already-loaded T5 model.

         Args:
             config: Model configuration; also used to build the tokenizer.
             classifier: Molecule classifier kept on the instance for callers.
             pretrained_t5: T5 instance stored as ``self.model``; its config is
                 modified in place to set ``use_cache = True``.
         """
         super().__init__()
         self.config, self.classifier = config, classifier
         self.tokenizer = CharacterLevelChemicalTokenizer(config)
         # Same object as self.model below, so this flips the flag on the
         # wrapped network's own config.
         pretrained_t5.config.use_cache = True
         self.model = pretrained_t5
 @dataclass
 class SymbolicResult:
     """Outcome of one symbolic scoring pass over a candidate name."""

     # 1 - accumulated penalty, floored at 0.0.
     score: float
     # True when the candidate is empty or its brackets do not balance.
     hard_fail: bool
     # One human-readable entry per rule that fired.
     reasons: List[str]


 class SymbolicScorer:
     """Symbolic validator/penalizer for generated IUPAC candidates."""

     # Surface-form patterns, compiled once for all instances.
     _REPEATED_PUNCT_RE = re.compile(r"[,\.\-]{3,}")
     _BAD_HYPHEN_RE = re.compile(r"--|,-|-,")
     _DANGLING_LOCANT_RE = re.compile(r"\b\d+(,\d+)*-$")
     _REPEATED_WORD_RE = re.compile(r"\b(\w+)\s+\1\b")

     def __init__(self, metals: List[str]):
         """Store the lower-cased metal names and build the rule tables.

         Args:
             metals: Metal element names/symbols; kept lower-cased in
                 ``self.metals``.
         """
         self.metals = [m.lower() for m in metals]
         # Minimal lexical hints used by heuristic checks.
         self.VALID_SUFFIXES = [
             "ane",
             "ene",
             "yne",
             "ol",
             "one",
             "al",
             "amine",
             "amide",
             "acid",
             "ate",
             "ether",
             "ester",
             "thiol",
             "imine",
             "benzene",
         ]
         # Prefixes expected to be attached/hyphenated consistently in IUPAC-like text.
         self.MULTIPLICATIVE_PREFIXES = [
             "mono",
             "di",
             "tri",
             "tetra",
             "penta",
             "hexa",
             "hepta",
             "octa",
             "nona",
             "deca",
             "bis",
             "tris",
             "tetrakis",
             "pentakis",
             "hexakis",
         ]
         # Penalty table used to compute symbolic score in [0, 1] via 1 - total_penalty.
         self.PENALTY_WEIGHTS = {
             "Empty prediction": 1.0,
             "Unbalanced bracket": 1.0,
             "Double spaces": 0.1,
             "Repeated punctuation": 0.4,
             "Repeated comma": 0.3,
             "Invalid hyphen usage": 0.3,
             "Locant without substituent": 0.6,
             "Prediction too short": 0.5,
             "Prediction too long": 0.4,
             "Repeated token": 0.3,
             "Invalid spacing after multiplicative prefix": 0.2,
         }

     def _balanced(self, s: str) -> Tuple[bool, List[str]]:
         """Check (), [], {} bracket balance and nesting correctness.

         Returns:
             ``(True, [])`` when balanced, otherwise ``(False, [reason])``
             describing the first problem found.
         """
         pairs = {")": "(", "]": "[", "}": "{"}
         opens = set(pairs.values())
         stack: List[str] = []
         for ch in s:
             if ch in opens:
                 stack.append(ch)
             elif ch in pairs:
                 if not stack or stack[-1] != pairs[ch]:
                     return False, [
                         f"Unbalanced bracket: found '{ch}' without matching '{pairs[ch]}'"
                     ]
                 stack.pop()
         if stack:
             return False, [f"Unbalanced bracket: missing closers for {stack}"]
         return True, []

     def score(self, src: str, pred: str) -> SymbolicResult:
         """
         Score candidate text with rule-based penalties.

         hard_fail is set for structurally invalid outputs (empty or unbalanced).

         Args:
             src: Source sequence; accepted for interface symmetry and not
                 consulted by any current rule.
             pred: Candidate text. ``None`` is treated as an empty prediction
                 instead of raising.

         Returns:
             A ``SymbolicResult`` whose score is ``max(0, 1 - sum of penalties)``.
         """
         pred = "" if pred is None else str(pred).strip()
         lowered = pred.lower()
         reasons: List[str] = []

         if not pred:
             reasons.append("Empty prediction")

         # Structural check first (most important hard-fail signal).
         ok_balance, balance_reasons = self._balanced(pred)
         reasons.extend(balance_reasons)

         # Surface-form sanity checks.
         if "  " in pred:
             reasons.append("Double spaces")
         if self._REPEATED_PUNCT_RE.search(pred):
             reasons.append("Repeated punctuation")
         if ",," in pred:
             reasons.append("Repeated comma")
         if self._BAD_HYPHEN_RE.search(pred):
             reasons.append("Invalid hyphen usage")
         # A locant list followed by a hyphen at the very end names nothing.
         if self._DANGLING_LOCANT_RE.search(pred):
             reasons.append("Locant without substituent")

         # Length sanity.
         if len(pred) < 4:
             reasons.append("Prediction too short")
         if len(pred) > 200:
             reasons.append("Prediction too long")

         # The same word twice in a row.
         if self._REPEATED_WORD_RE.search(lowered):
             reasons.append("Repeated token")

         # Prefix spacing check for IUPAC-like style: one reason per prefix
         # that appears as a separate word followed by whitespace.
         for prefix in self.MULTIPLICATIVE_PREFIXES:
             if re.search(rf"\b{re.escape(prefix)}\s+", lowered):
                 reasons.append(
                     f"Invalid spacing after multiplicative prefix '{prefix}'"
                 )

         # Convert reason strings to a numeric penalty; reasons carry extra
         # detail after the table key, hence the substring match.
         total_penalty = sum(
             weight
             for reason in reasons
             for key, weight in self.PENALTY_WEIGHTS.items()
             if key in reason
         )

         hard_fail = (not ok_balance) or ("Empty prediction" in reasons)
         score = max(0.0, 1.0 - total_penalty)
         return SymbolicResult(score=score, hard_fail=hard_fail, reasons=reasons)
 class BalancedBracketsLogitsProcessor(LogitsProcessor):
     """Generation-time constraint: suppress unmatched closing brackets.

     For each of the pairs (), [] and {}, a closing bracket is forbidden at the
     next step whenever the sequence generated so far already contains at least
     as many closers as openers of that kind. Only counts are compared; nesting
     order between different bracket kinds is not enforced here.
     """

     _PAIRS = (("(", ")"), ("[", "]"), ("{", "}"))

     def __init__(self, tok2id: Dict[str, int]):
         """Record the ids of the bracket tokens.

         Args:
             tok2id: Token-to-id mapping of the tokenizer.
         """
         # Keep only bracket tokens present in tokenizer vocabulary.
         self.ids = {ch: tok2id[ch] for ch in "()[]{}" if ch in tok2id}

     def __call__(
         self, input_ids: torch.LongTensor, scores: torch.FloatTensor
     ) -> torch.FloatTensor:
         """Mask closing-bracket logits that would unbalance a sequence.

         Args:
             input_ids: Token ids generated so far, one row per sequence.
             scores: Next-token scores, one row per sequence; modified in place.

         Returns:
             ``scores`` with blocked closers set to ``-inf``.
         """
         if not self.ids:
             return scores

         for opener, closer in self._PAIRS:
             closer_id = self.ids.get(closer)
             if closer_id is None:
                 continue
             closed = (input_ids == closer_id).sum(dim=1)
             opener_id = self.ids.get(opener)
             if opener_id is None:
                 # No opener token exists, so nothing can ever be open.
                 opened = torch.zeros_like(closed)
             else:
                 opened = (input_ids == opener_id).sum(dim=1)
             # If closers already meet/exceed openers, block emitting more closers.
             blocked = closed >= opened
             scores[blocked, closer_id] = -float("inf")
         return scores
 @torch.no_grad()
 def predict_neurosymbolic(
     model: MetanoModel,
     inchi: str,
     scorer: SymbolicScorer,
     category: Optional[str] = None,
     has_metal: Optional[bool] = None,
     primary_metal: Optional[str] = None,
     num_candidates: int = 8,
     sym_lambda: float = 0.5,
     repair_num_candidates: int = 16,
     max_repair_rounds: int = 3,
 ):
     """
     Translate an InChI string into an IUPAC name with hybrid decoding.

     1) Generate candidates with the neural model (unconstrained beam search).
     2) Rescore every candidate with the symbolic heuristics of ``scorer``.
     3) While the best candidate hard-fails, run constrained repair rounds
        (beam, then diverse beam, then sampling) and re-rank the whole pool.

     Args:
         model: Wrapper providing ``model``, ``tokenizer``, ``classifier``
             and ``config``.
         inchi: Source InChI string.
         scorer: Symbolic scorer applied to every decoded candidate.
         category: Molecule class; derived with the classifier when ``None``.
         has_metal: Metal flag; derived with the classifier when ``None``.
         primary_metal: Metal symbol for the structural marker; only derived
             when classification runs and a metal is present.
         num_candidates: Beam width / number of sequences for the first pass.
         sym_lambda: Weight of the neural score in the combined score; the
             symbolic score gets ``1 - sym_lambda``.
         repair_num_candidates: Number of sequences per repair round.
         max_repair_rounds: Upper bound on repair rounds.

     Returns:
         A dict with the winning candidate ("predicted_iupac", its
         "neural_score", "symbolic_score", "combined_score", "hard_fail",
         "reasons"), the input "inchi", and up to ten ranked "candidates".
     """
     model.model.eval()

     # Derive molecule metadata when not provided by caller.
     if category is None or has_metal is None:
         mol = Chem.MolFromInchi(inchi)
         if mol is not None:
             classification = model.classifier.classify_molecule(mol)
             if category is None:
                 category = classification.get("classification", "organic")
             if has_metal is None:
                 has_metal = classification.get("has_metal", False)
             if primary_metal is None and has_metal:
                 primary_metal = classification.get("primary_metal")
         else:
             # Unparseable InChI: fall back to the most generic assumptions.
             if category is None:
                 category = "organic"
             if has_metal is None:
                 has_metal = False

     # Prepare source sequence with control and structural markers.
     category = category or "organic"
     src = model.tokenizer.preprocess_inchi(
         inchi, category=category, has_metal=has_metal, primary_metal=primary_metal
     )
     enc = model.tokenizer.encode(src, model.config.max_input_length)
     input_ids = enc["input_ids"].unsqueeze(0).to(device)
     attention_mask = enc["attention_mask"].unsqueeze(0).to(device)

     # Normalize prediction text for de-duplication across beam/sample outputs.
     def _dedup_key(s: str) -> str:
         if not s:
             return ""
         s = " ".join(s.strip().lower().split())
         return re.sub(r"\s*([(),\[\]{}\-+/=.:;·])\s*", r"\1", s)

     # HF requirement: num_beams must be divisible by num_beam_groups.
     def _choose_beam_groups(ncand: int, max_groups: int = 4) -> int:
         gmax = min(max_groups, ncand)
         for g in range(gmax, 1, -1):
             if ncand % g == 0:
                 return g
         return 1

     # Shared candidate generation helper for beam/diverse/sample modes.
     def _generate(ncand: int, use_constraints: bool, mode: str = "beam"):
         kwargs = dict(
             input_ids=input_ids,
             attention_mask=attention_mask,
             max_length=model.config.max_output_length,
             num_beams=ncand,
             num_return_sequences=ncand,
             early_stopping=True,
             pad_token_id=model.tokenizer.pad_token_id,
             eos_token_id=model.tokenizer.eos_token_id,
             return_dict_in_generate=True,
             output_scores=True,
         )
         # ---- Constraint tweaks ----
         if use_constraints:
             kwargs["logits_processor"] = [
                 BalancedBracketsLogitsProcessor(model.tokenizer.token2idx)
             ]
         # ---- Mode tweaks ----
         if mode == "diverse":
             g = _choose_beam_groups(ncand, max_groups=4)
             # Only enable diverse beams if we can form >1 groups
             if g > 1:
                 kwargs.update(
                     num_beam_groups=g,
                     diversity_penalty=0.2,
                 )
         elif mode == "sample":
             # Sampling fallback: get ncand independent samples
             kwargs.update(
                 do_sample=True,
                 top_p=0.92,
                 temperature=0.8,
                 num_beams=1,
                 num_return_sequences=ncand,
             )
         out = (
             model.model.generate(**kwargs)
             if device.type != "cuda"
             else torch.autocast(device_type="cuda", dtype=torch.float16)(
                 model.model.generate
             )(**kwargs)
         )
         preds = [
             model.tokenizer.decode(seq, skip_special_tokens=True)
             for seq in out.sequences
         ]
         seq_scores = getattr(out, "sequences_scores", None)
         if seq_scores is None:
             # No per-sequence log-score is available for this output. Use
             # -inf so exp() yields a neural score of 0.0; a fallback of 0
             # would become exp(0) = 1.0 and outrank every scored beam.
             neural_scores = np.full(len(preds), -np.inf)
         else:
             neural_scores = seq_scores.detach().float().cpu().numpy()
         return preds, neural_scores

     # Pool entry format: (combined, neural_prob_like, symbolic, hard_fail, reasons, text)
     pool = {}

     def _add_to_pool(preds, nscores):
         for p, ns in zip(preds, nscores):
             sym = scorer.score(src, p)
             ns = math.exp(ns)  # convert log-like beam score to positive scale
             combined = (sym_lambda * float(ns)) + ((1 - sym_lambda) * float(sym.score))
             key = _dedup_key(p)
             # Keep only the best-scoring variant of each de-duplicated text.
             if key not in pool or combined > pool[key][0]:
                 pool[key] = (
                     combined,
                     float(ns),
                     float(sym.score),
                     sym.hard_fail,
                     sym.reasons,
                     p,
                 )

     def _best_entry():
         # First-inserted entry wins ties, like a stable descending sort.
         return max(pool.values(), key=lambda entry: entry[0])

     # Initial unconstrained generation.
     preds1, ns1 = _generate(num_candidates, use_constraints=False, mode="beam")
     _add_to_pool(preds1, ns1)

     # If top result fails symbolic hard checks, run constrained repair rounds.
     best = _best_entry()
     repair_modes, repair_round = ["beam", "diverse", "sample"], 0
     while best[3] and repair_round < max_repair_rounds:
         # Rounds beyond the listed modes keep using the last one.
         mode = repair_modes[min(repair_round, len(repair_modes) - 1)]
         preds2, ns2 = _generate(repair_num_candidates, use_constraints=True, mode=mode)
         _add_to_pool(preds2, ns2)
         best = _best_entry()  # update best after adding repairs
         repair_round += 1

     ranked_all = sorted(pool.values(), key=lambda x: x[0], reverse=True)
     return {
         "inchi": inchi,
         "predicted_iupac": best[5],
         "neural_score": best[1],
         "symbolic_score": best[2],
         "combined_score": best[0],
         "hard_fail": best[3],
         "reasons": best[4],
         "candidates": [{"text": r[5], "combined": r[0]} for r in ranked_all[:10]],
     }
 def load_model_from_hf(repo_id: str) -> MetanoModel:
     """
     Downloads and loads the METANO T5 model directly from the Hugging Face Hub.

     Args:
         repo_id: Hub repository identifier passed to
             ``T5ForConditionalGeneration.from_pretrained``.

     Returns:
         A ``MetanoModel`` wrapping the downloaded T5 network, built with a
         default ``ModelConfig`` and ``MolecularClassifier`` and moved to the
         module-level ``device``.
     """
     print(f"Loading METANO model from Hugging Face Hub: {repo_id}")
     config = ModelConfig()
     classifier = MolecularClassifier()
     # Load the underlying T5 model weights and config from the Hub
     t5_model = T5ForConditionalGeneration.from_pretrained(repo_id)
     # Wrap it in the custom MetanoModel architecture
     model = MetanoModel(config, classifier, pretrained_t5=t5_model)
     model.to(device)
     print("Model successfully loaded to device:", device)
     return model

test_run.py CHANGED Viewed

@@ -11,7 +11,7 @@ def main():
     scorer = SymbolicScorer(metals=config.metal_elements)
     # A sample coordination/organometallic InChI from your notebook
-    test_inchi = "InChI=1/C15H16N2O3S.Na/c1-10-3-4-12(9-11(10)2)15(18)17-21(19,20)14-7-5-13(16)6-8-14;/h3-9H,16H2,1-2H3,(H,17,18);/q;+1"
     print(f"\nRunning prediction for InChI:\n{test_inchi}\n")
@@ -20,7 +20,6 @@ def main():
         inchi=test_inchi,
         scorer=scorer,
         num_candidates=5,
-        sym_lambda=1.0,
         repair_num_candidates=5,
         max_repair_rounds=1
     )
@@ -29,12 +28,14 @@ def main():
     print(f"Predicted IUPAC: {out['predicted_iupac']}")
     print(f"Hard Fail Triggered: {out['hard_fail']}")
     print(f"Combined Score: {out['combined_score']:.3f}")
     if out['reasons']:
         print(f"Penalty Reasons: {out['reasons']}")
     print("\nTop Candidates:")
-    for cand in out["candidates"][:3]:
         print(f"  [{cand['combined']:.3f}] {cand['text']}")
 if __name__ == "__main__":

     scorer = SymbolicScorer(metals=config.metal_elements)
     # A sample coordination/organometallic InChI from your notebook
+    test_inchi = "InChI=1/Fe.Na.H2O4S.H2O.H/c;;1-5(2,3)4;;/h;;(H2,1,2,3,4);1H2;/q;+1;;;-1"
     print(f"\nRunning prediction for InChI:\n{test_inchi}\n")
         inchi=test_inchi,
         scorer=scorer,
         num_candidates=5,
         repair_num_candidates=5,
         max_repair_rounds=1
     )
     print(f"Predicted IUPAC: {out['predicted_iupac']}")
     print(f"Hard Fail Triggered: {out['hard_fail']}")
     print(f"Combined Score: {out['combined_score']:.3f}")
+    print(f"Symbolic Score: {out['symbolic_score']:.3f}")
+    print(f"Neural Score: {out['neural_score']:.3f}")
     if out['reasons']:
         print(f"Penalty Reasons: {out['reasons']}")
     print("\nTop Candidates:")
+    for cand in out["candidates"][1:]:
         print(f"  [{cand['combined']:.3f}] {cand['text']}")
 if __name__ == "__main__":