diff --git "a/det_engine.py" "b/det_engine.py" new file mode 100644--- /dev/null +++ "b/det_engine.py" @@ -0,0 +1,5456 @@ +""" +Copyright (c) All Rights Reserved +by bowen +""" + +import json +import math +import os +import sys +import pathlib +from typing import Iterable, List +import random +import itertools + +import numpy as np +import pandas as pd +import tqdm +import torch +import torch.amp +from PIL import Image +# from src.data import CocoEvaluator +# from src.misc import (MetricLogger, SmoothedValue, reduce_dict) +# from src.solver.utils import output_to_smiles, output_to_smiles2 +# from src.solver.utils import bbox_to_graph_with_charge, mol_from_graph_with_chiral +# from src.misc.draw_box_utils import draw_objs + +# from sklearn.metrics import f1_score +# from src.postprocess.abbreviation_detector import get_ocr_recognition_only +# from src.postprocess.utils_dataset import CaptionRemover +from skimage.measure import label +######################################add metric postprocess +import rdkit +from rdkit import Chem +from rdkit.Chem import Draw, AllChem +from rdkit.Chem import rdchem, RWMol, CombineMols +from rdkit import Chem +from rdkit.Chem import rdFMCS +import copy +from paddleocr import PaddleOCR +import re +from rdkit import DataStructs +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle, Circle +from scipy.spatial import cKDTree, KDTree +from rdkit.Geometry import Point3D +import multiprocessing + + + +def select_longest_smiles(smiles): + # 将 SMILES 以 '.' 
分割为多个部分 + components = smiles.split('.') + # 选择字符数最多的部分作为主结构 + longest_component = max(components, key=len) + return longest_component + +def MCS_mol(mcs): + #mcs_smart = mcs.smartsString + mcs_mol = Chem.MolFromSmarts(mcs.smartsString) + AllChem.Compute2DCoords(mcs_mol) + return mcs_mol + +def g_atompair_matches(pair,mcs): + mcs_mol = MCS_mol(mcs) + matches0 = pair[0].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False) + matches1 = pair[1].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False) + if len(matches0) != len(matches1): + matches0=list(matches0) + matches1=list(matches1) + print( " g_atompair_matches noted: matcher not equal !!") + if len(matches0)>len(matches1) and len(matches1) !=0: + for i in range(0,len(matches0)): + if i < len(matches1): + pass + else: + ii=i % len(matches1) + matches1.append(matches1[ii]) + else: + for i in range(0,len(matches1)): + if i < len(matches0) and len(matches0): + pass + else: + ii=i % len(matches0) + matches0.append(matches0[ii]) + # assert len(matches0) == len(matches1), "matcher not equal break!!" 
+ if len(matches0) != len(matches1): + atommaping_pairs=[[]] + else:atommaping_pairs=[list(zip(matches0[i],matches1[i])) for i in range(0,len(matches0))] + return atommaping_pairs + + +class CustomError(Exception): + """A custom exception for specific errors.""" + pass + +bond_dirs = {'NONE': Chem.rdchem.BondDir.NONE, + 'ENDUPRIGHT': Chem.rdchem.BondDir.ENDUPRIGHT, + 'BEGINWEDGE': Chem.rdchem.BondDir.BEGINWEDGE, + 'BEGINDASH': Chem.rdchem.BondDir.BEGINDASH, + 'ENDDOWNRIGHT': Chem.rdchem.BondDir.ENDDOWNRIGHT,} + +BONDTYPE = {'SINGLE': Chem.rdchem.BondType.SINGLE, + 'DOUBLE': Chem.rdchem.BondType.DOUBLE, + 'TRIPLE': Chem.rdchem.BondType.TRIPLE, + 'AROMATIC': Chem.rdchem.BondType.AROMATIC} +BOND_DIRS = {'NONE': Chem.rdchem.BondDir.NONE, + 'ENDUPRIGHT': Chem.rdchem.BondDir.ENDUPRIGHT, + 'BEGINWEDGE': Chem.rdchem.BondDir.BEGINWEDGE, + 'BEGINDASH': Chem.rdchem.BondDir.BEGINDASH, + 'ENDDOWNRIGHT': Chem.rdchem.BondDir.ENDDOWNRIGHT,} +BONDDIRECT=['ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT'] + + +BONDTYPE2ORD={ + 'wdge':1, + 'dash':1, + Chem.rdchem.BondType.SINGLE: 1, + Chem.rdchem.BondType.DOUBLE: 2, + Chem.rdchem.BondType.TRIPLE: 3, + Chem.rdchem.BondType.AROMATIC: 1.5, + } + +BONDTYPE={'SINGLE': Chem.BondType.SINGLE, + 'DOUBLE': Chem.BondType.DOUBLE, + 'TRIPLE': Chem.BondType.TRIPLE, + 'AROMATIC': Chem.BondType.AROMATIC} + +VALENCES = { + "H": [1], "Li": [1], "Be": [2], "B": [3], "C": [4], "N": [3, 5], "O": [2], "F": [1], + "Na": [1], "Mg": [2], "Al": [3], "Si": [4], "P": [5, 3], "S": [6, 2, 4], "Cl": [1], "K": [1], "Ca": [2], + "Br": [1], "I": [1], "*":[3,4,5,6], +} + +ELEMENTS = [ + "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", + "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", + "Sc", "Ti", "Ru", "Rh","Rn","Rf", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", + "Ga", "Ge", "As", "Se", "Br", "Kr", "Sr", "Zr", + "Nb", "Mo", "Tc", "Pd", "Ag", "Cd", "In", "Sn", + "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", + "Pm", "Sm", "Eu", "Gd", 
"Tb", "Dy", "Ho", "Er", "Tm", "Yb", + "Lu", "Hf", "Ta", "W", "Os", "Ir", "Pt", "Au", "Hg", + "Tl", "Pb", "Bi", "Po", "At", "Fr", "Ac", "Th", + "Pa", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", + "Md", "No", "Lr", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", + "Cn", "Nh", "Fl", "Mc", "Lv", "Og" +] + # "Rg", "Rb", "Re", "Ra"as RGROUP in the Molscribe data + #"V", "Y","U", # be viewed as C for paddleOCR smt ONELEMENTS ['A','J] + #"Ts" #as a chemical group [S](C1=CC=C(C=C1)C)(=O)=O +RGROUP_SYMBOLS = ['R',"R'" 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', + 'Ra', 'Rb', 'Rc', 'Rd','Re','Rg', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar', + "V", "Y","U",'M', 'G','L', + 'Nr','Tt','Uu','Vv','Ww',#CLEF Nr is not in periodic table + 'D',#CLEF as [2H] but not recongited by rdkit chemdraw + ] + +COLORS = { + u'c': '0.0,0.75,0.75', u'b': '0.0,0.0,1.0', u'g': '0.0,0.5,0.0', u'y': '0.75,0.75,0', + u'k': '0.0,0.0,0.0', u'r': '1.0,0.0,0.0', u'm': '0.75,0,0.75' +} + +class Substitution(object): + '''Define common substitutions for chemical shorthand''' + def __init__(self, abbrvs, smarts, smiles, probability): + assert type(abbrvs) is list + self.abbrvs = abbrvs + self.smarts = smarts + self.smiles = smiles + self.probability = probability + +SUBSTITUTIONS: List[Substitution] = [ + #abbrvs, smarts, smiles + #patch4 USPTO,try put the longer one first, as re use match by order + Substitution(['CH2CH2NSO2CH3'], '[CH2][CH]', '[CH2]CNS(=O)(C)=O', 0.5), + Substitution(['NHNHCOCF3'], 'NHNHCOCF3', '[NH]NC(=O)C(F)(F)(F)', 0.5), + Substitution(['CO2CysPr'], 'CO2CysPr', '[C](=O)ON[C@H](C(CCC)=O)CS', 0.5), + Substitution(['OCH2CHOHCH2'], 'OCH2CHOHCH2', '[O]CC(O)C', 0.5), + Substitution(['OCH2CHOHCH2OH'], 'OCH2CHOHCH2', '[O]CC(O)CO', 0.5), + # elif symbol in ['SO2(CH2)3SO2NHCH2CHCH2OH']:smiles='[S](=O)(=O)CCCS(=O)(=O)NC[C]CO' + Substitution(['SO2(CH2)3SO2NHCH2CHCH2OH'], 'OCH2CHOHCH2', '[S](=O)(=O)CCCS(=O)(=O)NC[C]CO', 0.5), + + + + + Substitution(['NO2', 'O2N'], '[N+](=O)[O-]', 
"[N+](=O)[O-]", 0.5), + # Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5), + Substitution(['CO2Et', 'COOEt'], 'C(=O)[OH0;D2][CH2;D2][CH3]', "[C](=O)OCC", 0.5), + + Substitution(['OAc','AcO'], '[OH0;X2]C(=O)[CH3]', "[O]C(=O)C", 0.7), + Substitution(['NHAc'], '[NH1;D2]C(=O)[CH3]', "[NH]C(=O)C", 0.7), + Substitution(['Ac'], 'C(=O)[CH3]', "[C](=O)C", 0.1), + + Substitution(['OBz','BzO'], '[OH0;D2]C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[O]C(=O)c1ccccc1", 0.7), # Benzoyl + Substitution(['Bz'], 'C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[C](=O)c1ccccc1", 0.2), # Benzoyl + + Substitution(['COOBn','BnO2C'], '[OH0;D2][CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[C](=O)OCc1ccccc1", 0.7), # Benzyl + Substitution(['OBn','BnO'], '[OH0;D2][CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[O]Cc1ccccc1", 0.7), # Benzyl + Substitution(['Bn'], '[CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[CH2]c1ccccc1", 0.2), # Benzyl + Substitution(['NHBn'], '[NH]Cc1ccccc1', "[NH]Cc1ccccc1", 0.2), # Benzyl + Substitution(['NBn2'], '[NH]Cc1ccccc1', "[N](Cc1ccccc1)Cc1ccccc1", 0.2), # Benzyl + + Substitution(['NHBoc','BocHN',"BOCHN"], '[NH1;D2]C(=O)OC([CH3])([CH3])[CH3]', "[NH]C(=O)OC(C)(C)C", 0.6), + Substitution(['NBoc'], '[NH0;D3]C(=O)OC([CH3])([CH3])[CH3]', "[NH1]C(=O)OC(C)(C)C", 0.6), + Substitution(['Boc','BOc'], 'C(=O)OC([CH3])([CH3])[CH3]', "[C](=O)OC(C)(C)C", 0.2), + + Substitution(['Cbm'], 'C(=O)[NH2;D1]', "[C](=O)N", 0.2), + Substitution(['Cbz'], 'C(=O)OC[cH]1[cH][cH][cH1][cH][cH]1', "[C](=O)OCc1ccccc1", 0.4), + Substitution(['NHCbz'], 'C(=O)OC[cH]1[cH][cH][cH1][cH][cH]1', "[NH]C(=O)OCc1ccccc1", 0.4), + Substitution(['Cy'], '[CH1;X3]1[CH2][CH2][CH2][CH2][CH2]1', "[CH1]1CCCCC1", 0.3), + Substitution(['Fmoc'], 'C(=O)O[CH2][CH1]1c([cH1][cH1][cH1][cH1]2)c2c3c1[cH1][cH1][cH1][cH1]3', + "[C](=O)OCC1c(cccc2)c2c3c1cccc3", 0.6), + Substitution(['FmocHN','FmOcHN', 'NHFmoc'], 'C(=O)O[CH2][CH1]1c([cH1][cH1][cH1][cH1]2)c2c3c1[cH1][cH1][cH1][cH1]3', + "[NH]C(=O)OCC1c(cccc2)c2c3c1cccc3", 0.6), + Substitution(['Mes'], 
'[cH0]1c([CH3])cc([CH3])cc([CH3])1', "[c]1c(C)cc(C)cc(C)1", 0.5), + Substitution(['OMs','MsO'], '[OH0;D2]S(=O)(=O)[CH3]', "[O]S(=O)(=O)C", 0.7), + Substitution(['Ms'], 'S(=O)(=O)[CH3]', "[S](=O)(=O)C", 0.2), + Substitution(['Ph'], '[cH0]1[cH][cH][cH1][cH][cH]1', "[c]1ccccc1", 0.5), + + Substitution(['PMB'], '[CH2;D2][cH0]1[cH1][cH1][cH0](O[CH3])[cH1][cH1]1', "[CH2]c1ccc(OC)cc1", 0.2), + Substitution(['PMBN'], '[CH2;D2][cH0]1[cH1][cH1][cH0](O[CH3])[cH1][cH1]1', "[N]Cc1ccc(OC)cc1", 0.2), + Substitution(['Py'], '[cH0]1[n;+0][cH1][cH1][cH1][cH1]1', "[c]1ncccc1", 0.1), + # Substitution(['SEM','MES'], '[CH2;D2][CH2][Si]([CH3])([CH3])[CH3]', "[CH2]CSi(C)(C)C", 0.2), + Substitution(['SEM','MES'], '[CH2;D2][O][CH2][CH2][Si]([CH3])([CH3])[CH3]', "[CH2]OCC[Si](C)(C)C", 0.2),#fix above + + Substitution(['Suc'], 'C(=O)[CH2][CH2]C(=O)[OH]', "[C](=O)CCC(=O)O", 0.2), + Substitution(['TBS'], '[Si]([CH3])([CH3])C([CH3])([CH3])[CH3]', "[Si](C)(C)C(C)(C)C", 0.5), + Substitution(['TBZ'], 'C(=S)[cH]1[cH][cH][cH1][cH][cH]1', "[C](=S)c1ccccc1", 0.2), + Substitution(['OTf'], '[OH0;D2]S(=O)(=O)C(F)(F)F', "[O]S(=O)(=O)C(F)(F)F", 0.7), + Substitution(['Tf'], 'S(=O)(=O)C(F)(F)F', "[S](=O)(=O)C(F)(F)F", 0.2), + Substitution(['TFA'], 'C(=O)C(F)(F)F', "[C](=O)C(F)(F)F", 0.3), + Substitution(['TFAH2N'], 'C(=O)C(F)(F)F', "[NH]C(=O)C(F)(F)F", 0.3), + Substitution(['TMS'], '[Si]([CH3])([CH3])[CH3]', "[Si](C)(C)C", 0.5), + Substitution(['Ts'], 'S(=O)(=O)c1[cH1][cH1][cH0]([CH3])[cH1][cH1]1', "[S](=O)(=O)c1ccc(C)cc1", 0.6), # Ts + Substitution(['TsO','OTs'], '[O]S(C1=CC=C(C=C1)C)(=O)=O', "[O]S(C1=CC=C(C=C1)C)(=O)=O", 0.6), # Ts + + Substitution(['COCH3'], '[OH0;D2][CH3;D1]', "[C](=O)C", 0.3), + # Alkyl chains + Substitution(['OMe', 'MeO','H;CO', 'CH3O','OCH3', 'H3CO'], '[OH0;D2][CH3;D1]', "[O]C", 0.3), + Substitution(['SMe', 'MeS'], '[SH0;D2][CH3;D1]', "[S]C", 0.3), + Substitution(['NMe', 'MeN'], '[N;X3][CH3;D1]', "[N]C", 0.3),#modified as [NH]not wanted + Substitution(['NMe2', 'Me2N'], 
'[N;X3](C)[CH3;D1]', "[N](C)C", 0.3),#modified as [NH]not wanted + + Substitution(['Me'], '[CH3;D1]', "[CH3]", 0.1), + Substitution(['OEt', 'EtO','C2H5O','OC2H5'], '[OH0;D2][CH2;D2][CH3]', "[O]CC", 0.5), + Substitution(['MeOH2C','CH2OMe'], '[CH2;D2]O[CH3]', "[CH2]OC", 0.5), + Substitution(['Et', 'CH2CH3','CH3CH2'], '[CH2;D2][CH3]', "[CH2]C", 0.3), + + + Substitution(['Pr', 'nPr', 'n-Pr'], '[CH2;D2][CH2;D2][CH3]', "[CH2]CC", 0.3), + Substitution(['Bu', 'nBu', 'n-Bu'], '[CH2;D2][CH2;D2][CH2;D2][CH3]', "[CH2]CCC", 0.3), + # Substitution(['nBu', 'n-Bu'], '[CH2;D2][CH2;D2][CH2;D2][CH3]', "[CH2]CCC", 0.3), + + # Branched + Substitution(['iPr', 'i-Pr'], '[CH1;D3]([CH3])[CH3]', "[CH1](C)C", 0.2), + Substitution(['iBu', 'i-Bu'], '[CH2;D2][CH1;D3]([CH3])[CH3]', "[CH2]C(C)C", 0.2), + Substitution(['OiBu'], '[OH0;D2][CH2;D2][CH1;D3]([CH3])[CH3]', "[O]CC(C)C", 0.2), + Substitution(['OtBu','tBuO'], '[OH0;D2][CH0]([CH3])([CH3])[CH3]', "[O]C(C)(C)C", 0.6), + Substitution(['tBu', 't-Bu'], '[CH0]([CH3])([CH3])[CH3]', "[C](C)(C)C", 0.3), + + # Other shorthands (MIGHT NOT WANT ALL OF THESE) + Substitution(['CF3', 'F3C'], '[CH0;D4](F)(F)F', "[C](F)(F)F", 0.5), + Substitution(['NCF3', 'F3CN'], '[N;X3][CH0;D4](F)(F)F', "[NH]C(F)(F)F", 0.5), + Substitution(['OCF3', 'F3CO'], '[OH0;X2][CH0;D4](F)(F)F', "[O]C(F)(F)F", 0.5), + Substitution(['OCCl3', 'Cl3CO'], '[OH0;X2][CH0;D4](Cl)(Cl)Cl', "[O]C(Cl)(Cl)Cl", 0.5), + Substitution(['SCF3', 'F3CS'], '[SH0;X2][CH0;D4](F)(F)F', "[S]C(F)(F)F", 0.5), + Substitution(['CCl3'], '[CH0;D4](Cl)(Cl)Cl', "[C](Cl)(Cl)Cl", 0.5), + Substitution(['CO2H', 'HO2C', 'COOH'], 'C(=O)[OH]', "[C](=O)O", 0.5), # COOH + Substitution(['CO2NH4','COONH4','H4NOOC','H4NO2C'], 'C(=O)[OH]', "[C](=O)ON", 0.5), # COOH + Substitution([ 'COO-','CO2-'], 'C(=O)[OH]', "[C](=O)[O-]", 0.5), # COOH + # Substitution([ 'COO'], 'C(=O)[OH]', "[C](=O)O", 0.5), # COOH + Substitution(['CN', 'NC'], 'C#[ND1]', "[C]#N", 0.5), + # Substitution(['OCH3', 'H3CO'], '[OH0;D2][CH3]', "[O]C", 0.4), + #TODO 
if need just addit here + Substitution(['N3'], '[N]=[N+]=[N-]', "[N]=[N+]=[N-]", 0.4),#ACS image dataset has + # [N-]=[N+] + Substitution(['N2+Cl-','Cl-N2+'], '[N+]#[N].[Cl-]', "[N+]#[N].[Cl-]", 0.4),#ACS image dataset has + Substitution(['N2'], '[N]=[N-]', "[N]=[N-]", 0.4),#ACS image dataset has + Substitution(['N2H'], '[N]=[N-]', "[N]=[NH]", 0.4),#ACS image dataset has + Substitution(['NO','N=O','O=N','ON'], '[N]=[O]', "[N]=O", 0.4),#ACS image dataset has + Substitution(['NCH3'], '[N]C', "[NH]C", 0.4),#ACS image dataset has + Substitution(['NOMe'], '[N]OC', "[N]OC", 0.4),#ACS image dataset has + Substitution(['OCH2'], '[O]C', "[O]C", 0.4),#FORMULA_REGEX + Substitution(['C=O','O=C'], '[C]=[O]', "[C]=O", 0.4),#ACS image dataset has + Substitution(['NPh','PhN'], 'NC1=CC=CC=C1', "[N]C1=CC=CC=C1", 0.4),#ACS image dataset has + Substitution(['NHPh','PhNH','PhHN'], 'NC1=CC=CC=C1', "[NH]C1=CC=CC=C1", 0.4),#ACS image dataset has + Substitution(['TMSO','OSMT'], 'O[Si](C)(C)C', "[O][Si](C)(C)C", 0.5), + Substitution(['SPh','PhS'], 'SC1=CC=CC=C1', "[S]C1=CC=CC=C1", 0.4),#ACS image dataset has + Substitution(['SO3H'], 'S(=O)(=O)[OH]', "[S](=O)(=O)O", 0.4), + Substitution(['SO3NH2','SO3NH4','H4NO3S'], 'S(=O)(=O)[OH]', "[S](=O)(=O)ON", 0.4), + Substitution(['SO3'], 'S(=O)(=O)[OH]', "[S](=O)(=O)[O-]", 0.4), + Substitution(['SO2CF3'], '[S](=O)(=O)C(F)(F)F', "[S](=O)(=O)C(F)(F)F", 0.5), + Substitution(['SO2Cl'], '[S](=O)(=O)Cl', "[S](=O)(=O)Cl", 0.5), + Substitution(['SO2F'], '[S](=O)(=O)F', "[S](=O)(=O)F", 0.5), + Substitution(['SO2'], '[S](=O)(=O)', "[S](=O)(=O)", 0.5), + Substitution(['SO2NH'], '[S](=O)(=O)[N]', "[S](=O)(=O)[N]", 0.5),#US07323045-20080129-C00062 may lead wrong connext + Substitution(['SO2NH2'], '[S](=O)(=O)[NH2]', "[S](=O)(=O)[NH2]", 0.5), + Substitution(['SO2Me','SO2CH3'], '[S](=O)(=O)C', "[S](=O)(=O)C", 0.5), + Substitution(['NHO2S'], '[S](=O)(=O)[N]', "[N][S](=O)(=O)", 0.5),#US07323045-20080129-C00062 may lead wrong connext + Substitution(['OSO2Me'], 
'[O]S(=O)(=O)C', "[O]S(=O)(=O)C", 0.5), + Substitution(['NHSO2Me'], '[NH]S(=O)(=O)C', "[NH]S(=O)(=O)C", 0.5), + Substitution(['SOCH3','SOMe'], '[S](=O)(=O)', "[S](=O)C", 0.5), + + Substitution(['P+Ph3Br-'], '[P+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', "[P+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3", 0.5), + Substitution(['N+Ph3Br-'], '[N+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', "[N+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3", 0.5), + Substitution(['PPh2'], "[P](C1=CC=CC=C1)C2=CC=CC=C2", "[P](C1=CC=CC=C1)C2=CC=CC=C2", 0.5), + # Substitution(['BOcHN',"BOCHN"], "[NH]C(OC(C)(C)C)=O", "[NH]C(OC(C)(C)C)=O", 0.5), + Substitution(['CO2Me', 'COOMe'], 'C(=O)[OH0;D2][CH3]', "[C](=O)OC", 0.5), + Substitution(['ONa', 'NaO'], '[O][Na]', "[O][Na]", 0.5), + Substitution(['OTBDMS', 'TBDMSO'], "[O][Si](C)(C)C(C)(C)C", "[O][Si](C)(C)C(C)(C)C", 0.5), + Substitution(['CONH2'], '[C](O)(N)', "[C](=O)[NH2]", 0.5), + Substitution(['NHNH2'], '[NH2;D1]', "[NH]N", 0.1), + Substitution(['CONH'], 'CONH', '[C](=O)N', 0.5), + Substitution(['CH3CONH'], '[NH]C(=O)C', '[NH]C(=O)C', 0.5), + Substitution(['NH3Cl'], '[NH]Cl', '[NH]Cl', 0.5), + + Substitution(['SAc','AcS'], '[S]C(C)=O', "[S]C(C)=O", 0.5), + Substitution(['OAll'], '[O]CC=C', '[O]CC=C', 0.5), + # Substitution(['Tos'], '[Si](C)(C)C', '[Si](C)(C)C', 0.5),#NOTE different case ?? @@acs dataset ,we use the SO2here + Substitution(['Tos','TOs'], '[Si](C)(C)C', '[S](=O)(=O)C(C=C1)=CC=C1C', 0.5),#NOTE different case ?? + Substitution(['OTos','OTOs','soTO'], '[Si](C)(C)C', '[O]S(=O)(=O)C(C=C1)=CC=C1C', 0.5),#NOTE different case ?? 
+ Substitution(['TsN'], '[N]S(C1=CC=C(C=C1)C)(=O)=O', '[N]S(C1=CC=C(C=C1)C)(=O)=O', 0.5), + Substitution(['Ts'], '[S](C1=CC=C(C=C1)C)(=O)=O', '[S](C1=CC=C(C=C1)C)(=O)=O', 0.5), + Substitution(['COCF3'], '[C](=O)C(F)(F)(F)', '[C](=O)C(F)(F)(F)', 0.5), + Substitution(['CF2', 'F2C'], '[C;D4](F)(F)', "[C](F)(F)", 0.5), + Substitution(['PMB'], '[CH2]C1=CC=C(C=C1)OC', '[CH2]C1=CC=C(C=C1)OC', 0.5), + Substitution(['NHCOtBu'], '[NH]C(C(C)(C)C)=O','[NH]C(C(C)(C)C)=O', 0.5), + Substitution(['OCN'], '[N]=C=O', "[N]=C=O", 0.5), + Substitution(['Me3Si'], '[Si](C)(C)(C)', "[Si](C)(C)(C)", 0.5), + Substitution(['PhO','OPh'], '[O]C1=CC=CC=C1', "[O]C1=CC=CC=C1", 0.5), + Substitution(['Allyl'], '[CH2]C=C', '[CH2]C=C', 0.5), + Substitution(['C7H3'], '[C]#CC#CC#CC', '[C]#CC#CC#CC', 0.5), + Substitution(['C5H11'], '[CH2]CCCC', '[CH2]CCCC', 0.5), + Substitution(['R1R2N'], "[N]([*])[*]", "[N]([*])[*]", 0.5), + Substitution(['CO2R'], '[C](=O)O*', '[C](=O)O*', 0.5), + Substitution(['CCl3CH2O2C'], '[C](=O)OCC(Cl)(Cl)Cl', '[C](=O)OCC(Cl)(Cl)Cl', 0.5), + Substitution(['NHOH'], '[NH]O', '[NH]O', 0.5), + Substitution(['CO2'], '[C](=O)[O]', '[C](=O)[O]', 0.5), + Substitution(['O2C'], '[C](=O)[O]', '[O][C](=O)', 0.5),#NOTE the direction matters + + Substitution(['PPh3'], '[P](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', '[P](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', 0.5), + Substitution(['TfO'], '[C](=O)[O]', '[O]S(=O)(C(F)(F)F)=O', 0.5), + Substitution(['OCH2Ph'], '[O]CC1=CC=CC=C1', '[O]CC1=CC=CC=C1', 0.5), + Substitution(['OCH2CF3'], '[O]CC(F)(F)(F)', '[O]CC(F)(F)(F)', 0.5), + Substitution(['COOCH2Ph'], '[C](=O)OCC1=CC=CC=C1', '[C](=O)OCC1=CC=CC=C1', 0.5), + Substitution(['OCH2OC2H5'], '[C](=O)C(C)(C)C', '[O]COCC', 0.5), + + Substitution(['Trt'], '[C](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', '[C](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', 0.5), + Substitution(['SF5'], '[S](F)(F)(F)(F)F', '[S](F)(F)(F)(F)F', 0.5), + + # Substitution(['CH2CH'], '[CH2][CH]', '[CH2][CH]', 0.5), + # Substitution(['CH2CH2'], 
'[CH2][CH2]', '[CH2][CH2]', 0.5), + + # #SIMPLE abbv + Substitution(['S*'], '[S]*', '[S]*', 0.5), + Substitution(['N*, NH*'], '[NH]*', '[NH]*', 0.5), + Substitution(['C*','CH2*'], '[C]*', '[CH2]*', 0.5), + Substitution(['P*',"PH*"], '[P]*', '[PH]*', 0.5), + Substitution(['O*'], '[O]*', '[O]*', 0.5), + #() effect + Substitution(['N(CH3)2'], '[N](C)(C)', "[N](C)(C)", 0.5), + Substitution(['(C2H5)2N','Et2N'], '[N](C)(C)', "[N](CC)(CC)", 0.5), + Substitution(['B(OH)2'], '[B](O)O', "[B](O)O", 0.5), + Substitution(['CO2C(CH3)3'], '[C](=O)C(C)(C)C', '[C](=O)C(C)(C)C', 0.5), + Substitution(['P(O)(OEt)2', 'P(OEt)2(O)'], "[P](OCC)(=O)CCO", "[P](OCC)(=O)OCC", 0.5), + Substitution(['(CH2)16Me'], '[CH2]CCCCCCCCCCCCCCCC', "[CH2]CCCCCCCCCCCCCCCC", 0.3), + Substitution(['(CH2)11Me'], '[CH2]CCCCCCCCCCC', "[CH2]CCCCCCCCCCC", 0.3), + Substitution(['N(H)Et','Et(H)N'], '[NH]CC', '[NH]CC', 0.5), + Substitution(['N(H)Me','Me(H)N'], '[NH]C', '[NH]C', 0.5), + + + +] +ABBREVIATIONS = {abbrv: sub for sub in SUBSTITUTIONS for abbrv in sub.abbrvs} + + +def extract_abbreviation_key(item): + if isinstance(item, list): + while isinstance(item, list): + item = item[0] + return item + return item + + +def clean_unpaired_brackets(text): + #keep paired, del unpared + result = [] + stack = [] + bracket_pairs = {')': '(', ']': '['} + opening_brackets = {'(', '['} + + for char in text: + if char in opening_brackets: + stack.append(char) + result.append(char) + elif char in bracket_pairs: + if stack and stack[-1] == bracket_pairs[char]: + stack.pop() + result.append(char) + else: + # 未配对的闭合括号,跳过 + continue + else: + result.append(char) + return ''.join(result) + +# def del_unpairebrackets(opening_brackets): +# # 移除未配对的开括号 +# keep paired, del unpared +# result = [] +# stack = [] +# bracket_pairs = {')': '(', ']': '['} +# opening_brackets = {'(', '['} +# for char in result: +# if char in opening_brackets: +# stack.append(char) +# elif char in bracket_pairs: +# if stack and stack[-1] == bracket_pairs[char]: 
#             stack.pop()
#             final_result.append(char)
#         else:
#             continue
#     else:
#         final_result.append(char)
#     # if any opening brackets remain unclosed, remove them as well
#     # return ''.join(c for c in final_result if not stack or c not in opening_brackets)


def replace_c1(text):
    """Fix the common OCR confusion of 'Cl' being read as 'C1'.

    The negative lookahead keeps genuine ring-closure digits ('C12') intact.
    """
    return re.sub(r'C1(?!\d)', 'Cl', text)


def transform_formula(formula):
    """Rewrite 'Hg' that is really a mis-OCRed hydrogen count.

    'C<n>...Hg...' is interpreted as CnH(2n+1), e.g. 'C3HgO2' -> 'C3H7O2'.
    Returns the input unchanged when the pattern does not match.
    """
    match = re.match(r'C(\d+)(.*?)Hg(.*)', formula)
    if not match:
        return formula

    n = int(match.group(1))
    prefix = match.group(2)   # anything between the carbon count and 'Hg'
    suffix = match.group(3)   # anything after 'Hg' (e.g. 'O2')
    g_new = n * 2 + 1
    return f"C{n}{prefix}H{g_new}{suffix}"


def Cg_transform_formula(formula):
    """Rewrite a leading 'CgH<n>' as 'C<(n-1)//2>H<n>', e.g. 'CgH11O2' -> 'C5H11O2'.

    Returns the input unchanged when the pattern does not match.
    """
    # BUG FIX: the original pattern ended in a non-greedy '(.*?)', which at
    # the end of a pattern always matches the empty string, silently dropping
    # any trailing atoms (e.g. the 'O2' of 'CgH11O2' -> 'C5H11').
    match = re.match(r'CgH(\d+)(.*)', formula)
    if not match:
        return formula

    n = int(match.group(1))
    suffix = match.group(2)   # anything after the hydrogen count (e.g. 'O2')
    g_new = (n - 1) // 2
    return f"C{g_new}H{n}{suffix}"


def normalize_ocr_text(text, replacement_map):
    """Normalize OCR text using the predefined mapping rules"""
    if 'C1' in text:
        text = replace_c1(text)
    if 'Hg' in text:
        text = transform_formula(text)
    if 'Cg' in text:
        text = Cg_transform_formula(text)
    if 'Q' in text:
        # a 'Q' followed by an upper-case letter is treated as a mis-read 'O'
        pattern = r'Q([A-Z])(\w+)'
        replacement = r'O\1\2'
        text = re.sub(pattern, replacement, text)
    if text in ELEMENTS:
        return text
    # remove spaces
    if ' ' in text:
        text = text.replace(" ", "")
    if any(c in text for c in '0oO'):
        # NOTE(review): the remainder of this function is truncated in this
        # rendering of the source (a re.sub pattern is cut off mid-string);
        # the visible intent is o/O <-> 0 disambiguation.  TODO restore from
        # the full file.
        pass
    return text
# NOTE(review): the `def` line of this parser -- along with the FORMULA_REGEX
# and _parse_tokens definitions that precede it -- was lost in this rendering
# of the source.  The surviving docstring example and body are reproduced
# below under the presumed original name; confirm against the full file.
def _parse_formula(formula):
    """
    Parse a condensed formula into (element, count) pairs, e.g.
    'C2H4O' -> [('C', 2), ('H', 4), ('O', 1)]
    """
    tokens = FORMULA_REGEX.findall(formula)
    # if ''.join(tokens) != formula:
    #     tokens = FORMULA_REGEX_BACKUP.findall(formula)
    return _parse_tokens(tokens)


def _expand_carbon(elements: list):
    """
    Given list of pairs `(elt, num)`, output single list of all atoms in order,
    expanding carbon sequences (CaXb where a > 1 and X is halogen) if necessary
    Example: [('C', 2), ('H', 4), ('O', 1)] -> ['C', 'H', 'H', 'C', 'H', 'H', 'O']
    Returns False (not []) when the expansion is empty.
    """
    expanded = []
    idx = 0
    while idx < len(elements):
        elt, num = elements[idx]
        # skip unreasonable number of atoms
        if num > 100000:
            idx += 1
            continue
        if elt == 'C' and num > 1 and idx + 1 < len(elements):
            # carbon run: distribute the next element's count evenly over the
            # carbons; any remainder is appended after the run
            next_elt, next_num = elements[idx + 1]
            if next_num > 100000:
                idx += 1
                continue
            quotient, remainder = next_num // num, next_num % num
            for _ in range(num):
                expanded.append('C')
                expanded.extend(next_elt for _ in range(quotient))
            expanded.extend(next_elt for _ in range(remainder))
            idx += 2
        elif isinstance(elt, list):
            # nested formula: recurse and repeat the expanded sub-list
            nested = _expand_carbon(elt)
            expanded.extend(nested for _ in range(num))
            idx += 1
        else:
            # simplest case: append `elt` `num` times
            expanded.extend(elt for _ in range(num))
            idx += 1
    if expanded == []:
        return False
    return expanded


def replace_bracket(match):
    """re.sub callback deciding whether a [...] group keeps its brackets.

    Keeps brackets for charges/counts and bare [H]; strips brackets (and the
    H) from multi-character hydrogen-bearing symbols; otherwise just unwraps.
    """
    content = match.group(1)
    if re.search(r'\d|\+|-', content):
        # contains a digit or a charge sign: keep the whole [content]
        return f'[{content}]'
    if content == 'H':
        return '[H]'
    if len(content) >= 2 and 'H' in content:
        # multi-char symbol containing H: drop brackets and the H
        return ''.join(ch for ch in content if ch != 'H')
    return content
    # usage: re.sub(r'\[([^\[\]]+)\]', replace_bracket, smi)


def formula_regex(abbrev):
    """MolScribe-style tokenize-and-expand for condensed abbreviations."""
    tokens = FORMULA_REGEX.findall(abbrev)
    abbrev_exp = _expand_carbon(_parse_tokens(tokens))
    if abbrev_exp == []:
        return False
    return abbrev_exp


def _expand_abbreviationMS(abbrev):
    """
    Expand abbreviation into its SMILES; also converts [Rn] to [n*]
    Used in `_condensed_formula_list_to_smiles` when encountering abbrev. in condensed formula
    """
    if abbrev in ABBREVIATIONS:
        return ABBREVIATIONS[abbrev].smiles
    if abbrev in RGROUP_SYMBOLS or (abbrev[0] in RGROUP_SYMBOLS and abbrev[1:].isdigit()):
        if abbrev[1:].isdigit():
            return f'[{abbrev[1:]}*]'
        return '*'
    return f'[{abbrev}]'


def _get_bond_symb(bond_num):
    """Map a bond order (0-3) to its SMILES symbol ('.', '', '=', '#')."""
    symbols = {0: '.', 1: '', 2: '=', 3: '#'}
    if bond_num in symbols:
        return symbols[bond_num]
    print(f"check this val {bond_num} !!!")
    return ''


def _condensed_formula_list_to_smiles(formula_list, start_bond, end_bond=None, direction=None):
    """
    Converts condensed formula (in the form of a list of symbols) to smiles
    Input:
        `formula_list`: e.g. ['C', 'H', 'H', 'N', ['C', 'H', 'H', 'H'], ['C', 'H', 'H', 'H']] for CH2N(CH3)2
        `start_bond`: # bonds attached to beginning of formula
        `end_bond`: # bonds attached to end of formula (deduce automatically if None)
        `direction` (1, -1, or None): direction in which to process the list
            (1: left to right; -1: right to left; None: deduce automatically)
    Returns:
        `smiles`: smiles corresponding to input condensed formula
        `bonds_left`: bonds remaining at the end of the formula
        `num_trials`: number of trials
        `success` (bool): whether conversion was successful
    """
    if direction is None:
        # try left-to-right first, then right-to-left
        num_trials = 1
        for dir_choice in [1, -1]:
            smiles, bonds_left, trials, success = _condensed_formula_list_to_smiles(
                formula_list, start_bond, end_bond, dir_choice)
            num_trials += trials
            if success:
                return smiles, bonds_left, num_trials, success
        return None, None, num_trials, False
    assert direction == 1 or direction == -1

    def dfs(smiles, bonds_left, cur_idx, add_idx):
        """
        `smiles`: SMILES string so far
        `cur_idx`: index (in `formula_list`) of current atom
        `bonds_left`: bonds remaining on current atom
        `add_idx`: index (in `formula_list`) of atom to be attached next
        """
        num_trials = 1
        # end of formula: return result
        if (direction == 1 and add_idx == len(formula_list)) or (direction == -1 and add_idx == -1):
            if end_bond is not None and end_bond != bonds_left:
                return smiles, bonds_left, num_trials, False
            return smiles, bonds_left, num_trials, True

        # no more bonds but there are atoms remaining: conversion failed
        if bonds_left <= 0:
            return smiles, bonds_left, num_trials, False
        to_add = formula_list[add_idx]  # atom to be added to current atom
        if not isinstance(to_add, str):
            return smiles, bonds_left, num_trials, False
        if isinstance(to_add, list):
            # NOTE(review): unreachable -- the `not isinstance(to_add, str)`
            # early return above already rejects lists, so nested condensed
            # formulas are never expanded here.  Possibly a deliberate
            # disable; confirm against callers before changing.
            if bonds_left > 1:
                add_str, val, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
                if val > 0:
                    add_str = _get_bond_symb(val + 1) + add_str
                num_trials += trials
                if not success:
                    return smiles, bonds_left, num_trials, False
                result = dfs(smiles + f'({add_str})', bonds_left - 1, cur_idx, add_idx + direction)
            else:
                add_str, bonds_left, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
                num_trials += trials
                if not success:
                    return smiles, bonds_left, num_trials, False
                result = dfs(smiles + add_str, bonds_left, add_idx, add_idx + direction)
            smiles, bonds_left, trials, success = result
            num_trials += trials
            return smiles, bonds_left, num_trials, success
        # atom added is a single symbol: try all its plausible valences
        for val in VALENCES.get(to_add, [1]):
            add_str = _expand_abbreviationMS(to_add)
            if bonds_left > val:
                # atom does not use up remaining bonds: attach in parentheses
                if cur_idx >= 0:
                    add_str = _get_bond_symb(val) + add_str
                result = dfs(smiles + f'({add_str})', bonds_left - val, cur_idx, add_idx + direction)
            else:
                # atom uses up remaining bonds: it becomes the new current atom
                if cur_idx >= 0:
                    add_str = _get_bond_symb(bonds_left) + add_str
                result = dfs(smiles + add_str, val - bonds_left, add_idx, add_idx + direction)
            trials, success = result[2:]
            num_trials += trials
            if success:
                return result[0], result[1], num_trials, success
            if num_trials > 10000:
                break
        return smiles, bonds_left, num_trials, False

    cur_idx = -1 if direction == 1 else len(formula_list)
    add_idx = 0 if direction == 1 else len(formula_list) - 1
    return dfs('', start_bond, cur_idx, add_idx)


def swap_paren_bracket(text):
    """Turn a leading '(...)[...]' into '[...](...)'.

    Note: only the matched prefix is returned; anything after the closing
    bracket is dropped (original behavior preserved).
    """
    if not text.startswith('('):
        return text
    match = re.match(r'^\((.*?)\)\[(.*?)\]', text)
    if match:
        return f'[{match.group(2)}]({match.group(1)})'
    return text
def convert_ch2_string(s):
    """Expand a literal '(CH2)n' chain into an attachment-ready SMILES.

    '(CH2)3' -> '[CH2]CC'; a symbolic repeat such as '(CH2)m' is logged and
    returned unchanged, as is any non-matching string.
    """
    match = re.fullmatch(r'\(CH2\)(\d+|[a-zA-Z]+)', s)
    if match is None:
        return s

    suffix = match.group(1)
    if not suffix.isdigit():
        # symbolic repeat count (e.g. '(CH2)m'): cannot expand
        var = suffix
        print(var, s)
        return s
    n = int(suffix)
    return '[CH2]' if n == 1 else '[CH2]' + 'C' * (n - 1)


def process_string_joinused(s):
    """Strip 'H<count>' from a leading multi-character '[group]' prefix."""
    match = re.match(r'^\[([^\]]*)\](.*)$', s)
    if not match:
        return s

    content, rest = match.groups()
    # only rewrite multi-character groups that actually carry an H
    if len(content) > 1 and 'H' in content:
        cleaned = re.sub(r'H\d*', '', content)
        return f'[{cleaned}]{rest}'
    return s


def all_elements_in_dict(lst, dictionary):
    """Recursively check that every element of a (possibly nested) list is a
    key of `dictionary`.

    :param lst: list to check (may contain nested lists)
    :param dictionary: dict whose keys are consulted
    :return: True when every leaf element is a key, else False
    """
    for element in lst:
        if isinstance(element, list):
            if not all_elements_in_dict(element, dictionary):
                return False
        elif element not in dictionary:
            return False
    return True


def expand_cf2_to_smiles(input_string):
    """Expand '(CF2)nX' into an explicit SMILES chain ending in '[X]'.

    Non-matching input is returned unchanged.
    """
    match = re.match(r'\(CF2\)(\d+)([A-Za-z0-9]+)', input_string)
    if not match:
        return input_string
    n = int(match.group(1))
    tail_group = f"[{match.group(2)}]"
    # each CF2 unit is C(F)(F); the first carries the open-valence marker
    cf2_unit = 'C(F)(F)'
    return '[C](F)(F)' + cf2_unit * (n - 1) + tail_group if n > 0 else tail_group


def find_repeating_unit_and_smiles(s):
    """Detect a string made of one repeated unit and map it to SMILES.

    Returns (smiles, repeat_count, unit); (None, 0, None) when `s` is not a
    pure repetition.  Only CH2 / CF2 / SO2 units are currently mapped; other
    units yield an empty SMILES and a reminder message.
    """
    match = re.fullmatch(r'(.+?)(?:\1)+', s)
    if not match:
        return None, 0, None

    unit = match.group(1)
    repeat_count = len(s) // len(unit)
    unit_map = {
        "CH2": ("C", "[CH2]"),
        "CF2": ("C(F)(F)", "[C](F)(F)"),
        "SO2": ("S(=O)(=O)", "[S](=O)(=O)"),
    }
    if unit in unit_map:
        smiles_unit, smi_init = unit_map[unit]
    else:
        smiles_unit, smi_init = '', ''
        print(f'please add the repateat patter here !!! for: {s}')
    smiles = smi_init + smiles_unit * (repeat_count - 1)
    return smiles, repeat_count, unit
len(s) // len(unit) + # 根据重复单元生成SMILES(适当处理CH2 -> C, CF2 -> CF2) + if unit == "CH2": + smiles_unit = "C" # CH2 -> C + smi_init="[CH2]" + elif unit == "CF2": + smiles_unit = "C(F)(F)" # CF2保持原样 + smi_init="[C](F)(F)" + elif unit == "SO2": + smiles_unit = "S(=O)(=O)" # SO2保持原样 + smi_init="[S](=O)(=O)" + else: + smiles_unit,smi_init='','' + print(f'please add the repateat patter here !!! for: {s}') + # smiles_unit = unit # 其他单元直接使用 + # 生成最终的SMILES + smiles = smi_init + smiles_unit * (repeat_count - 1 ) + + return smiles, repeat_count, unit + else: + return None, 0, None # 如果没有匹配到,则返回None + +def get_smiles_from_symbol(symbol, mol, bonds): + """ + Convert symbol (abbrev. or condensed formula) to smiles + If condensed formula, determine parsing direction and num. bonds on each side using coordinates + """ + if symbol in ABBREVIATIONS: + return ABBREVIATIONS[symbol].smiles + if symbol in RGROUP_SYMBOLS or (symbol[0] in RGROUP_SYMBOLS and symbol[1:].isdigit()): + if symbol[1:].isdigit(): + return f'[{symbol[1:]}*]' + return '*' + + if len(symbol) > 20: + return None + smiles=convert_ch2_string(symbol) + if smiles !=symbol: + return smiles + if '(CF2)' in symbol: + smiles=expand_cf2_to_smiles(symbol) + return smiles + smiles, repeat_count, unit = find_repeating_unit_and_smiles(symbol) + if repeat_count>0: + return smiles + + #TODO@@@ add as speical case or add function, + # this is hard encode NOTE fix this next version + if symbol in ['CH2CH','CHCH2','CH2CH2', 'CH2CH2CH','CH2CH2CH','H2CH2CHC','CHCH2CH2','(CH2)10', 'H2C','CH2',#'CH2CH2NSO2CH3', + 'OCH2CHOHCH2NH','OCH2CHOHCH2','CF2O','OF2C','EtO2CHN','EtO2C', + 'CH2CH2C(O)0CH2CH3','CH2CH2C(O)OCH2CH3','l23I', + 'OCH2CH2OH','OCH2CHCH2CCH3','CH2O', + '(H4NO)2','SO2NHCH2CH','OCH2CH','OCF2H','COCOOCH2CH3','CH2CH2CH2CH','HCH2CH2CH2C','CF3CF2CF2CF2SO3', + # 'SO2(CH2)3SO2NHCH2CHCH2OH', + '(CF2)8H','PH3C','CO','OC', + 'CF2CF2H','NHSO2CH3','CH2CH2C','CH;CH2C(O)0CHCH3','CH2CH2C(O)OCHCH3', + 'NH2','H2N', 'CHO', 'OHC', 
'N(SO2CH3)2','CH2CH2O','CH2CH2C(O)OCH2CH3', + #ACS + 'Ar2P(O)','PhO2S','NHP(O)Ph2','P*Ph3','P+Ph3','NH2.HCl', + #CLEF + 'S[O]a', + #USPTO + '(C3H6O)7CH3','HC','(HC','(CH2CH2CH2CH-)','3(CHCHCHCH272', + #UOB + 'NHzBrH','NH2BrH', + #staker + '(co)','(CO)', + #JPO + 'CH3CH','CH3CCH3','CH3CO','CH3OCH2','CO2C','CH2CO2CH3',"COCl", + ]:#NOTE this are not passed by _condensed_formula_list_to_smiles function + #TODO fix me in next version, may be need LLM to track this + # Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5), + # Substitution(['NH2','H2N'], '[NH2;D1]', "[NH2]", 0.1), + #TODO symbol2SMILES() need dig ChemDraw + if symbol in ['CH2CH','CHCH2']:smiles='[CH2][CH]' + elif symbol in ['PH3C']:smiles='[CH2]P' + elif symbol in ['l23I']:smiles='[I]' + elif symbol in ['HC','(HC']:smiles='[CH]' + elif symbol in ['NHzBrH','NH2BrH']:smiles='[NH2].Br' + elif symbol in ['(C3H6O)7CH3']:smiles="[O]CCC"+"OCCC"*6+'C'#TODO maybe as function + elif symbol in ['NH2.HCl']:smiles="[NH2].Cl" + elif symbol in ['CH2CH2CH2CH','(CH2CH2CH2CH-)']:smiles='[CH2]CC[CH]' + elif symbol in ['3(CHCHCHCH272', 'CHCHCHCH2']:smiles='[CH]CC[CH2]' + # elif symbol in ['D']:smiles='[2H]' + elif symbol in [ 'CH3CH']:smiles='[CH]C' + elif symbol in [ 'CH2CO2CH3']:smiles='[CH2]C(=O)OC' + elif symbol in [ 'CO2C']:smiles='[C](=O)O[C]' + elif symbol in [ 'CH3CCH3']:smiles='[C](C)(C)' + elif symbol in [ 'CH3CO']:smiles='[C](=O)C' + elif symbol in [ 'CH3OCH2']:smiles='[CH2]OC' + + elif symbol in [ '(co)','(CO)']:smiles='[C](=O)' + elif symbol in ['Ar2P(O)']:smiles='[P](*)(*)(=O)' + elif symbol in ['PhO2S']:smiles='[S](=O)(=O)c1ccccc1' + elif symbol in ['CO','OC']:smiles='[C](=O)' + elif symbol in ['CH2O']:smiles='[CH2][O]' + elif symbol in ['P*Ph3','P+Ph3',]:smiles='[P+](c1ccccc1)(c1ccccc1)(c1ccccc1)' + elif symbol in ['NHP(O)Ph2']:smiles='[NH]P(=O)(c1ccccc1)c1ccccc1' + elif symbol in ['CH;CH2C(O)0CHCH3','CH2CH2C(O)OCHCH3']:smiles='[CH2]CC(=O)OCC' + elif symbol in 
['CH2CH2CH','H2CH2CHC','CHCH2CH2']:smiles='[CH2][CH2][CH]' + elif symbol in ['CH2CH2CH2CH']:smiles='[CH2]CC[CH]' + elif symbol in ['HCH2CH2CH2C']:smiles='[CH]CC[CH2]' + elif symbol in ['H2C','CH2']:smiles='[CH2]' + elif symbol in ['H2CH2C','CH2CH2']:smiles='[CH2][CH2]' + elif symbol in ['CHO', 'OHC']:smiles="[CH](=O)" + elif symbol in ['NH2','H2N']:smiles="[NH2]" + elif symbol in ['(CF2)8H',]:smiles="[C](F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)" + elif symbol in ['CH2CH2C(O)OCH2CH3','CH2CH2C(O)0CH2CH3']:smiles='[CH2]CC(=O)OCC' + elif symbol in ['CF3CF2CF2CF2SO3']:smiles='[S](=O)(=O)([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)(F)' + elif symbol in ['S[O]a']:smiles='[S](=O)' + elif symbol in ['COCl']:smiles='[C](=O)Cl' + + + + elif symbol in ['OCF2H']:smiles="[O]C(F)(F)" + elif symbol in ['CF2O']:smiles="[C](F)(F)[O]" + elif symbol in ['OF2C']:smiles="[O][C](F)(F)" + elif symbol in ['CF2CF2H']:smiles="[C](F)(F)C(F)(F)" + # elif symbol in ['CH2CH2NSO2CH3']:smiles='[CH2]CNS(=O)(C)=O' + elif symbol in ['CH2CH2O']:smiles='[CH2]CO' + elif symbol in ['OCH2CH2OH']:smiles='[O]CCO'#NOTE Chemdraw may give some idea + elif symbol in ['EtO2CHN']:smiles='[N]C(=O)OCC' + elif symbol in ['OCH2CHOHCH2NH']:smiles='[O]CC(O)CN' + elif symbol in ['OCH2CHCH2CCH3']:smiles='[O]C[CH]C[C]C' + elif symbol in ['(H4NO)2']:smiles='[O]NON' + elif symbol in ['SO2NHCH2CH']:smiles='[S](=O)(=O)NC[CH]' + elif symbol in ['N(SO2CH3)2']:smiles='[N](S(=O)(=O)C)(S(=O)(=O)C)' + elif symbol in ['CH2CH2C(O)OCH2CH3']:smiles='[CH2]CC(=O)OCC' + elif symbol in ['OCH2CH']:smiles='[O]C[CH]' + elif symbol in ['EtO2C']:smiles='C(=O)OCC' + elif symbol in ['CH2CH2C']:smiles='[CH2]C[C]' + elif symbol in ['NHSO2CH3']:smiles='[NH]S(=O)(=O)C' + elif symbol in ['COCOOCH2CH3']:smiles='C(=O)C(=O)OCC' + # elif symbol in ['SO2(CH2)3SO2NHCH2CHCH2OH']:smiles='[S](=O)(=O)CCCS(=O)(=O)NC[C]CO' + # elif symbol in ['H4NO3S']:smiles='[S]NCC' + # elif symbol in ['(CH2)10','[CH]CCCCCCCCC']:smiles='[CH]CCCCCCCCC'#as in 
convert_ch2_string() + else:smiles=None + return smiles + + total_bonds = int(sum([bond.GetBondTypeAsDouble() for bond in bonds]))#TODO aromtaic bond effect ?? + formula_list = _expand_carbon(_parse_formula(symbol)) + # all_in_dict = all(fl in ABBREVIATIONS for fl in formula_list) + all_in_dict=all_elements_in_dict(formula_list,ABBREVIATIONS) + #total_bonds, bonds_left 机制是有问题的, 所以需要以上的修补,机制不完善 + smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None) + # if debug: + print(f'{[formula_list, total_bonds]} use _condensed_formula_list_to_smiles {success} <<-------\n {smiles}') + if success: + smiles=swap_paren_bracket(smiles) + return smiles + elif all_in_dict :#NOTE resolve abbv combine + # smiles=ABBREVIATIONS[formula_list[0]].smiles + key = extract_abbreviation_key(formula_list[0]) + if key in ABBREVIATIONS: + smiles = ABBREVIATIONS[key].smiles + else: + # raise ValueError(f"Abbreviation {key} not found in ABBREVIATIONS.") + print(f"Abbreviation {key} not found in ABBREVIATIONS.") + smiles='' + for fl_i in range(1,len(formula_list)): + cur_smi=process_string_joinused(ABBREVIATIONS[formula_list[fl_i]].smiles) + smiles += cur_smi + return smiles + + return None + +def abbrev2smile(abbrev,abbrev_exp,mol,idx): + + atom_gost = mol.GetAtomWithIdx(idx) + bonds_gost = atom_gost.GetBonds() + sub_smi = get_smiles_from_symbol(abbrev, mol, bonds_gost) + + if sub_smi: + # print(f"succes expanding {abbrev},{abbrev_exp}\n{sub_smi}\t{idx}") + return sub_smi + else: + print(f"failed expanding {abbrev},{abbrev_exp}\n{sub_smi}\t{idx}") + return '[*]' + + # if abbrev_exp[0] in ABBREVIATIONS: + # init_smi=ABBREVIATIONS[abbrev_exp[0]].smiles + # else: + # if len(abbrev_exp[0])==1: + # init_smi=f'[{abbrev_exp[0]}]' + # else: + # print(f"{abbrev_exp[0]} @@@formula_regex") + # init_smi=f'[{abbrev_exp[0]}]' + # # init_smi=ABBREVIATIONS[abbrev_exp[0]].smiles if abbrev_exp[0] in ABBREVIATIONS else + # if len(abbrev_exp)==1: + # 
# (Removed: dead commented-out alternative implementation of abbrev2smile.)


def replace_cg_notation(astr):
    """Rewrite 'CgH<n>' generic-alkyl notation as an explicit 'C<c>H<n>' formula.

    The carbon count is derived from the hydrogen count of a saturated alkyl
    chain (CcH2c+1): c = (n - 1) // 2, e.g. 'CgH17' -> 'C8H17'.
    """
    def replacer(match):
        h_count = int(match.group(1))
        c_count = (h_count - 1) // 2
        return f'C{c_count}H{h_count}'

    return re.sub(r'CgH(\d+)', replacer, astr)


def _expand_abbreviation(abbrev, mol, idx):
    """
    Expand abbreviation into its SMILES; also converts [Rn] to [n*].
    Falls back to '[*]' when nothing matches.
    """
    if abbrev in ABBREVIATIONS:
        return ABBREVIATIONS[abbrev].smiles
    # FIX: each matching expander used to be called twice (once to test, once
    # to return); call each at most once, in the original priority order.
    for expander in (N_C_H_expand, C_F_expand, C_H_expand2, C_H_expand, C_H_affixExpand):
        expanded = expander(abbrev)
        if expanded:
            return expanded
    if abbrev in RGROUP_SYMBOLS or (abbrev[0] in RGROUP_SYMBOLS and abbrev[1:].isdigit()):
        if abbrev[1:].isdigit():
            return f'[{abbrev[1:]}*]'
        # Bare R-group symbols fall through to the generic handling below.
    elif abbrev in ELEMENTS:
        return f'[{abbrev}]'
    else:
        abbrev_exp = formula_regex(abbrev)
        if abbrev_exp:
            # Last resort: the MolScribe-style condensed-formula expansion.
            return abbrev2smile(abbrev, abbrev_exp, mol, idx)

    match = re.match(r'^(\d+)?(.*)', abbrev)
    if match:
        numeric_part, remaining_part = match.groups()
        if remaining_part in ELEMENTS:
            return f'[{abbrev}]'
        elif numeric_part:
            return f'[{numeric_part}*]'
    else:
        # Unreachable: the pattern above matches any string; kept for parity.
        print(f"fixme !!!@@@@: {abbrev}")
    return '[*]'


def count_current_bonds(mol, atom_idx):
    """Count current bonds (including bond order) for an atom."""
    atom = mol.GetAtomWithIdx(atom_idx)
    return sum(bond.GetBondTypeAsDouble() for bond in atom.GetBonds())
+ return sum(bond.GetBondTypeAsDouble() for bond in atom.GetBonds()) + +debug_not=True + +def expandABB(mol, ABBREVIATIONS, placeholder_atoms):#, RGROUP_SYMBOLS, ELEMENTS): + mols = [mol] + # 逆序遍历 placeholder_atoms,确保删除后不会影响后续索引 + for idx in sorted(placeholder_atoms.keys(), reverse=True) : + group = placeholder_atoms[idx] + group_smiles = _expand_abbreviation(group,mol,idx) + submol = Chem.MolFromSmiles(group_smiles) # 获取官能团的子分子 + try: + submol_rw = Chem.RWMol(submol) # 转换为可编辑的 RWMol + except Exception as e: + print(f"abbver: {group}") + print(f'try to convert {group_smiles} to sub_mol') + print(e) + if debug_not: + print(f"Failed to convert {group_smiles} to sub_mol, using placeholder [*] instead.") + submol = Chem.MolFromSmiles('[*]') + submol_rw = Chem.RWMol(submol) + else: + raise e#NOTE use it when debugging with adding abber and fixing rules in det_engine.py + + # 1. 识别 submol 的 anchor atoms(连接点) + anchor_atoms = [0]#always use the fisrt atom as anchor atom + for atom in submol_rw.GetAtoms(): + # 具有自由基的原子或标记为连接点的原子(例如 [*]) + if atom.GetNumRadicalElectrons() > 0 and atom.GetIdx() not in anchor_atoms:# or atom.GetSymbol() == '*': + anchor_atoms.append(atom.GetIdx()) + # 2. 复制主分子 + new_mol = Chem.RWMol(mol) + placeholder_idx = idx + # 3. 记录 placeholder (*) 原子的邻居及其键类型 + bonds_info = [] + for bond in new_mol.GetBonds(): + if bond.GetBeginAtomIdx() == placeholder_idx: + bonds_info.append({ + "neighbor": bond.GetEndAtomIdx(), + "bond_type": bond.GetBondType() + }) + elif bond.GetEndAtomIdx() == placeholder_idx: + bonds_info.append({ + "neighbor": bond.GetBeginAtomIdx(), + "bond_type": bond.GetBondType() + }) + + # 4. 断开 placeholder 的所有键 + for bond_info in bonds_info: + new_mol.RemoveBond(placeholder_idx, bond_info["neighbor"]) + + # 5. 删除 placeholder 原子 + new_mol.RemoveAtom(placeholder_idx) + + # 6. 
重新计算邻居索引(删除后索引变化) + adjusted_bonds_info = [] + for bond_info in bonds_info: + neighbor = bond_info["neighbor"] + if neighbor < placeholder_idx: + adjusted_neighbor = neighbor + else: + adjusted_neighbor = neighbor - 1 # 索引因删除原子而减 1 + adjusted_bonds_info.append({ + "neighbor": adjusted_neighbor, + "bond_type": bond_info["bond_type"] + }) + + # 7. 合并 submol + new_mol = Chem.RWMol(Chem.CombineMols(new_mol, submol_rw)) + + # 8. 计算 submol 的 anchor atoms 在合并后的索引 + submol_atom_offset = new_mol.GetNumAtoms() - submol_rw.GetNumAtoms() + new_anchor_indices = [submol_atom_offset + anchor_idx for anchor_idx in anchor_atoms] + + # 9. 重新连接官能团,使用原始键类型 + if len(new_anchor_indices) == 1: + # 单连接点情况:所有邻居连接到唯一的 anchor atom + anchor_idx = new_anchor_indices[0] + for bond_info in adjusted_bonds_info: + neighbor = bond_info["neighbor"] + bond_type = bond_info["bond_type"] + new_mol.AddBond(neighbor, anchor_idx, bond_type) + # 重置自由基电子数 + a1 = new_mol.GetAtomWithIdx(neighbor) + a2 = new_mol.GetAtomWithIdx(anchor_idx) + a1.SetNumRadicalElectrons(0) + a2.SetNumRadicalElectrons(0) + else: + # # 多连接点情况:先尝试按顺序连接, 如果* 连* 会存在多种合理价态的不同分子情况 + # 多连接点情况:根据邻居数量和 anchor atoms 分配连接 + if len(adjusted_bonds_info) > len(new_anchor_indices): + print(adjusted_bonds_info,' <---adjusted_bonds_info') + print(new_anchor_indices,'<---new_anchor_indices') + # raise ValueError(f"Too many neighbors ({len(adjusted_bonds_info)}) for submol with {len(new_anchor_indices)} anchor atoms.") + # for i, bond_info in enumerate(adjusted_bonds_info): + # # 按顺序将邻居连接到 anchor atoms + # anchor_idx = new_anchor_indices[i % len(new_anchor_indices)] + # neighbor = bond_info["neighbor"] + # bond_type = bond_info["bond_type"] + # new_mol.AddBond(neighbor, anchor_idx, bond_type) + # # 重置自由基电子数 + # a1 = new_mol.GetAtomWithIdx(neighbor) + # a2 = new_mol.GetAtomWithIdx(anchor_idx) + # a1.SetNumRadicalElectrons(0) + # a2.SetNumRadicalElectrons(0) + # 跟踪每个 anchor 的当前成键数 + anchor_bond_counts = {idx: 
new_mol.GetAtomWithIdx(idx).GetTotalValence() for idx in new_anchor_indices} + print(anchor_bond_counts,'<---anchor_bond_counts') + # max_valence = {6: 4, 7: 3, 8: 2} # 示例:C=4, N=3, O=2,需根据实际原子类型扩展 + adjusted_bonds_info = sorted(adjusted_bonds_info, key=lambda x: x['neighbor']) + if mol.GetNumConformers() > 0:#as some mol may not have the conf dispite pass the 2d assign process + pos_0 = mol.GetConformer().GetAtomPosition(adjusted_bonds_info[0]['neighbor']) + pos_1 = mol.GetConformer().GetAtomPosition(adjusted_bonds_info[-1]['neighbor']) + print(pos_0.x,pos_1.x,"xxx",adjusted_bonds_info) + # if group =='SO2NH': + # if pos_0.x 0.1: + if orig_valid and scaled_valid: + if orig_score >= scaled_score and orig_text: + return orig_text, orig_score, cropped_img_orig + elif scaled_text: + return scaled_text, scaled_score, cropped_img_scaled + elif orig_valid and not scaled_valid: + return orig_text, orig_score, cropped_img_orig + elif scaled_valid and not orig_valid: + return scaled_text, scaled_score, cropped_img_scaled + else: + print(f"Both texts are invalid: orig_text='{orig_text}', scaled_text='{scaled_text}'") + if orig_score >= scaled_score: + return orig_text, orig_score, cropped_img_orig + else: + return scaled_text, scaled_score, cropped_img_scaled + # 如果分差小于0.1,选择更合理的化学表达式 + else: + # 如果只有一个有效,选择有效的 + if orig_valid and not scaled_valid: + return orig_text, orig_score, cropped_img_orig + elif scaled_valid and not orig_valid: + return scaled_text, scaled_score, cropped_img_scaled + # 如果都有效,比较长度 + elif orig_valid and scaled_valid: + if orig_text in ABBREVIATIONS and scaled_text not in ABBREVIATIONS: + if N_C_H_expand(scaled_text) or C_F_expand(scaled_text) or C_H_expand2(scaled_text) or C_H_expand(scaled_text): + if len(scaled_text)> len(orig_text): + return scaled_text, scaled_score, cropped_img_scaled + return orig_text, orig_score, cropped_img_orig + elif orig_text not in ABBREVIATIONS and scaled_text in ABBREVIATIONS: + if N_C_H_expand(orig_text) or 
C_F_expand(orig_text) or C_H_expand2(orig_text) or C_H_expand(orig_text): + if len(orig_text)> len(scaled_text): + return orig_text, orig_score, cropped_img_orig + return scaled_text, scaled_score, cropped_img_scaled + elif orig_text not in ABBREVIATIONS and scaled_text not in ABBREVIATIONS: + if len(orig_text) > len(scaled_text): + return orig_text, orig_score, cropped_img_orig + else: + if len(orig_text) == len(scaled_text): + if orig_score >= scaled_score : + return orig_text, orig_score, cropped_img_orig + else: + return scaled_text, scaled_score, cropped_img_scaled + return scaled_text, scaled_score, cropped_img_scaled + + elif orig_text in ABBREVIATIONS and scaled_text in ABBREVIATIONS: + if len(orig_text) >= len(scaled_text): + return orig_text, orig_score, cropped_img_orig + else: + return scaled_text, scaled_score, cropped_img_scaled + # 如果都不有效,优先选择 orig(若存在) + elif orig_text: + return orig_text, orig_score, cropped_img_orig + elif scaled_text: + return scaled_text, scaled_score, cropped_img_scaled + + # 默认返回 scaled(若存在) + return scaled_text, scaled_score, cropped_img_scaled if scaled_text else (None, None, None) + +# def expandABB(mol,ABBREVIATIONS, placeholder_atoms):# RGROUP_SYMBOLS, ELEMENTS): +# mols = [mol] + +# # Process placeholders in reverse order to avoid index issues +# for idx in sorted(placeholder_atoms.keys(), reverse=True): +# group = placeholder_atoms[idx] +# group_smiles = _expand_abbreviation(group)# ABBREVIATIONS, RGROUP_SYMBOLS, ELEMENTS) + +# try: +# submol = Chem.MolFromSmiles(group_smiles) +# if not submol: +# raise ValueError(f"Invalid SMILES for group {group}: {group_smiles}") +# submol_rw = RWMol(submol) +# except Exception as e: +# print(f"Error processing SMILES for group {group}: {e}") +# continue + +# # Create a new editable molecule +# new_mol = RWMol(mol) +# placeholder_idx = idx + +# # Get neighbors of the placeholder atom +# neighbors = [nb.GetIdx() for nb in new_mol.GetAtomWithIdx(placeholder_idx).GetNeighbors()] + +# # 
Identify anchor atoms in submol (atoms marked as [*] or with isotope labels) +# anchor_atoms = [] +# for atom in submol.GetAtoms(): +# if atom.GetNumRadicalElectrons() > 0: +# #atom.GetSymbol() == '*' or atom.GetIsotope() > 0: +# anchor_atoms.append(atom.GetIdx()) + +# # Validate number of anchor atoms vs. neighbors +# if len(anchor_atoms) != len(neighbors): +# print(f"Warning: Mismatch between anchor atoms ({len(anchor_atoms)}) and neighbors ({len(neighbors)}) for group {group}") +# print(len(anchor_atoms), len(neighbors)) +# if len(anchor_atoms)==0: +# anchor_atoms.append(0)# use the first atom of submol as default such as PPh3 + + +# # Remove bonds involving the placeholder atom +# bonds_to_remove = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) +# for bond in new_mol.GetBonds() +# if bond.GetBeginAtomIdx() == placeholder_idx or bond.GetEndAtomIdx() == placeholder_idx] +# for bond in bonds_to_remove: +# new_mol.RemoveBond(bond[0], bond[1]) + +# # Remove the placeholder atom +# new_mol.RemoveAtom(placeholder_idx) + +# # Adjust neighbor indices after atom removal +# new_neighbors = [n - 1 if n > placeholder_idx else n for n in neighbors] + +# # Combine molecules +# new_mol = RWMol(CombineMols(new_mol, submol_rw)) + +# # Connect anchor atoms to neighbors +# submol_offset = new_mol.GetNumAtoms() - submol.GetNumAtoms() +# for anchor_idx, neighbor_idx in zip(anchor_atoms, new_neighbors): +# new_anchor_idx = submol_offset + anchor_idx +# new_mol.AddBond(neighbor_idx, new_anchor_idx, Chem.BondType.SINGLE) + +# # Reset radical electrons +# new_mol.GetAtomWithIdx(neighbor_idx).SetNumRadicalElectrons(0) +# new_mol.GetAtomWithIdx(new_anchor_idx).SetNumRadicalElectrons(0) + +# mol = new_mol +# mols.append(mol) + +# # Generate final SMILES +# try: +# modified_smiles = Chem.MolToSmiles(mols[-1]) +# except Exception as e: +# print(f"Error generating SMILES: {e}") +# return mols[-1], None + +# return mols[-1], modified_smiles + + +# def _expand_abbreviation(abbrev): +# """ +# 
Expand abbreviation into its SMILES; also converts [Rn] to [n*] +# Used in `_condensed_formula_list_to_smiles` when encountering abbrev. in condensed formula +# """ +# if abbrev in ABBREVIATIONS: +# return ABBREVIATIONS[abbrev].smiles +# elif abbrev in RGROUP_SYMBOLS or (abbrev[0] == 'R' and abbrev[1:].isdigit()): + +# if abbrev[1:].isdigit(): +# return f'[{abbrev[1:]}*]' +# elif abbrev in ELEMENTS:#ocr tool need this +# return f'[{abbrev}]' +# # try abbrev + +# match = re.match(r'^(\d+)?(.*)', abbrev) +# if match: +# numeric_part, remaining_part = match.groups() +# if remaining_part in ELEMENTS: +# return f'[{abbrev}]' +# else: +# if numeric_part: +# abbrev=f'[{numeric_part}*]' +# return '[*]' + + + +# def expandABB(mol,ABBREVIATIONS, placeholder_atoms): +# mols = [mol] +# # **第三步: 替换 * 并合并官能团** +# # 逆序遍历 placeholder_atoms,确保删除后不会影响后续索引 +# for idx in sorted(placeholder_atoms.keys(), reverse=True): +# group = placeholder_atoms[idx] # 获取官能团名称 +# # print(idx, group) +# group=_expand_abbreviation(group) +# submol = Chem.MolFromSmiles(group) # 获取官能团的子分子 +# submol_rw = RWMol(submol) # 让 submol 变成可编辑的 RWMol +# anchor_atom_idx = 0 # 选择 `submol` 的第一个原子作为连接点 as defined in ABBREVIATIONS +# # **1. 复制主分子** +# new_mol = RWMol(mol) +# # **2. 计算 `*` 在 `new_mol` 中的索引** +# placeholder_idx = idx +# # **3. 记录 `*` 原子的邻居** +# neighbors = [nb.GetIdx() for nb in new_mol.GetAtomWithIdx(placeholder_idx).GetNeighbors()] +# # **4. 断开 `*` 的所有键** +# bonds_to_remove = [] # 记录要断开的键 +# for bond in new_mol.GetBonds(): +# if bond.GetBeginAtomIdx() == placeholder_idx or bond.GetEndAtomIdx() == placeholder_idx: +# bonds_to_remove.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())) +# for bond in bonds_to_remove: +# new_mol.RemoveBond(bond[0], bond[1]) +# # **5. 删除 `*` 原子** +# new_mol.RemoveAtom(placeholder_idx) +# # **6. 
# (Removed: remainder of a dead, commented-out legacy expandABB implementation.)


def boxes_overlap(box1, box2):
    """Return True when two [x1, y1, x2, y2] boxes overlap (touching counts)."""
    x1, y1, x2, y2 = box1
    bx1, by1, bx2, by2 = box2
    return not (x2 < bx1 or x1 > bx2 or y2 < by1 or y1 > by2)


def boxes_overlap2(atombonx, bondbox):
    """Locate the point of *bondbox* facing away from *atombonx*.

    Returns the point 70% of the way from the bond-box centre towards the
    non-overlapping end (so an added H sits close to its heavy neighbour).
    When the bond box lies entirely inside the atom box the bond centre is
    returned.

    Parameters:
        atombonx: (x1, y1, x2, y2) atom box
        bondbox:  (bx1, by1, bx2, by2) bond box
    """
    x1, y1, x2, y2 = atombonx
    bx1, by1, bx2, by2 = bondbox

    bond_center_x = (bx1 + bx2) / 2
    bond_center_y = (by1 + by2) / 2

    def distance_to_center(x, y):
        # Distance from (x, y) to the centre of the atom box.
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        return ((x - center_x) ** 2 + (y - center_y) ** 2) ** 0.5

    def get_partway_point(far_x, far_y):
        # 70% of the way from the bond centre towards (far_x, far_y).
        dx = far_x - bond_center_x
        dy = far_y - bond_center_y
        return bond_center_x + 0.7 * dx, bond_center_y + 0.7 * dy

    # Fully disjoint boxes: project towards the corner farther from the atom.
    if bx2 < x1 or bx1 > x2 or by2 < y1 or by1 > y2:
        dist1 = distance_to_center(bx1, by1)
        dist2 = distance_to_center(bx2, by2)
        far_x, far_y = (bx2, by2) if dist2 > dist1 else (bx1, by1)
        return get_partway_point(far_x, far_y)

    # Bond box fully inside the atom box: no free end, return the centre.
    if bx1 >= x1 and bx2 <= x2 and by1 >= y1 and by2 <= y2:
        return bond_center_x, bond_center_y

    # One corner inside the atom box: project towards the opposite corner.
    if x1 <= bx1 <= x2 and y1 <= by1 <= y2:
        return get_partway_point(bx2, by2)
    if x1 <= bx2 <= x2 and y1 <= by2 <= y2:
        return get_partway_point(bx1, by1)

    # Partial intersection with both corners outside: use the farther corner.
    dist1 = distance_to_center(bx1, by1)
    dist2 = distance_to_center(bx2, by2)
    far_x, far_y = (bx2, by2) if dist2 > dist1 else (bx1, by1)
    return get_partway_point(far_x, far_y)


charge_labels = [19, 20, 21, 22, 23]


def outputbox_update(output, charge_labels, bond_labels, lab2idx):
    """Add implicit-H atom boxes at the free end of bonds touching only one atom box.

    Parameters:
        output: detection dict with 'bbox', 'bbox_centers', 'scores', 'pred_classes'
        charge_labels: class ids that are charges (excluded from atoms)
        bond_labels: class ids that are bonds
        lab2idx: label-name -> class-id mapping (must contain 'H')
    Returns:
        (deep-copied output augmented with the new H boxes,
         {bond index: [atom index]} for bonds that overlapped a single atom box)
    """
    pred = output['pred_classes']
    bonds_mask = np.array([ins in bond_labels for ins in pred])
    bond_bbox = output['bbox'][bonds_mask]
    atoms_mask = np.array([ins not in bond_labels and ins not in charge_labels for ins in pred])
    atom_bbox = output['bbox'][atoms_mask]

    new_atoms = []
    b_len = 3  # half-size of the synthetic H box
    single_odd_b2a = {}
    for bi, bb in enumerate(bond_bbox):
        overlapped_atoms = []
        overlapped_abox = []
        for ai, aa in enumerate(atom_bbox):
            if boxes_overlap(bb, aa):
                overlapped_atoms.append(ai)
                overlapped_abox.append(aa)
        if len(overlapped_atoms) == 1:
            single_odd_b2a[bi] = overlapped_atoms
            # Place the hydrogen at the non-overlapping end of the bond box.
            nx, ny = boxes_overlap2(overlapped_abox[0], bb)
            new_atoms.append({
                'bbox': np.array([nx - b_len, ny - b_len, nx + b_len, ny + b_len]).reshape(-1, 4),
                'bbox_centers': np.array([nx, ny]).reshape(-1, 2),
                'scores': np.array([1.0]),
                'pred_classes': np.array([lab2idx['H']]),
            })

    output2_ = copy.deepcopy(output)
    for boxout in new_atoms:
        for k, arr in boxout.items():
            current = output2_[k]
            if arr.ndim == 1:
                output2_[k] = np.append(current, arr)
            elif arr.ndim >= 2:
                output2_[k] = np.concatenate([current, arr], axis=0)
            else:
                print('errprs, unkown conditions !!!@')
    return output2_, single_odd_b2a


def remove_unconnected_hydrogens(mol):
    """Remove H atoms that have no heavy-atom neighbour (isolated or H-H only).

    :param mol: RDKit Mol
    :return: edited RWMol
    """
    molexp = Chem.RWMol(mol)
    to_remove = [
        atom.GetIdx()
        for atom in molexp.GetAtoms()
        if atom.GetSymbol() == 'H'
        and not any(nb.GetSymbol() != 'H' for nb in atom.GetNeighbors())
    ]
    # Delete from highest index down so removals don't shift later indices.
    for ai in sorted(to_remove, reverse=True):
        molexp.RemoveAtom(ai)
    return molexp

# (Removed: redundant mid-file `from rdkit import Chem` / `AllChem` imports —
#  both are already imported at the top of the file.)


def remove_unconnected_hydrogens2(mol):
    """Remove H atoms with no heavy-atom neighbour and report their coordinates.

    :param mol: RDKit Mol carrying at least one conformer
    :return: (edited RWMol, [(x, y, z), ...] of the removed hydrogens)
    """
    rw_mol = Chem.RWMol(mol)
    conformer = rw_mol.GetConformer()
    to_remove = []
    removed_h_coords = []
    for atom in rw_mol.GetAtoms():
        if atom.GetSymbol() != 'H':
            continue
        if any(nb.GetSymbol() != 'H' for nb in atom.GetNeighbors()):
            continue
        to_remove.append(atom.GetIdx())
        pos = conformer.GetAtomPosition(atom.GetIdx())
        removed_h_coords.append((pos.x, pos.y, pos.z))
    for ai in sorted(to_remove, reverse=True):
        rw_mol.RemoveAtom(ai)
    return rw_mol, removed_h_coords


def detect_unconnected_hydrogens(mol):
    """Return indices (descending) of H atoms that have no heavy-atom neighbour."""
    rw_mol = Chem.RWMol(mol)
    # Conformer access mirrors the sibling helpers (requires one conformer).
    conformer = rw_mol.GetConformer()
    to_remove = []
    for atom in rw_mol.GetAtoms():
        if atom.GetSymbol() != 'H':
            continue
        if any(nb.GetSymbol() != 'H' for nb in atom.GetNeighbors()):
            continue
        to_remove.append(atom.GetIdx())
    to_remove.sort(reverse=True)
    return to_remove


def view_box_center2(bond_bbox, bond_centers, bond_scores, bond_classes,
                     overlap_dist_thresh=5.0,
                     max_centers_per_box=5,
                     plot_view=False):
    """Filter bond boxes/centres: merge near-duplicate centres, drop crowded boxes.

    Parameters:
        bond_bbox: (N, 4) array of [x1, y1, x2, y2]
        bond_centers: (N, 2) array of [x, y]
        bond_scores: (N,) scores
        bond_classes: (N,) class ids
        overlap_dist_thresh: centres closer than this are duplicates
        max_centers_per_box: boxes containing more centres than this are dropped
        plot_view: when True, also build a matplotlib figure of the result
    Returns:
        (bbox, centers, scores, classes, fig) — fig is None unless plot_view.
    """
    assert len(bond_bbox) == len(bond_centers) == len(bond_scores), "Input arrays must have equal length"
    n = len(bond_bbox)
    # Step 1: among centres closer than the threshold, keep the higher score.
    keep_centers = np.ones(n, dtype=bool)
    for i in range(n):
        if not keep_centers[i]:
            continue
        for j in range(i + 1, n):
            if not keep_centers[j]:
                continue
            dist = np.sqrt(np.sum((bond_centers[i] - bond_centers[j]) ** 2))
            if dist < overlap_dist_thresh:
                if bond_scores[i] > bond_scores[j]:
                    keep_centers[j] = False
                else:
                    keep_centers[i] = False
    bond_bbox = bond_bbox[keep_centers]
    bond_centers = bond_centers[keep_centers]
    bond_scores = bond_scores[keep_centers]
    bond_classes = bond_classes[keep_centers]
    n = len(bond_bbox)
    # Step 2: drop boxes that contain too many centres.
    keep_boxes = np.ones(n, dtype=bool)
    for i in range(n):
        x1, y1, x2, y2 = bond_bbox[i]
        centers_in_box = np.sum((bond_centers[:, 0] >= x1) & (bond_centers[:, 0] <= x2) &
                                (bond_centers[:, 1] >= y1) & (bond_centers[:, 1] <= y2))
        if centers_in_box > max_centers_per_box:
            keep_boxes[i] = False
    final_bond_bbox = bond_bbox[keep_boxes]
    final_bond_centers = bond_centers[keep_boxes]
    final_bond_scores = bond_scores[keep_boxes]
    final_bond_classes = bond_classes[keep_boxes]
    if plot_view:
        # Optional visualisation of the surviving boxes and centres.
        fig, ax = plt.subplots(figsize=(10, 10))
        for box in final_bond_bbox:
            x1, y1, x2, y2 = box
            rect = Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1,
                             edgecolor='blue', facecolor='none')
            ax.add_patch(rect)
        for center in final_bond_centers:
            ax.add_patch(Circle(center, radius=5, edgecolor='red',
                                facecolor='none', linewidth=1))
        x_min = min(final_bond_bbox[:, 0].min(), final_bond_centers[:, 0].min()) - 10
        x_max = max(final_bond_bbox[:, 2].max(), final_bond_centers[:, 0].max()) + 10
        y_min = min(final_bond_bbox[:, 1].min(), final_bond_centers[:, 1].min()) - 10
        y_max = max(final_bond_bbox[:, 3].max(), final_bond_centers[:, 1].max()) + 10
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_title("Filtered Boxes and Centers")
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        plt.gca().set_aspect('equal', adjustable='box')
        plt.grid(True, linestyle='--', alpha=0.7)
    else:
        fig = None
    return final_bond_bbox, final_bond_centers, final_bond_scores, final_bond_classes, fig


def calculate_iou(box1, box2):
    """IoU (intersection over union) of two [x_min, y_min, x_max, y_max] boxes."""
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection
    return intersection / union if union > 0 else 0


def nms_per_class(labels, boxes, scores, iou_thresh=0.5):
    """Per-class non-maximum suppression keeping the highest-scoring boxes.

    Parameters:
        labels: (N,) class labels
        boxes: (N, 4) [x1, y1, x2, y2]
        scores: (N,) confidences
        iou_thresh: suppress boxes with IoU >= this against a kept box
    Returns:
        {'labels', 'boxes', 'scores'} restricted to the kept detections.
    """
    kept_indices = []
    for label in np.unique(labels):
        class_mask = labels == label
        class_indices = np.where(class_mask)[0]
        class_boxes = boxes[class_mask]
        class_scores = scores[class_mask]
        # Highest score first.
        order = np.argsort(class_scores)[::-1]
        class_boxes = class_boxes[order]
        class_scores = class_scores[order]
        class_indices = class_indices[order]
        keep = []
        while len(class_scores) > 0:
            keep.append(class_indices[0])
            if len(class_scores) == 1:
                break
            ious = np.array([calculate_iou(class_boxes[0], box) for box in class_boxes[1:]])
            keep_mask = ious < iou_thresh
            class_boxes = class_boxes[1:][keep_mask]
            class_scores = class_scores[1:][keep_mask]
            class_indices = class_indices[1:][keep_mask]
        kept_indices.extend(keep)
    kept_indices = np.array(kept_indices)
    return {
        'labels': labels[kept_indices],
        'boxes': boxes[kept_indices],
        'scores': scores[kept_indices],
    }


def get_overlap_region(box1, box2):
    """Return (x_min, y_min, x_max, y_max) of the overlap of two boxes, or None.

    Args:
        box1, box2: [x_min, y_min, x_max, y_max]
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    if x2 <= x1 or y2 <= y1:
        return None  # No overlap
    return (x1, y1, x2, y2)


def are_bond_connected(box1, box2, bond_bboxes, bond_iou_threshold=0.1):
    """True when a bond box overlaps both atom boxes with its centre in their overlap.

    Args:
        box1, box2: atom boxes to check
        bond_bboxes: iterable of bond boxes
        bond_iou_threshold: minimum IoU of the bond box with each atom box
    """
    overlap_region = get_overlap_region(box1, box2)
    if overlap_region is None:
        return False  # Atom boxes don't even touch.
    ox_min, oy_min, ox_max, oy_max = overlap_region
    for bond_box in bond_bboxes:
        # Preliminary IoU check against both atom boxes.
        if (calculate_iou(box1, bond_box) > bond_iou_threshold
                and calculate_iou(box2, bond_box) > bond_iou_threshold):
            bond_center_x = (bond_box[0] + bond_box[2]) / 2
            bond_center_y = (bond_box[1] + bond_box[3]) / 2
            if ox_min <= bond_center_x <= ox_max and oy_min <= bond_center_y <= oy_max:
                return True
    return False
# (Removed: a second, byte-identical definition of calculate_iou that
#  immediately shadowed the one above.)
= max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + + intersection = max(0, x2 - x1) * max(0, y2 - y1) + area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + union = area1 + area2 - intersection + + return intersection / union if union > 0 else 0 + +def nms(atom_bboxes, atom_scores, atom_classes, iou_threshold=0.5): + """ + 应用非极大值抑制 (NMS) + atom_bboxes: 列表,包含所有边界框 [x_min, y_min, x_max, y_max] + atom_scores: 列表,包含每个边界框的置信度 + atom_classes: 列表,包含每个边界框的类别 + iou_threshold: IoU 阈值,用于判断是否抑制 + 返回: 保留的边界框、类别和置信度的索引 + """ + # 按置信度排序,获取索引 + indices = np.argsort(atom_scores)[::-1] # 从高到低排序 + + keep_indices = [] + while len(indices) > 0: # 使用 len(indices) 替代 indices.size + # 保留当前最高置信度的框 + current_idx = indices[0] + keep_indices.append(current_idx) + + # 计算当前框与其他框的 IoU + ious = np.array([calculate_iou(atom_bboxes[current_idx], atom_bboxes[idx]) for idx in indices[1:]]) + # 找出 IoU > threshold 的索引(相对于 indices[1:] 的偏移) + suppress_indices = indices[1:][ious > iou_threshold] + # 更新 indices,去除当前框和被抑制的框 + indices = np.setdiff1d(indices, np.concatenate(([current_idx], suppress_indices))) + # 调试信息 + # print(f"Current idx: {current_idx}, rmoved: {suppress_indices}, Remaining: {indices}") + # print(f"Current idx: {current_idx}, rmoved: {suppress_indices}, IOU: {ious}") + + # 返回保留的框、类别和置信度 + kept_bboxes = np.array([atom_bboxes[i] for i in keep_indices]) + kept_classes = np.array([atom_classes[i] for i in keep_indices]) + kept_scores = np.array([atom_scores[i] for i in keep_indices]) + + return kept_bboxes, kept_classes, kept_scores + +def count_bond_overlaps(box, bond_bboxes, bond_iou_threshold=0.1): + """ + Count how many bond boxes overlap with an atom box. 
+ + Args: + box: atom box [x_min, y_min, x_max, y_max] + bond_bboxes: array of bond boxes + bond_iou_threshold: IoU threshold for overlap + + Returns: + int: number of overlapping bond boxes + """ + return sum(1 for bond_box in bond_bboxes if calculate_iou(box, bond_box) > bond_iou_threshold) + + +def count_bond_overlaps(box, bond_bboxes, bond_iou_threshold=0.01): + """Count how many bond boxes overlap with an atom box.""" + return sum(1 for bond_box in bond_bboxes if calculate_iou(box, bond_box) > bond_iou_threshold) + +def count_atom_overlaps(box, all_bboxes, exclude_idx, min_iou=0.01): + """Count how many other atom boxes overlap with this box.""" + return sum(1 for i, other_box in enumerate(all_bboxes) + if i != exclude_idx and calculate_iou(box, other_box) > min_iou) + +def merge_low_iou_boxes(kept_bboxes, kept_classes, kept_scores, bond_bboxes, + merge_threshold=0.5, score_threshold=0.7, bond_iou_threshold=0.01, + high_iou_threshold=0.8, large_score_threshold=0.5): + """ + Merge or filter boxes with IoU conditions, removing large low-score boxes first. 
+ + Args: + kept_bboxes: array, atom bounding boxes [x_min, y_min, x_max, y_max] + kept_classes: array, class labels (e.g., 0 for 'C') + kept_scores: array, confidence scores + bond_bboxes: array, bond bounding boxes + merge_threshold: float, upper IoU threshold for merging + score_threshold: float, score threshold to preserve boxes + bond_iou_threshold: float, IoU threshold for bond connectivity + high_iou_threshold: float, IoU threshold for high-IoU merging + large_score_threshold: float, score threshold for large box removal (default 0.5) + + Returns: + tuple: (merged_bboxes, merged_classes, merged_scores) + """ + if len(kept_bboxes) <= 1: + return kept_bboxes, kept_classes, kept_scores + + kept_bboxes = np.array(kept_bboxes) + kept_classes = np.array(kept_classes) + kept_scores = np.array(kept_scores) + bond_bboxes = np.array(bond_bboxes) + + # Step 0: Remove large boxes with low scores, high atom overlaps, and high bond overlaps + areas = (kept_bboxes[:, 2] - kept_bboxes[:, 0]) * (kept_bboxes[:, 3] - kept_bboxes[:, 1]) + median_area = np.median(areas) + keep_mask = np.ones(len(kept_bboxes), dtype=bool) + + for i in range(len(kept_bboxes)): + if kept_scores[i] < large_score_threshold: + atom_overlaps = count_atom_overlaps(kept_bboxes[i], kept_bboxes, i) + bond_overlaps = count_bond_overlaps(kept_bboxes[i], bond_bboxes, bond_iou_threshold) + is_large = areas[i] > median_area # Define "large" as above median + if is_large and atom_overlaps >= 2 and bond_overlaps >= 3: + keep_mask[i] = False + print(f"Removed large low-score box idx {i}: score {kept_scores[i]}, " + f"area {areas[i]}, atom overlaps {atom_overlaps}, bond overlaps {bond_overlaps}") + + # Filter boxes + kept_bboxes = kept_bboxes[keep_mask] + print(f"afterRemoved large low-score atom box::{len(kept_bboxes)} ") + kept_classes = kept_classes[keep_mask] + kept_scores = kept_scores[keep_mask] + if len(kept_bboxes) == 0: + return np.array([]), np.array([]), np.array([]) + + merged_bboxes = [] + 
merged_classes = [] + merged_scores = [] + used_indices = set() + + # Step 1: Merge boxes with IoU > high_iou_threshold + i = 0 + while i < len(kept_bboxes): + if i in used_indices: + i += 1 + continue + + high_iou_group = [i] + for j in range(len(kept_bboxes)): + if j in used_indices or j == i: + continue + iou = calculate_iou(kept_bboxes[i], kept_bboxes[j]) + if iou > high_iou_threshold: + high_iou_group.append(j) + + if len(high_iou_group) > 1:#atom box ovrlaped + group_scores = kept_scores[high_iou_group] + max_score_idx = high_iou_group[np.argmax(group_scores)] + merged_bboxes.append(kept_bboxes[max_score_idx]) + merged_classes.append(kept_classes[max_score_idx]) + merged_scores.append(kept_scores[max_score_idx]) + used_indices.update(high_iou_group) + print(f"Merged high-IoU (> {high_iou_threshold}) boxes: {high_iou_group}, " + f"kept index: {max_score_idx}") + i += 1 + + # Step 2: Process remaining boxes + i = 0 + while i < len(kept_bboxes): + if i in used_indices: + i += 1 + continue + + current_indices = [i] + for j in range(len(kept_bboxes)): + if j in used_indices or j == i: + continue + iou = calculate_iou(kept_bboxes[i], kept_bboxes[j])#IOU between atoms box + if 0.05 <= iou < merge_threshold:#better detect model with score matters + #any small IOU between atoms will processed here + if kept_scores[j]<0.7: + current_indices.append(j) + + group_indices = current_indices + group_scores = kept_scores[group_indices] + group_classes = kept_classes[group_indices] + group_bboxes = kept_bboxes[group_indices] + + max_score = np.max(group_scores) + max_score_idx = group_indices[np.argmax(group_scores)] + + if max_score >= score_threshold: + bond_connected = False + if len(group_indices) > 1: + for idx1, idx2 in zip(group_indices[:-1], group_indices[1:]): + if are_bond_connected(kept_bboxes[idx1], kept_bboxes[idx2], + bond_bboxes, bond_iou_threshold): + bond_connected = True + break + if bond_connected: + for idx in group_indices: + 
merged_bboxes.append(kept_bboxes[idx]) + merged_classes.append(kept_classes[idx]) + merged_scores.append(kept_scores[idx]) + print(f"Kept all bond-connected boxes: {group_indices}") + else: + bond_overlap_counts = [count_bond_overlaps(kept_bboxes[idx], bond_bboxes, + bond_iou_threshold) for idx in group_indices] + max_overlaps = max(bond_overlap_counts) + candidates = [idx for idx, count in zip(group_indices, bond_overlap_counts) + if count == max_overlaps] + best_idx = max(candidates, key=lambda idx: kept_scores[idx]) + merged_bboxes.append(kept_bboxes[best_idx]) + merged_classes.append(kept_classes[best_idx]) + merged_scores.append(kept_scores[best_idx]) + # print(f"No bond box overlap, kept box with most bond overlaps: {best_idx}, " + # f"overlap count: {max_overlaps}") + else: + if len(group_indices) == 1: + merged_bboxes.append(kept_bboxes[i]) + merged_classes.append(kept_classes[i]) + merged_scores.append(kept_scores[i]) + print(f"Merged lower IOU @@ ONLY ONE box {i}") + else: + new_bbox = [ + np.min(group_bboxes[:, 0]), # x_min + np.min(group_bboxes[:, 1]), # y_min + np.max(group_bboxes[:, 2]), # x_max + np.max(group_bboxes[:, 3]) # y_max + ] + merged_bboxes.append(new_bbox) + merged_classes.append(group_classes[np.argmax(group_scores)]) + merged_scores.append(max_score) + print(f"Merged low-score boxes: {group_indices}") + used_indices.update(group_indices) + i += 1 + + print(f"after processs low IOU atom box::{len(merged_bboxes)} ") + return (np.array(merged_bboxes), np.array(merged_classes), np.array(merged_scores)) + + +def refine_boxes(atom_bboxes, atom_scores, atom_classes, bond_bboxes, + nms_iou_threshold=0.5, merge_threshold=0.5, score_threshold=0.5, + bond_iou_threshold=0.01, high_iou_threshold=0.8): + """ + Iteratively apply NMS and merge until the number of boxes stabilizes. 
+ + Args: + atom_bboxes, atom_scores, atom_classes: Initial atom box data + bond_bboxes: Bond box data + nms_iou_threshold, merge_threshold, score_threshold, bond_iou_threshold, high_iou_threshold: Parameters + + Returns: + tuple: (final_bboxes, final_classes, final_scores) + """ + current_bboxes = np.array(atom_bboxes) + current_classes = np.array(atom_classes) + current_scores = np.array(atom_scores) + prev_count = len(current_bboxes) + 1 # Ensure at least one iteration + + iteration = 0 + while len(current_bboxes) < prev_count: + print(f"\nIteration {iteration}: Starting with {len(current_bboxes)} boxes") + prev_count = len(current_bboxes) + + # Apply NMS + kept_bboxes, kept_classes, kept_scores = nms( + current_bboxes, current_scores, current_classes, iou_threshold=nms_iou_threshold + ) + print(f"After NMS: {len(kept_bboxes)} boxes") + + # Apply merge_low_iou_boxes + merged_bboxes, merged_classes, merged_scores = merge_low_iou_boxes( + kept_bboxes, kept_classes, kept_scores, bond_bboxes, + merge_threshold=merge_threshold, score_threshold=score_threshold, + bond_iou_threshold=bond_iou_threshold, high_iou_threshold=high_iou_threshold + ) + print(f"After merge: {len(merged_bboxes)} boxes") + + # Update for next iteration + current_bboxes = merged_bboxes + current_classes = merged_classes + current_scores = merged_scores + iteration += 1 + + print(f"Converged after {iteration} iterations with {len(current_bboxes)} boxes") + return current_bboxes, current_scores, current_classes + +def merge_low_iou_boxes_old(kept_bboxes, kept_classes, kept_scores, merge_threshold=0.3): + """ + 合并 IoU < merge_threshold 的边界框,使用较高 score 的 class + """ + if len(kept_bboxes) <= 1: + return kept_bboxes, kept_classes, kept_scores + + merged_bboxes = [] + merged_classes = [] + merged_scores = [] + used_indices = set() + + for i in range(len(kept_bboxes)): + if i in used_indices: + continue + + # 找到 IoU < merge_threshold 的框组 + current_indices = [i] + for j in range(i + 1, len(kept_bboxes)): 
############################################################################################################################################################
#molscrbe evaluate
from SmilesPE.pretokenizer import atomwise_tokenizer

def canonicalize_smiles(smiles, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True):
    """Canonicalize one SMILES with RDKit.

    Args:
        smiles: input SMILES (non-str / empty returns ('', False))
        ignore_chiral: drop chirality in the canonical form
        ignore_cistrans: strip '/' and '\\' (cis/trans markers) first
        replace_rgroup: map [Rn] tokens to [n*] and unparseable bracket
            atoms to '*'

    Returns:
        (canonical_smiles, success) — on failure the (possibly rewritten)
        input is returned with success=False.
    """
    if type(smiles) is not str or smiles == '':
        return '', False
    if ignore_cistrans:
        smiles = smiles.replace('/', '').replace('\\', '')
    if replace_rgroup:
        tokens = atomwise_tokenizer(smiles)
        for j, token in enumerate(tokens):
            if token[0] == '[' and token[-1] == ']':
                symbol = token[1:-1]
                if symbol[0] == 'R' and symbol[1:].isdigit():
                    tokens[j] = f'[{symbol[1:]}*]'
                elif Chem.AtomFromSmiles(token) is None:
                    tokens[j] = '*'
        smiles = ''.join(tokens)
    try:
        canon_smiles = Chem.CanonSmiles(smiles, useChiral=(not ignore_chiral))
        success = True
    except Exception:  # BUGFIX: was a bare except (also caught SystemExit/KeyboardInterrupt)
        canon_smiles = smiles
        success = False
    return canon_smiles, success

def convert_smiles_to_canonsmiles(
        smiles_list, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True, num_workers=16):
    """Canonicalize a list of SMILES in parallel.

    Returns:
        (list of canonical SMILES, mean success rate).
    """
    with multiprocessing.Pool(num_workers) as p:
        results = p.starmap(canonicalize_smiles,
                            [(smiles, ignore_chiral, ignore_cistrans, replace_rgroup) for smiles in smiles_list],
                            chunksize=128)
    canon_smiles, success = zip(*results)
    return list(canon_smiles), np.mean(success)

def tanimoto_similarity(smiles1, smiles2):
    """Tanimoto similarity of two SMILES via RDKit path fingerprints;
    returns 0 when either SMILES cannot be parsed/fingerprinted."""
    try:
        mol1 = Chem.MolFromSmiles(smiles1)
        mol2 = Chem.MolFromSmiles(smiles2)
        fp1 = Chem.RDKFingerprint(mol1)
        fp2 = Chem.RDKFingerprint(mol2)
        tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
        return tanimoto
    except Exception:  # BUGFIX: was a bare except
        return 0


def compute_tanimoto_similarities(gold_smiles, pred_smiles, num_workers=16):
    """Pairwise Tanimoto similarities of two equal-length SMILES lists."""
    with multiprocessing.Pool(num_workers) as p:
        similarities = p.starmap(tanimoto_similarity, [(gs, ps) for gs, ps in zip(gold_smiles, pred_smiles)])
    return similarities

class SmilesEvaluator(object):
    """Evaluate predicted SMILES against gold SMILES.

    Precomputes cis/trans-stripped and chirality-stripped canonical forms
    of the gold set so that evaluate() only has to canonicalize the
    predictions.
    """

    def __init__(self, gold_smiles, num_workers=16, tanimoto=False):
        self.gold_smiles = gold_smiles
        self.num_workers = num_workers
        self.tanimoto = tanimoto
        self.gold_smiles_cistrans, _ = convert_smiles_to_canonsmiles(gold_smiles,
                                                                     ignore_cistrans=True,
                                                                     num_workers=num_workers)
        self.gold_smiles_chiral, _ = convert_smiles_to_canonsmiles(gold_smiles,
                                                                   ignore_chiral=True, ignore_cistrans=True,
                                                                   num_workers=num_workers)
        self.gold_smiles_cistrans = self._replace_empty(self.gold_smiles_cistrans)
        self.gold_smiles_chiral = self._replace_empty(self.gold_smiles_chiral)

    def _replace_empty(self, smiles_list):
        """Replace empty SMILES in the gold, otherwise it will be considered correct if both pred and gold is empty."""
        return [smiles if smiles is not None and type(smiles) is str and smiles != "" else ""
                for smiles in smiles_list]

    def evaluate(self, pred_smiles, include_details=False):
        """Score predictions against the gold set.

        Returns a dict with keys: 'tanimoto' (optional), 'canon_smiles'
        (exact match ignoring cis/trans), 'graph' (also ignoring
        chirality), and 'chiral' (accuracy restricted to gold molecules
        containing '@'; -1 when none exist).
        """
        results = {}
        if self.tanimoto:
            results['tanimoto'] = np.mean(compute_tanimoto_similarities(self.gold_smiles, pred_smiles))
        # Ignore double bond cis/trans
        pred_smiles_cistrans, _ = convert_smiles_to_canonsmiles(pred_smiles,
                                                                ignore_cistrans=True,
                                                                num_workers=self.num_workers)
        results['canon_smiles'] = np.mean(np.array(self.gold_smiles_cistrans) == np.array(pred_smiles_cistrans))
        if include_details:
            results['canon_smiles_details'] = (np.array(self.gold_smiles_cistrans) == np.array(pred_smiles_cistrans))
        # Ignore chirality (graph exact match)
        pred_smiles_chiral, _ = convert_smiles_to_canonsmiles(pred_smiles,
                                                              ignore_chiral=True, ignore_cistrans=True,
                                                              num_workers=self.num_workers)
        results['graph'] = np.mean(np.array(self.gold_smiles_chiral) == np.array(pred_smiles_chiral))
        # Evaluate on molecules with chiral centers
        chiral = np.array([[g, p] for g, p in zip(self.gold_smiles_cistrans, pred_smiles_cistrans) if '@' in g])
        results['chiral'] = np.mean(chiral[:, 0] == chiral[:, 1]) if len(chiral) > 0 else -1
        return results
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
    """Run one training epoch of the detector.

    Args:
        model: detection model; called as model(samples, targets).
        criterion: loss module; called on (outputs, targets), returns a dict of losses.
        data_loader: iterable yielding (samples, targets) batches.
        optimizer: optimizer stepped once per batch.
        device: device the batch is moved to.
        epoch: epoch number (used only in the log header).
        max_norm: gradient-clipping threshold; 0 disables clipping.
        **kwargs: optional 'print_freq' (default 10), 'ema' (model EMA with
            an .update(model) method), 'scaler' (torch.amp GradScaler; its
            presence enables the mixed-precision path).

    Returns:
        dict mapping each logged meter name to its global average.

    NOTE(review): MetricLogger, SmoothedValue and reduce_dict are imported
    from commented-out lines at the top of this file — confirm those
    imports are active in the deployed build, otherwise this raises
    NameError at runtime.
    """
    model.train()
    criterion.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    # metric_logger.add_meter('class_error', SmoothedValue(window_size=1, fmt='{value:.2f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = kwargs.get('print_freq', 10)

    ema = kwargs.get('ema', None)
    scaler = kwargs.get('scaler', None)

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        if scaler is not None:
            # Mixed-precision path: forward under autocast, loss in fp32.
            with torch.autocast(device_type=str(device), cache_enabled=True):
                outputs = model(samples, targets)

            with torch.autocast(device_type=str(device), enabled=False):
                loss_dict = criterion(outputs, targets)

            loss = sum(loss_dict.values())
            scaler.scale(loss).backward()

            if max_norm > 0:
                # Gradients must be unscaled before clipping by norm.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        else:
            # Full-precision path.
            outputs = model(samples, targets)
            loss_dict = criterion(outputs, targets)

            loss = sum(loss_dict.values())
            optimizer.zero_grad()
            loss.backward()

            if max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            optimizer.step()

        # ema
        if ema is not None:
            ema.update(model)

        # Reduce losses across distributed workers for logging only.
        loss_dict_reduced = reduce_dict(loss_dict)
        loss_value = sum(loss_dict_reduced.values())

        if not math.isfinite(loss_value):
            # A NaN/inf loss is unrecoverable — abort the run.
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        metric_logger.update(loss=loss_value, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}



# @torch.no_grad()
# def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, data_loader, base_ds, device, output_dir,
#             annot_file=f'/home/jovyan/rt-detr/data/real_processed/CLEF_with_charge/annotations/val.json',
#             outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_CLEF.csv',
#             ):
#     model.eval()
#     criterion.eval()

#     metric_logger = MetricLogger(delimiter="  ")
#     header = 'Test:'

#     iou_types = postprocessors.iou_types
#     coco_evaluator = CocoEvaluator(base_ds, iou_types)
+ +# panoptic_evaluator = None + +# # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# # home='/home/jovyan/rt-detr' +# # dataset = 'CLEF' +# # annot_file=f'/home/jovyan/rt-detr/data/real_processed/{dataset}_with_charge/annotations/test.json' +# # outcsv_filename/home/jovyan/rt-detr/rt-detr/output/output_charge_{dataset}.csv' + + +# # annot_file=f'/home/jovyan/rt-detr/data/real_processed/{dataset}_with_charge/annotations/test.json' +# # outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_{dataset}.csv' +# with open(annot_file, 'r') as file: +# data = json.load(file) + + + + +# image_id_to_name = {} + +# for image_data in data['images']: +# image_id = image_data['id'] +# image_path = image_data['file_name'] +# image_name = os.path.basename(image_path) +# image_id_to_name[image_id] = image_name + +# res_smiles = [] +# bond_labels = [13,14,15,16,17,18] +# idx_to_labels={0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B', +# 9:'I',10:'P',11:'H',12:'Si', +# #bond +# 13:'single',14:'wdge',15:'dash', +# 16:'=',17:'#',18:':',#aromatic +# #charge +# 19:'-4',20:'-2', +# 21:'-1',#- +# 22:'+1',#+ +# 23:'2', +# } +# lab2idx={v:k for k,v in idx_to_labels.items()} +# #indigo bond type stero maping +# indi_bond={ +# "1":'single', "2":'=',"3":'#',"4":':',"5":'wdge',"6":'dash', +# } + + +# smiles_data = pd.DataFrame({'file_name': [], +# 'SMILES':[]}) + +# output_dict = {} +# target_dict = {} +# filtered_output_dict = {} +# # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# for samples, targets in metric_logger.log_every(data_loader, 10, header): +# samples = samples.to(device) +# targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + +# outputs = model(samples) + +# orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) +# results = postprocessors(outputs, orig_target_sizes)#RTDETRPostProcessor@@src/zoo/rtertr +# 
#results: a list of dict label box score +# res = {target['image_id'].item(): output for target, output in zip(targets, results)} + +# for target, output in zip(targets, results): +# output_dict[target['image_id'].item()] = output + +# stats = {} +# # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} +# if coco_evaluator is not None: +# if 'bbox' in iou_types: +# # stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() +# stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats +# if 'segm' in iou_types: +# stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + + + +# # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# # ocr_recognition_only = get_ocr_recognition_only(force_cpu=False) +# # caption_remover = CaptionRemover(force_cpu=True) +# for key, value in output_dict.items():#TODO improve here +# selected_indices = value['scores'] > 0.5#may be >=0.5 cut off, as used the sigmoid? 
def remove_bond_directions_if_no_chiral(mol):
    """Clear wedge/dash direction marks on single bonds when the molecule
    has no chiral centers (assigned or unassigned).

    Args:
        mol: RDKit Mol or None.

    Returns:
        The same Mol (mutated in place), or None for None input.
    """
    if mol is None:
        return None
    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    if not chiral_centers:
        for bond in mol.GetBonds():
            # Only single bonds can carry wedge/dash marks.
            if bond.GetBondType() == Chem.BondType.SINGLE:
                bond.SetBondDir(Chem.BondDir.NONE)
    return mol
#######################################################################################
def molExpanding(mol_rebuit, placeholder_atoms, wdbs, bond_dirs, alignmol=False):
    """Expand abbreviation placeholders in a rebuilt molecule and
    optionally re-apply wedge/dash bond directions via MCS alignment.

    Args:
        mol_rebuit: molecule with placeholder atoms for abbreviations.
        placeholder_atoms: placeholder info consumed by expandABB.
        wdbs: wedge/dash bond records; b[0]/b[1] are the original atom
            indices, b[3] keys into bond_dirs — TODO confirm record layout.
        bond_dirs: mapping from wdbs key to Chem.BondDir.
        alignmol: when True, align expanded mol to the original via MCS and
            transfer bond directions through the atom mapping.

    Returns:
        (expanded_smiles, mol) where mol has stereochemistry reassigned.
    """
    cm = copy.deepcopy(mol_rebuit)
    expand_mol, expand_smiles = expandABB(cm, ABBREVIATIONS, placeholder_atoms)
    rdm = copy.deepcopy(expand_mol)
    AllChem.Compute2DCoords(rdm)
    target_mol, ref_mol = rdm, cm

    if alignmol:
        mcs = rdFMCS.FindMCS([target_mol, ref_mol],  # larger, smaller order
                             atomCompare=rdFMCS.AtomCompare.CompareAny,
                             # bondCompare=rdFMCS.BondCompare.CompareAny,
                             ringCompare=rdFMCS.RingCompare.IgnoreRingFusion,
                             matchChiralTag=False,
                             )
        atommaping_pairs = g_atompair_matches([target_mol, ref_mol], mcs)
        atomMap = atommaping_pairs[0]
        try:
            rmsd2 = rdkit.Chem.rdMolAlign.AlignMol(prbMol=target_mol, refMol=ref_mol, atomMap=atomMap, maxIters=2000000)
        except Exception as e:
            print(atomMap, "@@@@")
            print(e)
        # After alignment, translate original atom ids to expanded-mol ids.
        c2p = {cur: pre for cur, pre in atomMap}
        p2c = {pre: cur for cur, pre in atomMap}
        for b in wdbs:  # re-apply bond directions
            p0, p1 = int(b[0]), int(b[1])  # may be absent from atomMap (MCS substructure only)
            if p0 in p2c.keys() and p1 in p2c.keys():
                c0, c1 = p2c[p0], p2c[p1]
                b_ = target_mol.GetBondBetweenAtoms(c0, c1)
                if b_:
                    b_.SetBondDir(bond_dirs[b[3]])
        # Stereo only enters the SMILES after the assignment steps below.
        expandStero_smi = Chem.MolToSmiles(target_mol)
    else:
        expandStero_smi = expand_smiles

    m = target_mol.GetMol()
    # Chem.SanitizeMol(m)
    Chem.DetectBondStereochemistry(m)
    Chem.AssignChiralTypesFromBondDirs(m)
    Chem.AssignStereochemistry(m)

    return expandStero_smi, m


def remove_backslash_and_slash(input_string):
    """Strip every '\\' and '/' (cis/trans bond markers) from a SMILES."""
    if "\\" in input_string:
        input_string = input_string.replace("\\", "")
    if "/" in input_string:
        input_string = input_string.replace("/", "")
    return input_string


def remove_number_before_star(input_string):
    """Delete digit runs immediately preceding a non-terminal '*', except
    when the character before the digits is a letter (ring-closure digits
    like 'c1*' are preserved).

    Example: '12*C' -> '*C'; 'c1*C' unchanged; a trailing '*' is ignored.

    BUGFIX: the original used a bare ``continue`` inside the alpha-guard
    without advancing ``i``, which made inputs such as 'c1*C' loop forever.
    """
    result = list(input_string)

    i = 0
    while i < len(result):
        if result[i] == '*' and i != len(result) - 1:
            # *c1c(*)c(*)c(C(*)(*)C(C)C)c(*)c1* --> unchanged
            j = i - 1
            if result[j - 1].isalpha():
                i += 1  # advance before skipping, otherwise this loops forever
                continue
            while j >= 0 and result[j].isdigit():
                result[j] = ''
                j -= 1
        i += 1

    return ''.join(result)


def remove_SP(input_string):
    """Remove square-planar stereo tags (@SP1-3) from a SMILES and, when a
    trigonal-bipyramidal tag (@TB) is present, strip the whole stereo
    suffix inside bracket atoms."""
    pattern = r'\[([^@]*)@?[A-Z0-9]*\]'
    input_string = re.sub(r'@SP[1-3]', '', input_string)
    if '@TB' in input_string:
        # Drop everything after '@' inside each bracket atom.
        result = re.sub(pattern, r'[\1]', input_string)
        input_string = result
    return input_string
def rdkit_canonicalize_smiles(smiles):
    """Canonicalize a SMILES after mapping R-groups and unparseable
    bracket atoms (e.g. '[R1]', element+digit abbreviations) to '*'.

    Returns:
        (canonical_smiles, success) — success is False when RDKit still
        cannot canonicalize after the replacements.
    """
    Aad_string = r'([A-Z][a-z]*)([0-9]+)'
    tokens = atomwise_tokenizer(smiles)
    for j, token in enumerate(tokens):
        if token[0] == '[' and token[-1] == ']':
            symbol = token[1:-1]
            # re.match (not findall) so valid hydrides like [BH2] are kept.
            matches = re.match(Aad_string, symbol)
            if matches:
                letters, numbers = matches.groups()
                print(f"{letters} {numbers}")
                tokens[j] = '*'
            elif symbol in RGROUP_SYMBOLS:
                tokens[j] = '*'
            elif Chem.AtomFromSmiles(token) is None:
                tokens[j] = '*'

    smiles = ''.join(tokens)
    try:
        canon_smiles = Chem.CanonSmiles(smiles, useChiral=False)
        success = True
    except Exception:  # BUGFIX: was a bare except
        canon_smiles = smiles
        success = False
    return canon_smiles, success

def NoRadical_Smi(smi):
    """Quench radical electrons by converting them to explicit hydrogens
    and return the resulting SMILES."""
    aa = Chem.MolFromSmiles(smi)
    for atom in aa.GetAtoms():
        if atom.GetNumRadicalElectrons() > 0:  # atom carries a radical
            atom.SetNumRadicalElectrons(0)
            # Top up hydrogens to restore the atom's valence.
            atom.SetNumExplicitHs(atom.GetTotalValence() - atom.GetExplicitValence())
    san_before = Chem.MolToSmiles(aa)
    return san_before

import logging

def check_and_fix_valence(smiles_or_list):
    """Check atom valences in a SMILES string or a list [smiles, suffix/prefix].
    Fix unusual valences (e.g., N(2)) by adding/removing hydrogens to maintain neutrality.

    Returns:
        (corrected_smiles_or_list, warnings)
    """
    # NOTE(review): basicConfig per call configures the root logger as a
    # side effect — confirm this is intended.
    logging.basicConfig(level=logging.WARNING)
    warnings = []

    # Standard valence dictionary for common atoms
    standard_valences = {
        'C': [4],
        'N': [3],  # Prioritize valence 3 for neutral nitrogen (e.g., amines, amides)
        'O': [2],
        'H': [1],
        'F': [1]
    }

    # Handle input: SMILES string or list from C_H_expand
    if isinstance(smiles_or_list, list):
        smiles, other_part = smiles_or_list
    else:
        smiles, other_part = smiles_or_list, None

    # Process main SMILES
    mol = Chem.MolFromSmiles(smiles, sanitize=False) if smiles else None
    if mol is None:
        warnings.append(f"Invalid SMILES: {smiles}")
        return smiles_or_list, warnings

    # Process other_part if it exists and is a valid SMILES
    other_part_mol = None
    if other_part:
        try:
            other_part_mol = Chem.MolFromSmiles(other_part, sanitize=False)
        except Exception:  # BUGFIX: was a bare except; other_part may just be a suffix/prefix
            pass

    # Helper: check and fix valence for one molecule.
    def process_molecule(mol, is_other_part=False):
        nonlocal warnings
        corrected = False
        prefix = "other_part" if is_other_part else "SMILES"

        # Compute valence explicitly to avoid precondition violation
        mol.UpdatePropertyCache(strict=False)

        # Flag every atom whose valence deviates from the standard table.
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            valence = atom.GetTotalValence()
            expected_valences = standard_valences.get(symbol, [valence])
            if valence not in expected_valences:
                warnings.append(f"Unusual valence in {prefix} for {symbol}: {valence} (expected {expected_valences})")

        # Fix nitrogen valence issues by adjusting hydrogens
        if any('N' in w for w in warnings if prefix in w):
            rw_mol = Chem.RWMol(mol)  # editable copy
            for atom in rw_mol.GetAtoms():
                if atom.GetSymbol() != 'N':
                    continue
                valence = atom.GetTotalValence()
                if valence < 3:
                    # Add hydrogens to reach valence 3
                    hydrogens_needed = 3 - valence
                    atom.SetNumExplicitHs(atom.GetNumExplicitHs() + hydrogens_needed)
                    corrected = True
                elif valence > 3:
                    # Remove hydrogens if possible
                    hydrogens_to_remove = valence - 3
                    current_hydrogens = atom.GetNumExplicitHs()
                    if current_hydrogens >= hydrogens_to_remove:
                        atom.SetNumExplicitHs(current_hydrogens - hydrogens_to_remove)
                        corrected = True
                    else:
                        warnings.append(f"Cannot reduce N valence in {prefix} to 3 without removing non-H bonds")
            if corrected:
                mol = rw_mol.GetMol()

        # Sanitize molecule after corrections
        if corrected:
            try:
                Chem.SanitizeMol(mol, catchErrors=True)
                return mol, True
            except Exception as e:
                warnings.append(f"Failed to sanitize {prefix} after correction: {str(e)}")
                return mol, False
        return mol, False

    # Process main molecule
    mol, mol_corrected = process_molecule(mol)

    # Convert main molecule back to SMILES
    corrected_smiles = Chem.MolToSmiles(mol) if mol_corrected else smiles

    # Process other_part if it's a valid molecule
    corrected_other_part = other_part
    if other_part_mol:
        other_part_mol, other_corrected = process_molecule(other_part_mol, is_other_part=True)
        corrected_other_part = Chem.MolToSmiles(other_part_mol) if other_corrected else other_part

    # Return based on input type
    if other_part:
        return [corrected_smiles, corrected_other_part], warnings
    return corrected_smiles, warnings

def molfpsim(original_smiles, test_smiles):
    """Fingerprint similarity between two SMILES after heavy normalization
    (longest component only, cis/trans and SP stereo stripped, R-groups
    mapped to '*').

    Returns:
        (morgan_dice, rdkit_tanimoto) similarity pair.
    """
    # Desalt: compare only the longest component of each SMILES.
    test_smiles = select_longest_smiles(test_smiles)
    original_smiles = select_longest_smiles(original_smiles)
    test_smiles, warnings = check_and_fix_valence(test_smiles)

    original_smiles = remove_backslash_and_slash(original_smiles)  # cis/trans
    test_smiles = remove_backslash_and_slash(test_smiles)
    original_smiles = re.sub(r'\[(\d+)\*', '[*', original_smiles)  # [1*]-->[*]
    test_smiles = re.sub(r'\[(\d+)\*', '[*', test_smiles)
    # Drop exotic coordinate-derived stereo tags (rarely meaningful here).
    original_smiles = remove_SP(original_smiles)
    test_smiles = remove_SP(test_smiles)

    rd_smi_ori, success1 = rdkit_canonicalize_smiles(original_smiles)  # R-->*
    if "S" in rd_smi_ori and success1:  # NOTE: replace radical electrons with H
        rd_smi_ori = NoRadical_Smi(rd_smi_ori)
    rd_smi, success2 = rdkit_canonicalize_smiles(test_smiles)
    original_smiles, test_smiles = rd_smi_ori, rd_smi

    mol1 = Chem.MolFromSmiles(original_smiles)  # TODO: handle SMILES RDKit cannot parse in real data
    mol2 = Chem.MolFromSmiles(test_smiles)  # TODO: handle SMILES RDKit cannot parse in real data

    morganfps1 = AllChem.GetMorganFingerprint(mol1, useChirality=False)
    morganfps2 = AllChem.GetMorganFingerprint(mol2, useChirality=False)
    morgan_tani = DataStructs.DiceSimilarity(morganfps1, morganfps2)
    fp1 = Chem.RDKFingerprint(mol1)
    fp2 = Chem.RDKFingerprint(mol2)
    tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
    return morgan_tani, tanimoto
remove_SP(original_smiles)#additional complex space stero from coordinates, most not used + test_smiles = remove_SP(test_smiles) + + rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)#R-->* + if "S" in rd_smi_ori and success1:#NOTE H replace radical electron + rd_smi_ori=NoRadical_Smi(rd_smi_ori) + rd_smi, success2=rdkit_canonicalize_smiles(test_smiles) + original_smiles,test_smiles=rd_smi_ori,rd_smi + + mol1 = Chem.MolFromSmiles(original_smiles)#TODO considering smiles with rdkit not recongized in real data + mol2 = Chem.MolFromSmiles(test_smiles)#TODO considering smiles with rdkit not recongized in real data + + morganfps1 = AllChem.GetMorganFingerprint(mol1, useChirality=False) + morganfps2 = AllChem.GetMorganFingerprint(mol2, useChirality=False) + morgan_tani = DataStructs.DiceSimilarity(morganfps1, morganfps2) + fp1 = Chem.RDKFingerprint(mol1) + fp2 = Chem.RDKFingerprint(mol2) + tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2) + return morgan_tani, tanimoto + + + + +def comparing_smiles2(original_smiles,test_smiles):#I2M use the coordinates, so 2D coformation should be always + original_smiles = remove_backslash_and_slash(original_smiles)#c/s + test_smiles = remove_backslash_and_slash(test_smiles) + original_smiles = re.sub(r'\[(\d+)\*', '[*',original_smiles)#[1*]-->[*] + test_smiles = re.sub(r'\[(\d+)\*', '[*',test_smiles) + original_smiles = remove_SP(original_smiles)#additional complex space stero from coordinates, most not used + test_smiles = remove_SP(test_smiles) + + rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)#R-->* + if "S" in rd_smi_ori and success1:#NOTE H replace radical electron + rd_smi_ori=NoRadical_Smi(rd_smi_ori) + + rd_smi, success2=rdkit_canonicalize_smiles(test_smiles) + original_smiles,test_smiles=rd_smi_ori,rd_smi + + try: + original_mol = Chem.MolFromSmiles(original_smiles)#considering whe nmmet abbrev + test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)#as build mol may not sanitized for 
def smiles12_comparing(original_smiles,test_smiles):
    """Equality test between two SMILES strings.

    Same normalisation pipeline as comparing_smiles2 but without the
    early aromatic-SMILES fast path and without radical replacement:
    canonical SMILES, kekulised SMILES and (when no '*' wildcard is
    present) InChI are compared in turn. RDKit failures fall through to
    mismatching sentinel values so the function returns False instead of
    raising.
    """
    original_smiles = remove_backslash_and_slash(original_smiles)  # drop cis/trans '/' '\'
    test_smiles = remove_backslash_and_slash(test_smiles)
    original_smiles = re.sub(r'\[(\d+)\*', '[*',original_smiles)  # [1*] --> [*]
    test_smiles = re.sub(r'\[(\d+)\*', '[*',test_smiles)
    original_smiles = remove_SP(original_smiles)  # extra stereo tags from coordinates, mostly unused
    test_smiles = remove_SP(test_smiles)

    rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)
    rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
    original_smiles,test_smiles=rd_smi_ori,rd_smi
    try:
        original_mol = Chem.MolFromSmiles(original_smiles)  # may be None (e.g. unresolved abbreviations)
        test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)  # built mol may not be sanitizable by RDKit
        if original_mol:
            Chem.SanitizeMol(original_mol)
            keku_smi_ori=Chem.MolToSmiles(original_mol,kekuleSmiles=True)
        else:
            keku_smi_ori=original_smiles

        if test_mol:
            Chem.SanitizeMol(test_mol)
            keku_smi=Chem.MolToSmiles(test_mol,kekuleSmiles=True)
        else:
            keku_smi=test_smiles

        if '*' not in keku_smi:
            keku_inch_ori= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi_ori))
            keku_inch_test= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi))
        else:
            # InChI cannot represent wildcard atoms: use unequal sentinels so
            # the InChI branch of the final comparison never matches.
            keku_inch_ori= 1
            keku_inch_test= 2

        rd_smi=Chem.MolToSmiles(test_mol)  # TODO: accuracy could be improved here
        rd_smi_ori=Chem.MolToSmiles(original_mol)
    except Exception as e:  # TODO: narrow this — kekulize/sanitize failures are the expected cases
        print(f"comparing_smiles@@@ kekulize or SanitizeMol problems")
        print(e,"!!!!!!!\n")
        # Unequal sentinels: every fallback comparison below will fail.
        keku_inch_ori= 1
        keku_inch_test= 2
        keku_smi=1
        keku_smi_ori=2
    if not success1:  # original SMILES still invalid even after '*' replacement
        rd_smi_ori = rd_smi
    if rd_smi_ori == rd_smi or keku_smi_ori == keku_smi or keku_inch_ori==keku_inch_test :  # original SMILES may be written in kekulized style
        return True
    else:return False
def comparing_smiles(new_row,test_smiles):  # I2M uses coordinates, so a 2D conformation should always exist
    """Row-based variant of smiles12_comparing.

    Reads the reference SMILES from ``new_row['SMILESori']`` and compares it
    to *test_smiles* via canonical SMILES, kekulised SMILES and (when no '*'
    wildcard is present) InChI. On RDKit failure the offending *new_row* is
    printed for debugging and mismatching sentinels force a False result.
    """
    original_smiles=new_row['SMILESori']
    original_smiles = remove_backslash_and_slash(original_smiles)  # drop cis/trans '/' '\'
    test_smiles = remove_backslash_and_slash(test_smiles)
    original_smiles = re.sub(r'\[(\d+)\*', '[*',original_smiles)  # [1*] --> [*]
    test_smiles = re.sub(r'\[(\d+)\*', '[*',test_smiles)
    original_smiles = remove_SP(original_smiles)  # extra stereo tags from coordinates, mostly unused
    test_smiles = remove_SP(test_smiles)

    rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)
    rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
    original_smiles,test_smiles=rd_smi_ori,rd_smi
    try:
        original_mol = Chem.MolFromSmiles(original_smiles)  # may be None (e.g. unresolved abbreviations)
        test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)  # built mol may not be sanitizable by RDKit
        if original_mol:
            Chem.SanitizeMol(original_mol)
            keku_smi_ori=Chem.MolToSmiles(original_mol,kekuleSmiles=True)
        else:
            keku_smi_ori=original_smiles

        if test_mol:
            Chem.SanitizeMol(test_mol)
            keku_smi=Chem.MolToSmiles(test_mol,kekuleSmiles=True)
        else:
            keku_smi=test_smiles

        if '*' not in keku_smi:
            keku_inch_ori= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi_ori))
            keku_inch_test= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi))
        else:
            # InChI cannot represent wildcard atoms: use unequal sentinels so
            # the InChI branch of the final comparison never matches.
            keku_inch_ori= 1
            keku_inch_test= 2

        rd_smi=Chem.MolToSmiles(test_mol)  # TODO: accuracy could be improved here
        rd_smi_ori=Chem.MolToSmiles(original_mol)
    except Exception as e:  # TODO: narrow this — kekulize/sanitize failures are the expected cases
        print(f"comparing_smiles@@@ kekulize or SanitizeMol problems")
        print(new_row)
        print(e,"!!!!!!!\n")
        # Unequal sentinels: every fallback comparison below will fail.
        keku_inch_ori= 1
        keku_inch_test= 2
        keku_smi=1
        keku_smi_ori=2
    if not success1:  # original SMILES still invalid even after '*' replacement
        rd_smi_ori = rd_smi
    if rd_smi_ori == rd_smi or keku_smi_ori == keku_smi or keku_inch_ori==keku_inch_test :  # original SMILES may be written in kekulized style
        return True
    else:return False
def bbox2center(bbox):
    """Return the (x, y) centers of an array of boxes.

    Args:
        bbox: array of shape (N, 4) holding [x1, y1, x2, y2] rows.

    Returns:
        np.ndarray of shape (N, 2) with one (cx, cy) row per box.
    """
    mid_x = (bbox[:, 0] + bbox[:, 2]) / 2
    mid_y = (bbox[:, 1] + bbox[:, 3]) / 2
    return np.stack((mid_x, mid_y), axis=1)
import cv2
# Bond classes that carry a drawing direction (stereo wedge/dash, ring closures).
BONDDIRECT=['ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']


def reorder_bond_bbox(bond_bbox, single_atom_bond):
    """Reorder *bond_bbox* so bonds listed in *single_atom_bond* come last.

    Args:
        bond_bbox: sequence of bond boxes.
        single_atom_bond: dict whose keys are indices of bonds that overlap
            only one atom box; these are moved to the end.

    Returns:
        list: bond boxes with normal bonds first, special bonds appended.
    """
    # Split indices into normal ones and ones that must be pushed to the back.
    normal_indices = []
    special_indices = []
    # Keys that must be moved to the end.
    keys_to_move = set(single_atom_bond.keys())
    # Classify every index.
    for i in range(len(bond_bbox)):
        if i in keys_to_move:
            special_indices.append(i)
        else:
            normal_indices.append(i)
    # New order: normal indices first, special indices last.
    new_order = normal_indices + special_indices
    # Re-arrange bond_bbox accordingly.
    reordered_bbox = [bond_bbox[i] for i in new_order]
    return reordered_bbox

def boxes_overlap(box1, box2):
    """
    Check whether two bounding boxes overlap.
    box1, box2: [x1, y1, x2, y2]
    """
    return not (box1[2] < box2[0] or box1[0] > box2[2] or
                box1[3] < box2[1] or box1[1] > box2[3])
def calculate_center(box):
    """
    Return the center point of a bounding box as a NumPy array [cx, cy].
    """
    return np.array([(box[0] + box[2]) / 2, (box[1] + box[3]) / 2])
def merge_boxes(box1, box2):
    """
    Merge two bounding boxes and return the enclosing box [x1, y1, x2, y2].
    """
    return [
        min(box1[0], box2[0]),
        min(box1[1], box2[1]),
        max(box1[2], box2[2]),
        max(box1[3], box2[3])
    ]


def get_merged_box(boxes):
    """Calculate the smallest box encompassing all given boxes."""
    x_mins = [box[0] for box in boxes]
    y_mins = [box[1] for box in boxes]
    x_maxs = [box[2] for box in boxes]
    y_maxs = [box[3] for box in boxes]
    return [min(x_mins), min(y_mins), max(x_maxs), max(y_maxs)]

def box_area(box):
    """Calculate the area of a box."""
    return (box[2] - box[0]) * (box[3] - box[1])

def Newbox_(atom_bbox,bond_bbox, lab2idx):
    """Synthesise H-atom boxes for bonds that overlap only one atom box.

    Args:
        atom_bbox: sequence of atom boxes [x1, y1, x2, y2].
        bond_bbox: sequence of bond boxes.
        lab2idx: label-name -> class-index mapping (must contain 'H').

    Returns:
        tuple: (new_atoms, single_odd_b2a) where new_atoms is a list of
        detection-style dicts (bbox / bbox_centers / scores / pred_classes)
        for the synthetic H atoms, and single_odd_b2a maps a bond index to
        its single overlapping atom index list.
    """
    # Add an H atom box when a direction bond touches only one atom.
    new_atoms=[]
    b_len=3  # half-width of the synthetic H atom box
    single_odd_b2a=dict()
    for bi,bb in enumerate(bond_bbox):
        overlapped_atoms = []
        overlapped_abox=[]
        for ai,aa in enumerate(atom_bbox):
            overlap_flag=boxes_overlap(bb, aa)  # TODO: use the atom/bond box overlap for bond-atom mapping, then build the mol
            if overlap_flag:
                overlapped_atoms.append(ai)
                overlapped_abox.append(aa)
        if len(overlapped_atoms) == 1:
            single_odd_b2a[bi]=overlapped_atoms
            # Compute the non-overlapping part of the bond box to place hydrogen.
            # NOTE(review): boxes_overlap2 is defined elsewhere in this file and
            # presumably returns that free end as an (x, y) point — confirm.
            non_overlapping_x,non_overlapping_y=boxes_overlap2(overlapped_abox[0], bb)
            new_atom_out={'bbox': np.array([non_overlapping_x - b_len,
                                            non_overlapping_y - b_len,
                                            non_overlapping_x + b_len,
                                            non_overlapping_y + b_len]).reshape(-1,4),
                          'bbox_centers': np.array([non_overlapping_x,non_overlapping_y]).reshape(-1,2),
                          'scores': np.array([1.0]),
                          'pred_classes': np.array([lab2idx['H']])}
            new_atoms.append(new_atom_out)
    return new_atoms, single_odd_b2a
def has_boxes(data):
    """Return True when *data* looks like detection-style OCR output:
    a non-empty list of [box, text_info] pairs whose box is a 4-point list.

    Used to decide whether PaddleOCR ran with its detector enabled.
    """
    if not isinstance(data, list) or not data:
        return False
    return all(
        isinstance(entry, list)
        and len(entry) == 2
        and isinstance(entry[0], list)
        and len(entry[0]) == 4
        for entry in data
    )

def AtomBox2bondBox(atom_box,bond_bbox):
    """Find the bond boxes touching *atom_box*.

    Returns:
        tuple(bool, list[int]): (whether any bond overlaps, indices of the
        overlapping bond boxes).
    """
    # TODO: use the atom/bond box overlap for bond-atom mapping, then build the mol
    neighbours = [bi for bi, bb in enumerate(bond_bbox) if boxes_overlap(bb, atom_box)]
    return bool(neighbours), neighbours


import torchvision.transforms.v2 as T

def image_to_tensor(image_path,debug=True):
    """Load *image_path*, force RGB, resize to 640x640 and return
    (float32 CHW tensor, original width, original height)."""
    pil_image = Image.open(image_path)
    width, height = pil_image.size

    # Coerce grayscale (or any other mode) to RGB before the transform.
    if pil_image.mode != "RGB":
        if pil_image.mode == "L":
            if debug: print("检测到灰度图像 (1 通道),转换为 RGB...")
        else:
            if debug: print(f"检测到 {pil_image.mode} 模式,转换为 RGB...")
        pil_image = pil_image.convert("RGB")

    # Resize, convert to a tensor, and force float32 dtype.
    pipeline = T.Compose([
        T.Resize((640, 640)),
        T.ToTensor(),
        lambda x: x.to(torch.float32),  # explicit dtype conversion
    ])
    return pipeline(pil_image), width, height
+ +# from src.zoo.rtdetr.rtdetr_postprocessor import RTDETRPostProcessor + +@torch.no_grad() +def evaluate_x(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, + data_loader, device, + outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_CLEF.csv', + visual_check=False, + other2ppsocr=True, + getacc=False, + ): + + postprocessor2=RTDETRPostProcessor(num_classes=30, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False) + output_directory = os.path.dirname(outcsv_filename) + prefix_f = os.path.basename(outcsv_filename).split('.')[0] + if other2ppsocr: + ocr = PaddleOCR( + use_angle_cls=True, + lang='latin',use_space_char=True,use_debug=False, + use_gpu=True if cv2.cuda.getCudaEnabledDeviceCount() > 0 else False) + + ocr2 = ocr2 = PaddleOCR(use_angle_cls=True,use_gpu =False,use_debug=False, + rec_algorithm='SVTR_LCNet', rec_model_dir='/nfs_home/bowen/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', + lang="en") + outcsv_filename=f"{output_directory}/{prefix_f}_withOCR.csv" + + + if visual_check: + output_directory = os.path.dirname(outcsv_filename) + prefix_f = os.path.basename(outcsv_filename).split('.')[0] + ima_checkdir=f"{output_directory}/{prefix_f}_Boxed" + os.makedirs(ima_checkdir, exist_ok=True) + + if getacc: + acc_summary=f"{outcsv_filename}.I2Msummary.txt" + flogout = open(f'{acc_summary}' , 'w') + failed=[] + mydiff=[] + simRD=0 + sim=0 + mysum=0 + + model.eval() + criterion.eval() + metric_logger = MetricLogger(delimiter=" ") + header = 'Infering:' + res_smiles = [] + idx_to_labels23={0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B', + 9:'I',10:'P',11:'*',12:'Si',13:'NONE',14:'BEGINWEDGE',15:'BEGINDASH', + 16:'=',17:'#',18:'-4',19:'-2',20:'-1',21:'1',22:'2',} + idx_to_labels30 = {0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B', + 9:'I',10:'P',11:'H',12:'Si',13:'NONE',14:'BEGINWEDGE',15:'BEGINDASH', + 16:'=',17:'#',18:'-4',19:'-2',20:'-1',21:'1',22:'2', + 23:'CF3',#NOTE rdkit get element 
not supporting group + 24:'CN', + 25:'Me', + 26:'CO2Et', + 27:'R', + 28:'Ph', + 29:'*', + } + bond_labels = [13,14,15,16,17] + + if postprocessors.num_classes==23: + # print(data["categories"]) + print(f'usage idx_to_labels23',idx_to_labels23) + idx_to_labels = idx_to_labels23 + elif postprocessors.num_classes==30: + # print(data["categories"])#NOTE 11 is H not * now + print(f'usage idx_to_labels30',idx_to_labels30) + idx_to_labels = idx_to_labels30 + else: + print(f"error unkown ways@@@@@@@@@@@!!!!!!!!!!idx_to_labels::{len(idx_to_labels)}\n{idx_to_labels}") + abrevie={"[23*]":'CF3', + "[24*]":'CN', + "[25*]":'Me', + "[26*]":'CO2Et', + "[27*]":'R', + "[28*]":'Ph', + "[29*]":'3~7UP', + } + # idx_to_labels = idx_to_labels23 + lab2idx={ v:k for k,v in idx_to_labels.items() } + + smiles_data = pd.DataFrame({'file_name': [], + 'SMILESori':[], + 'SMILESpre':[], + 'SMILESexp':[], + } + ) + output_dict = {} + output_ori={} + filtered_output_dict = {} + box_thresh=0.1 + # for samples, targets in metric_logger.log_every(data_loader, 10, header): + # samples = samples.to(device) + # # targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + # outputs = model(samples) + # # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)#.to(device) + # orig_target_sizes = targets["orig_size"].to(device) + # results = postprocessors(outputs, orig_target_sizes)#RTDETRPostProcessor@@src/zoo/rtertr + # for i_, z in enumerate(zip(targets['image_id'], results)): + # ti, output=z + # output_dict[ti.item()] = [ + # output, + # targets['img_path'][i_], + # targets['SMILES'][i_], + # ] + + # output_ori[ti.item()] =[ + # targets['img_path'][i_], + # targets['SMILES'][i_], + # ] + # print(len(output_ori),len(output_dict)) + for samples, targets in metric_logger.log_every(data_loader, 10, header): + # orig_target_sizes = targets["orig_size"].to(device) + for i_, ti in enumerate(targets['image_id']): + output_dict[ti.item()] = [ + targets['img_path'][i_], + 
targets['SMILES'][i_], + ] + + + for key, value in output_dict.items(): + + image_path = value[0] + SMILESori = value[1] + + # selected_indices = value['scores'] > 0.5#may be >=0.5 cut off, as used the sigmoid? + # selected_indices = value[0]['scores'] > box_thresh + # true_count = selected_indices.sum().item() + #testing here + image_path='/cadd_data/samba_share/from_docker/data/work_space/ori/real/acs/ol020229e-Scheme-c3-10.png' + + tensor,w,h = image_to_tensor(image_path) + tensor=tensor.unsqueeze(0).to(device) + print(tensor.size()) # Output tensor shape (C x H x W) + ori_size=torch.Tensor([w,h]).long().unsqueeze(0).to(device) + outputs = model(tensor) + result_ = postprocessor2(outputs, ori_size) + # result_ = postprocessors(outputs, ori_size) + score_=result_[0]['scores'] + boxe_=result_[0]['boxes'] + label_=result_[0]['labels'] + #---------------------------------################################ + selected_indices =score_ > box_thresh + true_count = selected_indices.sum().item() + output={ + 'labels': label_[selected_indices].to("cpu").numpy(), + 'boxes': boxe_[selected_indices].to("cpu").numpy(), + 'scores': score_[selected_indices].to("cpu").numpy() + } + + img_ori = Image.open(image_path).convert('RGB') + w_ori, h_ori = img_ori.size # 获取原始图像的尺寸 + print(w_ori, h_ori, "orignianl vs 1000,1000") + + print(f"selected_indices 中 True 的数量: {true_count}") + print(f"before nms_per_class, :: box 的数量:{len(output['labels'])}") + output = nms_per_class(output['labels'], output['boxes'], output['scores'], iou_thresh=0.5) + print(f"after nms_per_class, :: box 的数量:{len(output['labels'])}") + + + # filtered_output_dict={image_path: output} + x_center = (output["boxes"][:, 0] + output["boxes"][:, 2]) / 2 + y_center = (output["boxes"][:, 1] + output["boxes"][:, 3]) / 2 + # center_coords = torch.stack((x_center, y_center), dim=1) + center_coords = np.stack((x_center, y_center), axis=1) + # center_coords=np.stack((x_center, y_center)).reshape(-1,2)#NOTE not do this, mix 
element order shits + #TODO split atom_charge \ bond drawing + output = {'bbox': output["boxes"],#.to("cpu").numpy(), + 'bbox_centers': center_coords,#.to("cpu").numpy(), + 'scores': output["scores"],#.to("cpu").numpy(), + 'pred_classes': output["labels"],#.to("cpu").numpy() + } + ############################################################################################################################ + img_ori = Image.open(image_path).convert('RGB') + w_ori, h_ori = img_ori.size # 获取原始图像的尺寸 + print(w_ori, h_ori, "orignianl vs 1000,1000") + # 计算缩放比例 + scale_x = 1000 / w_ori + scale_y = 1000 / h_ori + img_ori_1k = img_ori.resize((1000,1000)) + img = Image.open(image_path).convert('RGB') + img = img.resize((1000,1000)) + # atom_bondBox_check=True + + print(f"from oupt socore > {box_thresh},get box {len(output['bbox'])} after nms_per_class ") + # split into atom bond charge nms, then mergd , then box2 mol NOTE charege and bond confidence at least >10% + charge_mask = np.array([True if ins in charge_labels and output['scores'][i]>0.1 else False for i, ins in enumerate(output['pred_classes'])]) + charges_bbox=output['bbox'][charge_mask] + charges_centers= output['bbox_centers'][charge_mask] + charges_classes= output['pred_classes'][charge_mask] + charges_scores= output['scores'][charge_mask] + charges_bbox, charges_centers, charges_scores,charges_classes,figc =view_box_center2(charges_bbox, charges_centers, charges_scores,charges_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + #view_box_center2 help remove large box if boxscore small than 0.5 + # bonds_mask2 = np.array([True if ins in bond_labels else False for ins in output['pred_classes']]) + # bonds_mask= output['scores'][bonds_mask2]>=0.1# TODO fix me, as training bond box overlap with bondbox,aussme bond socre make sense + bonds_mask = np.array([True if ins in bond_labels and output['scores'][i]>0.2 else False for i, ins in enumerate(output['pred_classes'])]) + bond_bbox=output['bbox'][bonds_mask] + 
bond_centers= output['bbox_centers'][bonds_mask] + bond_classes= output['pred_classes'][bonds_mask] + bond_scores= output['scores'][bonds_mask] + # bond_bbox2, bond_centers2, bond_scores2,bond_classes2,fig=view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + bond_bbox, bond_centers, bond_scores,bond_classes,fig =view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=3) + bond_bbox, bond_classes, bond_scores = nms(bond_bbox, bond_scores,bond_classes, iou_threshold=0.5) + + heavy_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins != lab2idx['H'] else False for ins in output['pred_classes']]) + h_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins == lab2idx['H'] else False for ins in output['pred_classes']]) + + #TODO fix me if heavy or H all need this view_box_center2 filtering + heavy_bbox = output['bbox'][heavy_mask] + heavy_classes = output['pred_classes'][heavy_mask] + heavy_centers= output['bbox_centers'][heavy_mask] + heavy_scores= output['scores'][heavy_mask] + heavy_bbox, heavy_centers, heavy_scores,heavy_classes,fighv =view_box_center2(heavy_bbox, heavy_centers, heavy_scores,heavy_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + + #TODO del isolated C without bond box overlap + delt_hei=[] + for hei,hebox in enumerate(heavy_bbox): + he_class=idx_to_labels[heavy_classes[hei]] + b_nei=[] + if he_class in ['C']:#TODO add other cases + for bi,bb in enumerate(bond_bbox): + overlap_flag=boxes_overlap(bb, hebox)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol + if overlap_flag: + b_nei.append(bi) + if len(b_nei)==0: + delt_hei.append(hei) + n = len(heavy_scores) # 更新数量 + keep_boxes = np.ones(n, dtype=bool) + keep_boxes[delt_hei]=False + heavy_bbox, heavy_centers, heavy_scores,heavy_classes=heavy_bbox[keep_boxes], heavy_centers[keep_boxes], 
heavy_scores[keep_boxes],heavy_classes[keep_boxes] + + h_bbox = output['bbox'][h_mask] + h_centers= output['bbox_centers'][h_mask] + h_classes= output['pred_classes'][h_mask] + h_scores= output['scores'][h_mask] + h_bbox, h_centers, h_scores,h_classes,figh =view_box_center2(h_bbox, h_centers, h_scores,h_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + + #NOTE need keep the order heavy atom first then following with Hs + # atoms_mask = np.array([True if ins not in bond_labels and ins not in charge_labels else False for ins in output['pred_classes']]) + # atom_bbox=output['bbox'][atoms_mask] + # atom_classes=output['pred_classes'][atoms_mask] + # 合并 bbox,保持重原子在前,氢原子在后 + atom_bbox = np.concatenate([heavy_bbox, h_bbox], axis=0) + atom_classes = np.concatenate([heavy_classes, h_classes], axis=0) + # atom_centers = np.concatenate([heavy_centers, h_centers], axis=0) + atom_scores = np.concatenate([heavy_scores, h_scores], axis=0) + #TODO nms checking + # kept_bboxes, kept_classes, kept_scores=nms(atom_bbox, atom_scores, atom_classes, iou_threshold=0.5) + # # kept_bboxes, kept_classes, kept_scores=nms_atomBox(atom_bbox, atom_scores, atom_classes, iou_threshold=0.5) + # merged_bboxes, merged_classes, merged_scores = merge_low_iou_boxes(kept_bboxes, kept_classes, kept_scores, merge_threshold=0.5, score_threshold=0.7) + # print(f'ater nms kept_box {len(kept_bboxes)}, followd merge_low_iou_boxes kept_box:: {len(merged_bboxes)}') + # atom_bbox, atom_classes, atom_scores=merged_bboxes, merged_classes, merged_scores + atom_bbox, atom_scores, atom_classes = refine_boxes(atom_bbox, atom_scores, atom_classes, bond_bbox) + + + x_center = (atom_bbox[:, 0] + atom_bbox[:, 2]) / 2 + y_center = (atom_bbox[:, 1] + atom_bbox[:, 3]) / 2 + # center_coords = torch.stack((x_center, y_center), dim=1) + center_coords = np.stack((x_center, y_center), axis=1) + atom_centers=center_coords + + print(f"before NMS :: heavy box {len(heavy_bbox)} ---- H box {len(h_bbox)}---bond 
box{len(bond_bbox)}") + print(f"after NMS+view_box_center2 :: atom box {len(atom_bbox)} bond box {len(bond_bbox)} charge box {len(charges_bbox)} ") + # print(f"bond box with only single atom box overlap:: {single_odd_bi}") + print(f"atom box afte NMS and merge_low_iou_boxes") + print(f"get box {len(output['bbox'])} with NMS") + print(f"atom score >0.1 bond score >0.2, then folllowed with NMS") + print(f"bond_bbox nums::",bond_bbox.shape,len(bond_bbox)) + print(f" OCR will start involved ")# + #check if ODD single-bonds with only one atom exisits, try add the atoms box for this bond + new_atoms, single_odd_b2a= Newbox_(atom_bbox,bond_bbox, lab2idx ) + print(f"new_atoms number {len(new_atoms)}\n{new_atoms}") + if len(new_atoms)>0: + for boxout in new_atoms: + for k,arr in boxout.items(): + value_or_row=output[k] + if arr.ndim == 1: + output[k]=np.append(value_or_row, arr) + elif arr.ndim >= 2: + output[k] = np.concatenate([value_or_row, arr], axis=0) + else: + print('errprs, unkown conditions !!!@') + #NOTE try to use OCR to help postprocess box adding and del + # 加载图像 OCR + image = cv2.imread(image_path) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # 预处理图像突出下标 + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV) + # print(_, thresh) + kernel = np.ones((2, 2), np.uint8) + dilated = cv2.dilate(thresh, kernel, iterations=1) + # cv2.imwrite("preprocessed.jpg", dilated)#NOTE comment if need checking + # result = ocr.ocr("preprocessed.jpg", cls=True) + # ocr.ocr(image_npocr, cls=True, det=False) + result = ocr.ocr(dilated, cls=True) # 直接传递 NumPy 数组 + # 解析结果 + text_boxes = [] + text_contents = [] + confidences = [] + for line in result: + print(line) + if line: + for box_info in line: + box = box_info[0] + x_coords = [point[0] for point in box] + y_coords = [point[1] for point in box] + text_box = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)] + text = box_info[1][0] + 
text_boxes.append(text_box) + text_contents.append(text) + confidences.append(box_info[1][1]) + print("Detected text boxes:", text_boxes) + print("Detected text contents:", text_contents) + print("Confidences:", confidences) + #after whole img OCRed + # Initialize dictionaries and lists + ai2text = {} + ai2relplace = {} + ai2rdkitlab_unknown = {} + non_overlapping_texts = [] + # Build initial KDTree + tree = cKDTree(atom_centers) + # Collect indices to delete after the loop to keep tree valid during processing + indices_to_delete = set() + # Process each OCR text box + for ti, text_box in enumerate(text_boxes): + text_center = calculate_center(text_box) + ocr_text = text_contents[ti] + + # Normalize OCR text + if ocr_text in ['OH', 'HO']: + ocr_text = 'O' + elif ocr_text in ['SH', 'HS']: + ocr_text = 'S' + elif ocr_text in ['NH', 'HN']: + ocr_text = 'N' + elif ocr_text in ['CH', 'HC']: + ocr_text = 'C' + elif ocr_text == '0': + ocr_text = 'O' + elif ocr_text == 'L': + ocr_text = 'Li' + elif ocr_text[-1]=='-': + if ocr_text[:-1] in ABBREVIATIONS: + ocr_text=ocr_text[:-1] + + # Find all overlapping atom boxes + overlapping_indices = [] + for idx in range(len(atom_bbox)): + if idx not in indices_to_delete and boxes_overlap(atom_bbox[idx], text_box): + overlapping_indices.append(idx) + + if overlapping_indices: + # If there are overlapping atom boxes, merge them + if len(overlapping_indices) > 1: + # Get the smallest box encompassing all overlapping atom boxes + overlapping_boxes = [atom_bbox[idx] for idx in overlapping_indices] + merged_box = get_merged_box(overlapping_boxes) + overlapping_indices_atomboxclass=[idx_to_labels[atom_classes[i]] for i in overlapping_indices] + print(f"Merging {len(overlapping_indices)} atom boxes overlapping with OCR text: {ocr_text}") + print(f" {overlapping_indices} boxes type{overlapping_indices_atomboxclass} merged as OCR text: {ocr_text}") + merged_area = box_area(merged_box) + text_area = box_area(text_box) + final_box = merged_box 
if merged_area >= text_area else text_box + else: + # If only one overlap, use the text box directly + final_box = text_box + # Use the OCR text box as the merged box + primary_idx = overlapping_indices[0] + # atom_bbox[primary_idx] = text_box + + # Update the primary atom box + atom_bbox[primary_idx] = final_box + # Update class and dictionaries based on OCR text + if ocr_text in ABBREVIATIONS: + ai2relplace[primary_idx] = ocr_text + atom_classes[primary_idx] = 0 + if ocr_text in lab2idx: + atom_classes[primary_idx] = lab2idx[ocr_text] + elif ocr_text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']: + atom_classes[primary_idx] = lab2idx[ocr_text] + elif ocr_text in RGROUP_SYMBOLS or (ocr_text[0] == 'R' and ocr_text[1:].isdigit()): + atom_classes[primary_idx] = 0 + else: + ai2rdkitlab_unknown[primary_idx] = ocr_text + atom_classes[primary_idx] = 0 + + ai2text[primary_idx] = ocr_text + + # Mark redundant indices for deletion + indices_to_delete.update(overlapping_indices[1:]) + + else: + # No overlap: record the text box and nearest atom index + distance, nearest_idx = tree.query(text_center) + if nearest_idx not in indices_to_delete: # Only record if nearest_idx is still valid + print(f"No overlap for OCR text '{ocr_text}', nearest atom box index: {nearest_idx}") + non_overlapping_texts.append({ + 'text': ocr_text, + 'text_box': text_box, + 'nearest_atom_idx': nearest_idx, + 'distance': distance + }) + + #set up atom_ocr match atom_class + atom_ocr=[] + for i,ai in enumerate(atom_classes): + if i in ai2text: + atom_ocr.append(ai2text[i]) + # elif i in ai2rdkitlab_unknown: + # atom_ocr.append(ai2rdkitlab_unknown[i]) + else: + atom_ocr.append(idx_to_labels[ai]) + print(f"atom class + ocr presented as symbols::\n{atom_ocr}") + atom_ocr=np.array(atom_ocr) + # Perform deletions after the loop + if indices_to_delete: + indices_to_keep = np.setdiff1d(np.arange(len(atom_bbox)), list(indices_to_delete)) + atom_bbox = atom_bbox[indices_to_keep] + 
atom_classes = atom_classes[indices_to_keep] + atom_centers = atom_centers[indices_to_keep] + atom_scores = atom_scores[indices_to_keep] + atom_ocr= atom_ocr[indices_to_keep] + + # Adjust dictionary indices + for d in [ai2text, ai2relplace, ai2rdkitlab_unknown]: + d_new = {} + for old_idx, value in d.items(): + new_idx = np.where(indices_to_keep == old_idx)[0][0] if old_idx in indices_to_keep else None + if new_idx is not None: + d_new[new_idx] = value + d.clear() + d.update(d_new) + + # Adjust nearest_atom_idx in non_overlapping_texts + for entry in non_overlapping_texts: + old_idx = entry['nearest_atom_idx'] + if old_idx in indices_to_keep: + entry['nearest_atom_idx'] = np.where(indices_to_keep == old_idx)[0][0] + else: + entry['nearest_atom_idx'] = -1 # Mark as invalid if the nearest atom was deleted + + # Rebuild KDTree if needed for further use + tree = cKDTree(atom_centers) + + # Final output + print("Whole img with OCR :: ai2relplace, ai2rdkitlab_unknown:", [ai2relplace, ai2rdkitlab_unknown]) + print(f"Adjusted ai ocr_text: {ai2text}") + print(f"Atom box num: {len(atom_bbox)}:: {[idx_to_labels[i] for i in atom_classes]}") + print("Non-overlapping OCR text boxes:", non_overlapping_texts) + + #for all heavy atom labels, consider N3 pred as N, or other cases, I2M not good as paddle on ABC + atomcorp_img = Image.open(image_path).convert('RGB') + atomcorp_img1k=atomcorp_img.resize([1000,1000]) + text_contents_star=[] + text_confidences_star=[] + text_boxes_star=[] + boxid2del=dict() + ocr_discrepancies = {} # New dictionary to record OCR vs. AI mismatches + print(f"has atom_bbox number {len(atom_bbox)}") + for i,box in enumerate(atom_bbox):#split ocr image + # if i in ai2text: continue #may be need comment this, if splited OCR acc better!! 
+ abox =box* [scale_x, scale_y, scale_x, scale_y] + cropped_img=atomcorp_img1k.crop(abox)#if use the small ori image will not get infos + image_npocr = np.array(cropped_img) + result_ocr= ocr2.ocr(image_npocr, det=False)#,cls=True,use_debug=False, det=False)#det fale not box but get rcongized more + # result_ocr= ocr.ocr(image_npocr, cls=True, det=False)#,cls=True, det=False)#det fale not box but get rcongized more + if result_ocr: + for line in result_ocr: + # print(f"Atom box--- {i}, OCR result---: {line}") + if line: + box_flag=has_boxes(line) + for box_info in line: + # print(len(box_info)) + if not box_flag: + text=box_info[0] + #[^a-zA-Z0-9\*\-\+] 表示匹配除了字母、数字、*、- 和 + 之外的所有字符。 + text=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', text)#remove special chars + score_=box_info[1] + text_contents_star.append(text) + text_confidences_star.append(score_) + else:#when paddleOCRuse detection model get text box info + box = box_info[0] + x_coords = [point[0] for point in box] + y_coords = [point[1] for point in box] + text_box = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)] + text = box_info[1][0] + text=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', text)#remove special chars + text_boxes_star.append(text_box) + text_contents_star.append(text) + score_=box_info[1][1] + text_confidences_star.append(score_) + if i in ai2text:#ocr 全img vs split img + # print(f'from whole img ocr atom box {i}----from whole img::{ai2text[i]}') + if ai2text[i] != text: + text=ai2text[i] if len(ai2text[i])>=len(text) else text + print(f"Atom box {i}@@ OCR text: {text}, score: {score_}, AI class: {idx_to_labels[atom_classes[i]]}, AI score: {atom_scores[i]}") + # Normalize OCR text + if text in ['OH', 'HO']: + text = 'O' + elif text in ['SH', 'HS']: + text = 'S' + elif text in ['NH', 'HN']: + text = 'N' + elif text in ['CH', 'HC']: + text = 'C' + elif text == '0': + text = 'O' + elif text == 'L': + text = 'Li' + elif '-' in text: + if text[:-1] in ABBREVIATIONS: + text=text[:-1] + + # Check if OCR text 
is a single character and not a valid element + is_single_char = len(text) == 1 + ai_pred = idx_to_labels[atom_classes[i]] + #TOD add more simpfiled + if text=='0': + atom_classes[i]=lab2idx['O'] + elif text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']: + atom_classes[i]=lab2idx[text]#need update to keep H following Heavy + # elif # ocr recongnized on lable C as other things chars + elif is_single_char and text not in ELEMENTS and ai_pred == 'C': + # Do not replace AI prediction, just record discrepancy + ocr_discrepancies[i] = { + 'ocr_text': text, + 'ocr_score': score_, + 'ai_class': ai_pred, + 'ai_score': atom_scores[i] + } + else: + overlap, b_nei=AtomBox2bondBox(atom_bbox[i],bond_bbox) + if not overlap: + if text not in ELEMENTS and text not in ABBREVIATIONS: + # print(f"new cases::{text} for atombox {i} {atom_bbox[i]}check how to fix it !!!") + # print(f'OCR text:: {text} score ::{box_info}||atom clss::{idx_to_labels[atom_classes[i]]} {atom_scores[i]}') + if text != idx_to_labels[atom_classes[i]]: + boxid2del[i]= [text,idx_to_labels[atom_classes[i]]]#will delt this atom box infos + else: + if text != idx_to_labels[atom_classes[i]]: + if atom_scores[i]<=score_: + if text in RGROUP_SYMBOLS or text in ABBREVIATIONS: + ai2relplace[i]=text + atom_classes[i]=0 + if text in lab2idx and lab2idx[text] in list(range(23,29)):atom_classes[i]=lab2idx[text] + elif text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']: + atom_classes[i]=lab2idx[text] + else: + ai2relplace[i]=text + atom_classes[i]=0 + + # 按照 value 的第一个元素(假设是字符串)的长度进行排序,长度大的排前 + boxid2del = dict(sorted(boxid2del.items(), key=lambda item: item[0], reverse=True)) + print(f"considering del box",boxid2del) + print("after split img OCR:: ai2relplace,ai2rdkitlab_unknown",[ai2relplace,ai2rdkitlab_unknown]) + print(f"considering delet atomb box :{boxid2del}") + syms=[] + for i in range(len(atom_classes)): + if i in ai2relplace: syms.append(ai2relplace[i]) + elif i in 
ai2rdkitlab_unknown:syms.append(ai2rdkitlab_unknown[i]) + else: + syms.append(idx_to_labels[atom_classes[i]]) + print(f"atombox {atom_classes}:: number {len(atom_classes)}\n",[idx_to_labels[i] for i in atom_classes]) + print(f" {syms}") + #chedck isolated box, if need add bond box between the isolated box or not + isolated_ais = [] + # 第一步:构建 bond 到 atom 的映射,并计算 distance_threshold + bond_distances = [] + singleAtomBond=dict() + for bi, bb in enumerate(bond_bbox): + overlapped_atoms = [] + overlapped_abox = [] + for ai, aa in enumerate(atom_bbox): + overlap_flag = boxes_overlap(bb, aa) + if overlap_flag: + overlapped_atoms.append(ai) + overlapped_abox.append(aa) + # if bi not in b2a.keys(): + # b2a[bi] = [ai] + # else: + # b2a[bi].append(ai) + if len(overlapped_atoms) == 2: + center1 = calculate_center(atom_bbox[overlapped_atoms[0]]) + center2 = calculate_center(atom_bbox[overlapped_atoms[1]]) + distance = np.linalg.norm(center1 - center2) + bond_distances.append(distance) + # print(f"Bond {bi} connects atoms {overlapped_atoms}, distance: {distance:.2f}") + elif len(overlapped_atoms) == 1: + print(f"single bond - atom still exists for bond {bi}, need porcess this !!") + if bi not in singleAtomBond: + singleAtomBond[bi]=overlapped_atoms#considering use the add H box for solve TODO + + # 动态计算 distance_threshold + distance_threshold = max(bond_distances) if bond_distances else 100.0 # 默认值 10 如果无 bond + distance_threshold_min = min(bond_distances) if bond_distances else 100.0 # 默认值 10 如果无 bond + print(f"Calculated distance_threshold center based: {distance_threshold:.2f}") + + # 第二步:构建 atom 到 bond 的映射,并检测孤立原子 + a2b=dict() + for ai, aa in enumerate(atom_bbox): + b_nei = [] + for bi, bb in enumerate(bond_bbox): + overlap_flag = boxes_overlap(bb, aa) + if overlap_flag: + b_nei.append(bi) + a2b[ai] = b_nei + if a2b[ai] ==[]: + if ai not in isolated_ais: + isolated_ais.append(ai) + + isolated_ais=sorted(isolated_ais,reverse=True)#avoid delte atom with index errors + 
print(f"isolated_ais atom box {isolated_ais}\n ", [idx_to_labels[i] for i in atom_classes[isolated_ais]]) + + # 第三步:处理孤立原子,尝试合并或删除 + updated_atom_bbox = atom_bbox.copy() + updated_atom_classes = atom_classes.copy() + updated_atom_scores = atom_scores.copy() + print(f"atom bbox num {len(atom_bbox)}")#ttt + new_bond_bbox=[] + deleted_ais=[] + del4boxid2del=set() + for isolated_ai in isolated_ais: + isolated_box = atom_bbox[isolated_ai] + isolated_center = calculate_center(isolated_box) + nearest_distance = float('inf') + nearest_ai = -1 + # 找到最近的非孤立原子 + for ai, aa in enumerate(atom_bbox): + if ai not in isolated_ais and ai != isolated_ai: + center = calculate_center(aa) + distance = np.linalg.norm(isolated_center - center) + if distance < nearest_distance: + nearest_distance = distance + nearest_ai = ai + # 合并或删除逻辑 + if nearest_ai != -1: + if nearest_distance<=distance_threshold_min or (nearest_distance <=distance_threshold and nearest_distance>=distance_threshold_min):#this the centers dist not bond length + nearest_box = atom_bbox[nearest_ai] + nearest_class = atom_classes[nearest_ai] + nearest_center = calculate_center(nearest_box) + if isolated_ai in boxid2del: + textocr2del=boxid2del[isolated_ai][0] + else: + textocr2del=None + #NOTE based ont the class and ovelap bond box to adjust + overlap1,bondnei=AtomBox2bondBox(nearest_box,bond_bbox) + if len(bondnei)==1:#could be add two other bond, add bond box + # if textocr2del in [',', '+', '-'] or not any(c.isupper() for c in textocr2del): + if textocr2del is not None and not any(c.isupper() for c in textocr2del): + # del4boxid2del.add(isolated_ai) + deleted_ais.append(isolated_ai) + pass + else: + new_bc = (isolated_center + nearest_center)*0.5 + new_bondbox=np.array([new_bc[0] - nearest_distance*0.5, + new_bc[1] - nearest_distance*0.5, + new_bc[0] + nearest_distance*0.5, + new_bc[1] + nearest_distance*0.5] + ) + new_bond_bbox.append(new_bondbox.reshape(-1,4)) + print(f'add a new bond box new_bc for two atom boxes 
{isolated_ai} ---- {nearest_ai}::\n {idx_to_labels[atom_classes[isolated_ai]]} --- {idx_to_labels[atom_classes[nearest_ai]]}') + else:#TODO fix me when get the case with >=2 bonds need add bond also + try: + new_box = merge_boxes(isolated_box, nearest_box) + updated_atom_bbox[nearest_ai] = new_box + chosed_score_ = max(atom_scores[isolated_ai], atom_scores[nearest_ai]) + updated_atom_scores[nearest_ai] = chosed_score_ + except Exception as e: + print(f"file_name@: {image_path}\n SMILES in csv:\n{SMILESori}") + print(e) + print('nearest_ai ', nearest_ai) + check2=True + if check2: + padding=5 + # box_thresh=0.3 + atombox_img=draw_objs(copy.deepcopy(img), + atom_bbox* [scale_x, scale_y, scale_x, scale_y], + atom_classes, atom_scores , + category_index=idx_to_labels, + box_thresh=box_thresh, + line_thickness=3, + font='arial.ttf', + font_size=10) + bonbox_img=draw_objs(copy.deepcopy(img), + bond_bbox* [scale_x, scale_y, scale_x, scale_y], + bond_classes, bond_scores , + category_index=idx_to_labels, + box_thresh=0.01, + line_thickness=3, + font='arial.ttf', + font_size=10) + # Get sizes of the individual images + atom_width, atom_height = atombox_img.size + bon_width, bon_height = bonbox_img.size + combined_width = atom_width + bon_width + padding * 3 + combined_height = max(atom_height, bon_height) + padding * 2 + combined_img = Image.new('RGB', (combined_width, combined_height), color=(255, 255, 255)) # White background + # Paste the images onto the new canvas + combined_img.paste(atombox_img, (padding, padding)) # Top-left + combined_img.paste(bonbox_img, (atom_width + padding * 2, padding)) + print(f"atom box afte NMS and merge_low_iou_boxes") + combined_img.save(f"tttttttttttttttttttttttBoxed.png" + ) + raise Exception("@debug this!!\n") + + if chosed_score_>=0.5: + if chosed_score_==atom_scores[isolated_ai]: + updated_atom_classes[nearest_ai] = 0 # mrege replaced with * + # else: + # updated_atom_classes[nearest_ai] = atom_classes[nearest_ai] # 保留较高 score 的类别 + 
updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0)#after mreged need del it + # updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0) + updated_atom_classes = np.delete(updated_atom_classes, isolated_ai) + updated_atom_scores = np.delete(updated_atom_scores, isolated_ai) + print(f"Merged atom {isolated_ai} into {nearest_ai}, new box: {new_box}") + isolated_ais.remove(isolated_ai) + deleted_ais.append(isolated_ai) + # elif nearest_distance<=distance_threshold_min:#very close,mrege with nearest one + elif atom_scores[isolated_ai] < 0.5: + # 删除低分孤立原子 + updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0) + updated_atom_classes = np.delete(updated_atom_classes, isolated_ai) + updated_atom_scores = np.delete(updated_atom_scores, isolated_ai) + print(f"DELET isolated atom {isolated_ai} with score {atom_scores[isolated_ai]}") + deleted_ais.append(isolated_ai) + # 更新索引,因为数组维度变化 + isolated_ais = [i if i < isolated_ai else i - 1 for i in isolated_ais if i != isolated_ai] + else: + print(f"KEEP isolated atom {isolated_ai} with score {atom_scores[isolated_ai]} >= 0.5") + + + else: + if atom_scores[isolated_ai] < 0.5: + updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0) + updated_atom_classes = np.delete(updated_atom_classes, isolated_ai) + updated_atom_scores = np.delete(updated_atom_scores, isolated_ai) + print(f"DELET isolated atom {isolated_ai} with score {atom_scores[isolated_ai]}") + deleted_ais.append(isolated_ai) + isolated_ais = [i if i < isolated_ai else i - 1 for i in isolated_ais if i != isolated_ai] + else: + print(f"KEEP isolated atom {isolated_ai} with score {atom_scores[isolated_ai]} >= 0.5") + + if len(new_bond_bbox)>0: + for i,bond_box in enumerate(new_bond_bbox): + bond_bbox= np.concatenate([bond_bbox,bond_box],axis=0) + bond_scores= np.concatenate((bond_scores,np.array([0.9])),axis=0) + bond_classes= np.concatenate([bond_classes,np.array([13])],axis=0) + #reset bond center + x_center = 
(bond_bbox[:, 0] + bond_bbox[:, 2]) / 2 + y_center = (bond_bbox[:, 1] + bond_bbox[:, 3]) / 2 + # center_coords = torch.stack((x_center, y_center), dim=1) + center_coords = np.stack((x_center, y_center), axis=1) + bond_centers=center_coords + + #del the additional atom box that not connected by bond box also mismatch other rules + if len(deleted_ais) > 0: # 如果有需要删除的索引 + print(f"will delete atom box with idx :: {deleted_ais}") + # 使用 np.delete 一次性删除所有指定的行 + atom_classes = np.delete(atom_classes, deleted_ais, axis=0) + atom_scores = np.delete(atom_scores, deleted_ais, axis=0) + atom_bbox = np.delete(atom_bbox, deleted_ais, axis=0) + atom_ocr = np.delete(atom_ocr, deleted_ais, axis=0) + + # eles=[idx_to_labels[i] for i in atom_classes] + # print(eles,len(eles)) + cur_atomSymbols=[idx_to_labels[i] for i in atom_classes] + ocr_wholeImg=[] + for i in atom_classes: + if i in ai2relplace: + ocr_wholeImg.append(ai2relplace[i]) + elif i in ai2rdkitlab_unknown: + ocr_wholeImg.append(ai2rdkitlab_unknown[i]) + else: + ocr_wholeImg.append(idx_to_labels[i]) + print("ai2relplace,ai2rdkitlab_unknown",ai2relplace,ai2rdkitlab_unknown) + print("cur_atomSymbols:",cur_atomSymbols) + print(" atomSymbolsOCR:",ocr_wholeImg) + + # 找到 'H' 的索引, H after Heavy + h_indices = np.where(atom_classes == lab2idx['H'])[0] + non_h_indices = np.where(atom_classes != lab2idx['H'])[0] + # print(h_indices,non_h_indices) + # 重新排序 + new_order = np.concatenate((non_h_indices, h_indices)).astype(np.int64) + # newid2old_Hafter={ i:j for i,j in enumerate(new_order)} + # old2newid_Hafter={ j:i for i,j in enumerate(new_order)} + atom_classes = atom_classes[new_order] + atom_bbox = atom_bbox[new_order] + atom_scores = atom_scores[new_order] + x_center = (atom_bbox[:, 0] + atom_bbox[:, 2]) / 2 + y_center = (atom_bbox[:, 1] + atom_bbox[:, 3]) / 2 + # center_coords = torch.stack((x_center, y_center), dim=1) + center_coords = np.stack((x_center, y_center), axis=1) + atom_centers=center_coords#TODO 记得把 abbve idx label 
same reoder or mapping then bond + #bond box reoder like atom box, let the singleAtomBond later + bond_bbox = reorder_bond_bbox(bond_bbox, singleAtomBond) + bond_classes = reorder_bond_bbox(bond_classes, singleAtomBond) + bond_scores = reorder_bond_bbox(bond_scores, singleAtomBond) + bond_centers = reorder_bond_bbox(bond_centers, singleAtomBond) + + # 第二步:构建 atom 到 bond 的映射,并检测孤立原子 + a2b=dict() + for ai, aa in enumerate(atom_bbox): + b_nei = [] + for bi, bb in enumerate(bond_bbox): + overlap_flag = boxes_overlap(bb, aa) + if overlap_flag: + b_nei.append(bi) + a2b[ai] = b_nei + if a2b[ai] ==[]: + if ai not in isolated_ais: + isolated_ais.append(ai) + + b2a=dict() + for bi,bb in enumerate(bond_bbox): + overlapped_atoms = [] + overlapped_abox=[] + for ai,aa in enumerate(atom_bbox): + overlap_flag=boxes_overlap(bb, aa)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol + if overlap_flag: + # print(bb, aa,overlap_flag) + overlapped_atoms.append(ai) + overlapped_abox.append(aa) + if bi not in b2a.keys(): + b2a[bi]=[ai] + else: + # vais=b2a[bi] + b2a[bi].append(ai) + if len(overlapped_atoms) == 1: + print(f"single bond -atom still exists {overlapped_atoms}") + + #c2a a2c + #charge atom idx maping + if len(charges_classes) > 0: + # print(charges_bbox,charges_classes,len(charges_classes)) + kdt = cKDTree(atom_centers) + atid_list=list(range(len(atom_centers))) + used_charge_indices=set() + c2a=dict() + for i, (x,y) in enumerate(charges_centers): + overlapped_abox=[] + cc=charges_bbox[i] + for ai, aa in enumerate(atom_bbox): + overlap_flag=boxes_overlap(cc, aa) + ac_iou=calculate_iou(cc, aa) + charge_=charges_classes[i] + charge_score=charges_scores[i] + if overlap_flag: + if i in c2a: + c2a[i].append(ai) + else: + c2a[i]=[ai] + if ai not in atid_list: + print(f"Warning: ai {ai} is out of range for atom_list.") + continue # 跳过当前循环迭代 + # idx_to_labels[charges_classes[0]] + a2c=dict() + for ci,v in c2a.items(): + charge_=idx_to_labels[charges_classes[ci]] 
+ if len(v)==1: + a2c[v[0]]=ci + else: + for ai in v: + ats=idx_to_labels[atom_classes[ai]] + if ats=='other': + ats='*' + if ats in ['F','Cl','I','Br','O'] and int(charge_)<0: + a2c[ai]=ci + elif ats in ['N','H','P'] and int(charge_)>0: + a2c[ai]=ci + else: + print(f'unusuaal case charge {charge_} with atom {ats}!!') + + print(f"all a2b b2a a2c c2a done, start mol built") + #finsh the update of box back to the output for retraining used + output={ + 'bbox': np.concatenate([atom_bbox, bond_bbox,charges_bbox], axis=0), + 'bbox_centers': np.concatenate([atom_centers, bond_centers,charges_centers],axis=0), + 'scores': np.concatenate([atom_scores, bond_scores, charges_scores],axis=0), + 'pred_classes': np.concatenate([atom_classes, bond_classes, charges_classes],axis=0), + 'image_path': image_path + } + # boxinfo + boxinfor={ + 'bbox': output['bbox'], + 'scores': output['scores'],#TODO use same vocabl ? + 'pred_classes': output['pred_classes'],#[ lab2idx[x] for x in output['pred_classes']],#changet it back to character + 'image_path': image_path + } + #split agin for buit mol + charge_mask = np.array([True if ins in charge_labels else False for ins in output['pred_classes']]) + charges_bbox=output['bbox'][charge_mask] + charges_centers=bbox2center(charges_bbox) + # charges_centers= output['bbox_centers'][charge_mask] + charges_classes= output['pred_classes'][charge_mask] + charges_scores= output['scores'][charge_mask] + charges_bbox, charges_centers, charges_scores,charges_classes,figc =view_box_center2(charges_bbox, charges_centers, charges_scores,charges_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + #view_box_center2 help remove large box if boxscore small than 0.5 + # bonds_mask2 = np.array([True if ins in bond_labels else False for ins in output['pred_classes']]) + # bonds_mask= output['scores'][bonds_mask2]>=0.1# TODO fix me, as training bond box overlap with bondbox,aussme bond socre make sense + bonds_mask = np.array([True if ins in bond_labels and 
output['scores'][i]>0.2 else False for i, ins in enumerate(output['pred_classes'])]) + bond_bbox=output['bbox'][bonds_mask] + bond_centers=bbox2center(bond_bbox) + # bond_centers= output['bbox_centers'][bonds_mask] + bond_classes= output['pred_classes'][bonds_mask] + bond_scores= output['scores'][bonds_mask] + print(f"before view_box_center2 bond nums {len(bond_scores)}") + # bond_bbox2, bond_centers2, bond_scores2,bond_classes2,fig=view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + bond_bbox, bond_centers, bond_scores,bond_classes,fig =view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=3) + print(f"after view_box_center2 bond nums {len(bond_scores)}") + + heavy_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins != lab2idx['H'] else False for ins in output['pred_classes']]) + h_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins == lab2idx['H'] else False for ins in output['pred_classes']]) + + #TODO fix me if heavy or H all need this view_box_center2 filtering + heavy_bbox = output['bbox'][heavy_mask] + # heavy_classes = output['pred_classes'][heavy_mask] + heavy_centers=bbox2center(heavy_bbox) + # heavy_centers= output['bbox_centers'][heavy_mask] + heavy_scores= output['scores'][heavy_mask] + heavy_classes = output['pred_classes'][heavy_mask] + heavy_bbox, heavy_centers, heavy_scores,heavy_classes,fighv =view_box_center2(heavy_bbox, heavy_centers, heavy_scores,heavy_classes, overlap_dist_thresh=5.0, max_centers_per_box=5) + ###########################start build mol ########################## + rwmol_ = Chem.RWMol() + boxi2ai = {} # 预测索引 -> RDKit 索引 + placeholder_atoms=dict() + J=0 + for i, (bbox, a) in enumerate(zip(atom_bboxes, atom_classes)): + a2labl=False + a=replace_cg_notation(a) + # print(a,'atom box class label') + if a in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 
'F', 'B', 'I', 'P', 'Si']:# '*', I2M's defined atom types + # if a=='H':continue#skip H fristly,only with heavy atom then addH + ad = Chem.Atom(a)#TODO consider non chemical group and label for using + #TODO add pd rdkit known elemetns here + elif a in ELEMENTS: + ad = Chem.Atom(a) + elif a in ABBREVIATIONS : + ad = Chem.Atom("*") + placeholder_atoms[i] = a # 记录非标准原但有定义的官能团 类型及其位置, + a2labl=True + + else: + if N_C_H_expand(a): + ad = Chem.Atom("*") + placeholder_atoms[i] = a # 记录非标准原但有定义的官能团 类型及其位置, + a2labl=True + elif C_H_expand(a): + ad = Chem.Atom("*") + placeholder_atoms[i] = a # 记录非标准原但有定义的官能团 类型及其位置, + a2labl=True + elif C_H_expand2(a): + ad = Chem.Atom("*") + placeholder_atoms[i] = a # 记录非标准原但有定义的官能团 类型及其位置, + a2labl=True + elif formula_regex(a): + ad = Chem.Atom("*") + placeholder_atoms[i] = a # 记录非标准原但有定义的官能团 类型及其位置, + a2labl=True + else: + ad = Chem.Atom("*") + if a not in ['*',"other"]: + a2labl=True + # placeholder_atoms[idx] = a + # atom = Chem.Atom(symbol) + rwmol_.AddAtom(ad) + boxi2ai[J] = rwmol_.GetNumAtoms() - 1 + if a2labl: rwmol_.GetAtomWithIdx(J).SetProp("atomLabel", f"{a}")#mol set with label, mol_rebuild not + J+=1 + + # 使用 KDTree 构建重原子间的键(如果提供了 bond_bbox) + if len(charges_classes) > 0: + for k,v in a2c.items(): + fc=int(idx_to_labels[charges_classes[v]]) + rwmol_.GetAtomWithIdx(k).SetFormalCharge(fc) + # print(f"mol with heavy atoms number {i+1}, max heavy atom id {i}") + print(f"mol with atoms number {i+1}, max atom id {i}") + print(f"mol with bond box number {len(bond_classes)}") + print(f"placeholder_atoms@@ {placeholder_atoms}") + + #重原子 skeleton mol + bonds=dict() + existing_bonds = set() + b2aa=dict() + singleAtomBond=[] + bondWithdirct=[] + + # tree_heavy = KDTree(heavy_centers)#TODO before add bond consdiering reodering bond ?? 
+ tree_atom = KDTree(atom_centers)#TODO as atom bond are all reodered to kee H last + if len(idx_to_labels)==30: + _margin=0#ad this version bond dynamicaly changed + for bi, (bbox, idx_) in enumerate(zip(bond_bbox, bond_classes)):#not work for cross-bond, longer bond, as the center of bond may be close to as atoms not it two atoms + bond_type = idx_to_labels[idx_] + if len(idx_to_labels)==23: + if idx_to_labels[bond_type] in ['-','SINGLE', 'NONE', 'ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']: + _margin = 5 + else: + _margin = 8 + anchor_positions = (bbox + [_margin, _margin, -_margin, -_margin]).reshape([2, -1]) + oposite_anchor_positions = anchor_positions.copy() + oposite_anchor_positions[:, 1] = oposite_anchor_positions[:, 1][::-1] + # Upper left, lower right, lower left, upper right + # x1y1, x2y2, x1y2, x2y1 : dinuogl lines + anchor_positions = np.concatenate([anchor_positions, oposite_anchor_positions]) + # print(f"anchor_positions {anchor_positions.shape}\n{anchor_positions}") + dists, neighbours = tree_atom.query(anchor_positions, k=1) + if np.argmin((dists[0] + dists[1], dists[2] + dists[3])) == 0: + # visualize setup + begin_idx, end_idx = neighbours[:2] + else: + # visualize setup + begin_idx, end_idx = neighbours[2:] + atom1_idx = boxi2ai[begin_idx] + atom2_idx = boxi2ai[end_idx] + if atom1_idx == atom2_idx:#NOTE when bond with only one terminal atom, other side H not used + print(f"attempt to add self-bond:{bi} atomIdx1 == atomIdx2 ::{[atom1_idx, atom2_idx]}") + print(f"for bond bi {bi} H atom may involbed dists:",dists) + print(neighbours) + print("anchor_positions",anchor_positions) + else: + if bond_type in ['-', 'NONE', 'ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']: + if bond_type in BONDDIRECT: + bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', bond_type) + bondWithdirct.append(bi) + else: + bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', None) + bond_type=BONDTYPE['SINGLE'] + elif bond_type == '=': + bonds[bi] = (atom1_idx, 
atom2_idx, 'DOUBLE', None) + bond_type=BONDTYPE['DOUBLE'] + elif bond_type == '#': + bonds[bi] = (atom1_idx, atom2_idx, 'TRIPLE', None) + bond_type=BONDTYPE['TRIPLE'] + else: + print(f'unkown bond type relaced with single@@ {bond_type}') + bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', None) + bond_type=BONDTYPE['SINGLE'] + # 检查价态 + atom1 = rwmol_.GetAtomWithIdx(atom1_idx) + atom2 = rwmol_.GetAtomWithIdx(atom2_idx) + val1 = sum(b.GetBondTypeAsDouble() for b in atom1.GetBonds()) + val2 = sum(b.GetBondTypeAsDouble() for b in atom2.GetBonds()) + max_val1 = max(VALENCES[atom1.GetSymbol()]) + max_val2 = max(VALENCES[atom2.GetSymbol()]) + # bond_order = bond_type.AsDouble() + bond_order=BONDTYPE2ORD[bond_type] + if val1 + bond_order <= max_val1 and val2 + bond_order <= max_val2: + bond1 = rwmol_.GetBondBetweenAtoms(atom1_idx, atom2_idx) + bond2 = rwmol_.GetBondBetweenAtoms(atom2_idx, atom1_idx) + if bond1 or bond2: + # print(f'bond exists for {[atom1_idx, atom2_idx]}') + pass + # if (atom1_idx, atom2_idx) not in existing_bonds and (atom2_idx, atom1_idx) not in existing_bonds: + else: + # print(atom1_idx, atom2_idx, bond_type,[ bi, idx_to_labels[idx_] ]) + rwmol_.AddBond(atom1_idx, atom2_idx, bond_type) + else: + print(f"Skipping bond {bi}: Exceeds valence.") + existing_bonds.add((atom1_idx, atom2_idx)) + b2aa[bi]=sorted([atom1_idx, atom2_idx]) + + if len(bond_bbox)==1 and len(atom_bbox)==2: + ca1='[*:0][C:2]#[C:3][*:1]'#acs phC#CpH + rwmol_ = Chem.RWMol() + ats= ['*','*','C','C'] + for ia in ats: + a=Chem.Atom(ia) + id_=rwmol_.AddAtom(a) + # print(ia,id_) + rwmol_.AddBond(2, 3, Chem.BondType.TRIPLE) + rwmol_.AddBond(0, 2, Chem.BondType.SINGLE) + rwmol_.AddBond(1, 3, Chem.BondType.SINGLE) + + # Chem.MolFromSmiles(ca1) + for i in range(len(atom_classes)): + atom_classes[i]=lab2idx['*'] + AllChem.Compute2DCoords(rwmol_) + else: + rwmol_=copy.deepcopy(rwmol_) + print(f"placeholder_atoms {placeholder_atoms}") + + #assign 2D coords + mol = rwmol_.GetMol() + 
mol.RemoveAllConformers() + conf = Chem.Conformer(mol.GetNumAtoms()) + # conf.Set3D(True) + # for i, (x, y) in enumerate(heavy_centers): + for i, (x, y) in enumerate(atom_centers): + x, y=float(x),float(y) + conf.SetAtomPosition(i, (x, y, 0))#TODO why some time need -y, just display same as ori? + mol.AddConformer(conf) + # Chem.SanitizeMol(mol) + Chem.AssignStereochemistryFrom3D(mol) + rwmol_=Chem.RWMol(mol) + #as afte H a\lso didthis + skeleton_mol=copy.deepcopy(rwmol_) + print(skeleton_mol.GetNumBonds()) + chiral_centers_aids = Chem.FindMolChiralCenters(mol, includeUnassigned=True) + + # H realted post-process + heavyNumber=len(heavy_centers) + print(f'mol with heavy number atoms {heavyNumber}, max id {heavyNumber-1}') + onlyHeayMol=copy.deepcopy(rwmol_) + chiral_centers = Chem.FindMolChiralCenters( + rwmol_, includeUnassigned=True, includeCIP=False, useLegacyImplementation=False) + chiral_center_ids = [idx for idx, _ in chiral_centers] + Hais=[] + Hais_bt=[] + Hbd=[] + # H_existing_bonds = set() + for bi, ais in b2a.items():#from box overlap + bt=bond_classes[bi]# in [14,15]#directon bond + for ai in ais: + if ai>heavyNumber-1: + if bt in [14,15]:#directon bond + Hais.append(ais)#NOTE ais ai increasing order as two for loop increasing + print(f"within H bond box id {bi} bond direction {idx_to_labels[bt]} atoms box id {ais} ") + Hais_bt.append(idx_to_labels[bt]) + Hbd.append(bi) + # print(bonds[bi] ) + # add Hbonds with direction + H_existing_bonds = set() + ha2boxa=dict() + for ais, bt in zip(Hais,Hais_bt): + idx_2=ais[-1] + idx_1=ais[0] + hbond=rwmol_.GetBondBetweenAtoms(idx_1,idx_2) + if hbond is not None: + if idx_1 in chiral_center_ids:#if not in the chiral atom, will not set bond directions + hbond.SetBondDir(BOND_DIRS[bt]) + else: + had = Chem.Atom("H") + addHatom_idx = rwmol_.AddAtom(had) + ha2boxa[addHatom_idx]=idx_2 + # print(idx_2,addHatom_idx)#Note if detected H box will lead idx_2 != addHatom_idx + atom= rwmol_.GetAtomWithIdx(idx_1) + 
max_val=max(VALENCES[atom.GetSymbol()]) + val = sum(b.GetBondTypeAsDouble() for b in atom.GetBonds()) + if (idx_1, addHatom_idx) not in H_existing_bonds and (addHatom_idx, idx_1) not in H_existing_bonds: + if val<=max_val-1: + # print(f"atom id {idx_1} val {val} max_val {max_val}") + print(idx_1, addHatom_idx)#let check bond exist or not!! + rwmol_.AddBond(idx_1,addHatom_idx, Chem.BondType.SINGLE)#BOND_DIRS[bt] + b=rwmol_.GetBondBetweenAtoms(idx_1,addHatom_idx) + if idx_1 in chiral_center_ids:#if not in the chiral atom, will not set bond directions + b.SetBondDir(BOND_DIRS[bt])#############Note can be done in the following tree + H_existing_bonds.add((idx_1,addHatom_idx)) + i + if len(ha2boxa)>0:#consider Hnow + #use box coords assign 2D, remove extra Hs also update box + rwmol_.RemoveAllConformers()# + conf = Chem.Conformer(rwmol_.GetNumAtoms()) + conf.Set3D(True) + coords2d=[] + for i, (x, y) in enumerate(heavy_centers): + position = Point3D(float(x), float(y), 0.) # Create a Point3D object with x, y, and z=0 + conf.SetAtomPosition(i, position) + coords2d.append([x,y]) + for k,v in ha2boxa.items(): + x,y=atom_centers[v] + position = Point3D(float(x), float(y), 0.) 
# Create a Point3D object with x, y, and z=0 + conf.SetAtomPosition(k, position) + coords2d.append([x,y]) + rwmol_.AddConformer(conf) + + additonalH=detect_unconnected_hydrogens(rwmol_) + if len(additonalH)>0: + rwmol_,rmovedAtomcoords=remove_unconnected_hydrogens2(rwmol_) #NOTE 留给将来WEB开发用will dercease h atom,but the box have not updated TODO fix me this in feature activate learning + #update atom box infors + if len(rmovedAtomcoords)>0:#update box infors + delbb=[] + kdt = cKDTree(atom_centers) + for i, (x,y,z) in enumerate(rmovedAtomcoords):#z=0 + dist, idx_=kdt.query([x,y], k=1) + delbb.append(idx_) + mask = np.ones(len(atom_classes), dtype=bool) # 初始化为 True + mask[delbb] = False + atom_bbox = atom_bbox[mask] + atom_classes = atom_classes[mask] + atom_centers = atom_centers[mask] + # mol# mol_rebuit=copy.deepcopy(mol) + + mol=copy.deepcopy(rwmol_) + conf=mol.GetConformers()[0] + mola2xy=dict() + mola2d=[] + for i,a in enumerate(mol.GetAtoms()): + x,y,z=conf.GetAtomPosition(i) + mola2xy[i]=[x,y] + mola2d.append([x,y]) + # print( x,y,z) + kdt = cKDTree(mola2d) + chiral_centers = Chem.FindMolChiralCenters( + mol, includeUnassigned=True, includeCIP=False, useLegacyImplementation=False) + chiral_center_ids = [idx for idx, _ in chiral_centers] + + for bi,bcent in enumerate(bond_centers): + if bi in bondWithdirct :#and bi not in Hbd:#Note as set Hbd previously + dists, a1a2 = kdt.query(bcent, k=2) + a1,a2=sorted(a1a2) + a1,a2=int(a1),int(a2) + bt= mol.GetBondBetweenAtoms(a1, a2)#RDKit 的键是无向的,返回的是同一个 Bond 对象 + if bt: + # 获取键的当前起点和终点 + current_begin = bt.GetBeginAtomIdx() + current_end = bt.GetEndAtomIdx() + bond_dir=bond_dirs[idx_to_labels[bond_classes[bi]]] + if bond_dir == rdchem.BondDir.BEGINWEDGE: + reverse_dir = rdchem.BondDir.BEGINDASH + elif bond_dir == rdchem.BondDir.BEGINDASH: + reverse_dir = rdchem.BondDir.BEGINWEDGE + # else: + # reverse_dir= rdchem.BondDir.BEGINWEDGE + if a1 in chiral_center_ids: + if current_begin == a1: + bt.SetBondDir(bond_dir) + 
print(f'a1 dir') + else: + # 如果手性原子是终点,反转方向(例如用相反的楔形键) + bt.SetBondDir(reverse_dir) + print(f'a1 reverse_dir') + # print(f'set bond direction a1a2 {[bi, a1,a2]}') + # bt.SetBondDir(bond_dirs[idx_to_labels[bond_classes[bi]]]) + elif a2 in chiral_center_ids: + if current_begin == a2: + bt.SetBondDir(bond_dir) + print(f'a2 dir {bond_dir} {reverse_dir}') + else: + # 如果手性原子是终点,反转方向(例如用相反的楔形键),but not work, just remove and add + mol.RemoveBond(current_begin, current_end) + mol.AddBond(current_end, current_begin, bt.GetBondType()) + bond = mol.GetBondBetweenAtoms(current_end, current_begin) + bond.SetBondDir(bond_dir) + print(f'a2 reverse_dir {bond_dir} {reverse_dir}') + # bt= mol.GetBondBetweenAtoms(a2, a1) + # print(f'set bond direction a2a1 {[bi, a2,a1]}') + # bt.SetBondDir(bond_dirs[idx_to_labels[bond_classes[bi]]]) + else: + print('bond stro not with chiral atom???, will ignore this stero bond infors') + print(f"{[bi, bond_dir, current_begin,current_end]}") + # beginatom=mol.GetAtomWithIdx(current_begin) + # Endatom=mol.GetAtomWithIdx(current_end) + # beginatom_neis=len(beginatom.GetBonds()) + # Endatom_neis=len(Endatom.GetBonds()) + try: + mol_rebuit=mol.GetMol() + conf = mol_rebuit.GetConformer() + Chem.WedgeMolBonds(mol_rebuit,conf)# + Chem.DetectBondStereochemistry(mol_rebuit) + Chem.AssignChiralTypesFromBondDirs(mol_rebuit) + Chem.AssignStereochemistry(mol_rebuit) + # + smiH=Chem.MolToSmiles(mol_rebuit) + print(F"smiH\n",smiH) + # canon_smilesH = Chem.CanonSmiles(smiH) + # print(F"canon_smilesH\n",canon_smilesH) + # rdkit_coni_smiH=Chem.MolToSmiles(Chem.MolFromSmiles(smiH)) + # print(f"Chem.MolToSmiles(Chem.MolFromSmiles(smiH))\n {rdkit_coni_smiH}") + # + mol = rdkit.Chem.RWMol(mol_rebuit) + other2ppsocr=True + if other2ppsocr: + print() + need_cut=[] + ppstr=[] + ppstr_score=[] + crops=[] + index_token=dict() + expan=0#NOTE this control how much the part of bond in crop_Img + for i_,(heav_c,heav_box) in enumerate(zip(atom_classes,atom_bbox)): + if 
lab2idx['*']==heav_c or lab2idx['other']==heav_c or lab2idx['Cl']==heav_c: + need_cut.append(i_) + a=heav_box+np.array([-expan,-expan,expan,expan]) + # print(heav_box.shape,a.shape) + box=a * [scale_x, scale_y, scale_x, scale_y]#TODO need the fix as w h may not equal!! + # print(a,box,[scale_x, scale_y, scale_x, scale_y]) + cropped_img = img_ori_1k.crop(box) + crops.append(cropped_img) + image_npocr = np.array(cropped_img) + result_ocr= ocr2.ocr(image_npocr, det=False) + s_, score_ =result_ocr[0][0] + s_previos=atom_ocr[i_] + if s_previos != "other" : + s_=s_previos if len(s_previos)>=len(s_) else s_ + print(f'ocr::idx:{i_}',s_, score_ ) + if score_<=0.1:# process cropped_img and try again + # print(s_, "xxx",score_) + s_='*' + if s_=='+' or s_=='-': + s_="*" + if len(s_)>1: + s_=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', s_)#remove special chars + if re.match(r'^\d+$', s_): + s_=f'{s_}*'#number+ * + # print(f'why only numbers ? {s_}') + if s_=='L':s_='Li' + elif s_=='0':s_='O' + elif s_ in ['N,+ CI','N,+ Cl' ,'N,+Cl','N,+CI','N+CI']:s_='N2+Cl-' + elif s_ in ['NO,','O,N' ]:s_='NO2' + + + match = re.match(r'^(\d+)?(.*)', s_) + # print(s_,'xxxx') + if match: + numeric_part, remaining_part = match.groups() + fc_=mol.GetAtomWithIdx(i_).GetFormalCharge() + if remaining_part in ELEMENTS: + new_atom = Chem.Atom(remaining_part) + mol.ReplaceAtom(i_, new_atom) + print(i_, remaining_part,"@@@") + elif remaining_part in ABBREVIATIONS:# can be expanded with placeholder_atoms + placeholder_atoms[i_]=s_# such 2Na will be get for rdkit + elif remaining_part=='OH': + new_atom = Chem.Atom("O") + mol.ReplaceAtom(i_, new_atom) + elif remaining_part=='SH': + new_atom = Chem.Atom("S") + mol.ReplaceAtom(i_, new_atom) + elif remaining_part=='NH': + new_atom = Chem.Atom("N") + mol.ReplaceAtom(i_, new_atom) + mol.GetAtomWithIdx(i_).SetFormalCharge(fc_) + index_token[i_]=f'{s_}:{i_}' + print(f"idx:{i_}, atm: <{idx_to_labels[heav_c]}> --- [{s_}:{i_}] with score:{score_} ||previousOCR:: 
{atom_ocr[i_]}") + if s_ in ELEMENTS : + new_atom = Chem.Atom(s_) + mol.ReplaceAtom(i_, new_atom) + mol.GetAtomWithIdx(i_).SetProp("atomLabel", f"{s_}")#mol set with label, mol_rebuit not + ppstr.append(s_) + ppstr_score.append(score_) + if s_ in ABBREVIATIONS.keys(): + placeholder_atoms[i_]=s_ + # + bond_dirs_rev={v:k for k,v in bond_dirs.items()} + wdbs=[] + for b in mol.GetBonds(): + bd=b.GetBondDir() + bt=b.GetBondType() + # print(bd) + if bd ==bond_dirs['BEGINDASH'] or bd==bond_dirs['BEGINWEDGE']: + a1,a2=b.GetBeginAtomIdx(), b.GetEndAtomIdx() + wdbs.append([a1,a2,bt,bond_dirs_rev[bd]]) + + #expand mol if exists + # if len(placeholder_atoms)>0:### + cm=copy.deepcopy(mol) + # print(placeholder_atoms) + expand_mol, expand_smiles= expandABB(cm,ABBREVIATIONS, placeholder_atoms) + SMILESpre=expand_smiles + rdm=copy.deepcopy(expand_mol) + target_mol, ref_mol=rdm, cm + AllChem.Compute2DCoords(target_mol) + pair=[target_mol, ref_mol] + mcs=rdFMCS.FindMCS([target_mol, ref_mol], # larger,small order + # atomCompare=rdFMCS.AtomCompare.CompareAny, + bondCompare=rdFMCS.BondCompare.CompareAny, + ringCompare=rdFMCS.RingCompare.IgnoreRingFusion, + matchChiralTag=False, + ) + mcs_mol = Chem.MolFromSmarts(mcs.smartsString) + AllChem.Compute2DCoords(mcs_mol) + + matches0 = pair[0].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False) + matches1 = pair[1].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False) + if len(matches0) != len(matches1): + matches0=list(matches0) + matches1=list(matches1) + # print( "noted: matcher not equal !!") + if len(matches0)>len(matches1): + for i in range(0,len(matches0)): + if i < len(matches1): + pass + else: + ii=i % len(matches1) + matches1.append(matches1[ii]) + else: + for i in range(0,len(matches1)): + if i < len(matches0): + pass + else: + ii=i % len(matches0) + matches0.append(matches0[ii]) + assert len(matches0) == len(matches1), 
"matcher not equal break!!" + atommaping_pairs=[list(zip(matches0[i],matches1[i])) for i in range(0,len(matches0))] + atomMap=atommaping_pairs[0] + rmsd2=rdkit.Chem.rdMolAlign.AlignMol(prbMol=target_mol, refMol=ref_mol, atomMap=atomMap,maxIters=2000000) + print(f"rmsd {rmsd2}") + #ocr_mol + ocr_mol = copy.deepcopy(target_mol) + AllChem.Compute2DCoords(ocr_mol) + ocr_smi = Chem.MolToSmiles(ocr_mol) + molexp=ocr_mol + expandStero_smi, success= rdkit_canonicalize_smiles(ocr_smi) + # expandStero_smi = Chem.CanonSmiles(ocr_smi)#, useChiral=(not ignore_chiral)) + + # TODO #[3H] 2H prpared box for training are too smalled, need adjust + if visual_check: + boxed_img = draw_objs(img, + atom_bbox, + atom_classes, + atom_scores, + category_index=idx_to_labels, + box_thresh=0.5, + line_thickness=3, + font='arial.ttf', + font_size=10) + opts = Draw.MolDrawOptions() + opts.addAtomIndices = False + opts.addStereoAnnotation = False + img_ori = Image.open(image_path).convert('RGB') + img_ori_1k = img_ori.resize((1000,1000)) + if other2ppsocr: + img_rebuit = Draw.MolToImage(ocr_mol, options=opts,size=(1000, 1000)) + else: + img_rebuit = Draw.MolToImage(ocr_mol, options=opts,size=(1000, 1000)) + combined_img = Image.new('RGB', (img_ori_1k.width + boxed_img.width + img_rebuit.width, img_ori_1k.height)) + combined_img.paste(img_ori_1k, (0, 0)) + combined_img.paste(boxed_img, (img_ori_1k.width, 0)) + combined_img.paste(img_rebuit, (img_ori_1k.width + boxed_img.width, 0)) + imprefix=os.path.basename(image_path).split('.')[0] + combined_img.save(f"{ima_checkdir}/{imprefix}Boxed.png") + + new_row = {'file_name':image_path, "SMILESori":SMILESori, + 'SMILESpre':SMILESpre, + 'SMILESexp':expandStero_smi, + } + smiles_data = smiles_data._append(new_row, ignore_index=True) + + #accu similarity calculation + if getacc: + sameWithOutStero=comparing_smiles(new_row,SMILESpre)#try to ingnore cis chiral, as 2d coords including all the infos + 
sameWithOutStero_exp=comparing_smiles(new_row,expandStero_smi)#this ignore chairity and *number be * NOTE + + if (type(SMILESori)!=type('a')) or (type(SMILESpre)!=type('a')): + if sameWithOutStero or sameWithOutStero_exp: + mysum += 1 + else: + print(f"smiles problems\n{SMILESori}\n{SMILESpre}\n{image_path}") + failed.append([SMILESori,SMILESpre,image_path]) + mydiff.append([SMILESori,SMILESpre,image_path]) + continue + mol1 = Chem.MolFromSmiles(SMILESori)#TODO considering smiles with rdkit not recongized in real data + if mol1 is None: + rd_smi_ori, success1_=rdkit_canonicalize_smiles(SMILESori) + mol1=Chem.MolFromSmiles(rd_smi_ori) + if (mol_rebuit is None) or (mol1 is None): + if sameWithOutStero or sameWithOutStero_exp: + mysum += 1 + else: + print(f'get rdkit mol None\n{SMILESori}\n{SMILESpre}\n{image_path}') + failed.append([SMILESori,SMILESpre,image_path]) + mydiff.append([SMILESori,SMILESpre,image_path]) + continue + if mol1: + rdk_smi1=Chem.MolToSmiles(mol1) + else: + rdk_smi1=SMILESori + if mol_rebuit: + rdk_smi2=Chem.MolToSmiles(mol_rebuit) + else: + rdk_smi2='' + # if rdk_smi1==rdk_smi2 or rdk_smi1==expandStero_smi or sameWithOutStero:#also considering the abbre in Ori + if rdk_smi1==rdk_smi2 or rdk_smi1==expandStero_smi: + mysum += 1 + else: + if sameWithOutStero or sameWithOutStero_exp: + mysum += 1 + else: + mydiff.append([SMILESori,SMILESpre,image_path]) + if visual_check: + combined_img.save(f"{ima_checkdir}/{imprefix}Boxed_diff{len(mydiff)}.png") + try: + morganfps1 = AllChem.GetMorganFingerprint(mol1, 3,useChirality=True) + morganfps2 = AllChem.GetMorganFingerprint(mol_rebuit, 3,useChirality=True) + morgan_tani = DataStructs.DiceSimilarity(morganfps1, morganfps2) + fp1 = Chem.RDKFingerprint(mol1) + fp2 = Chem.RDKFingerprint(mol_rebuit) + tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2) + if expandStero_smi!= '': + fp3 = Chem.RDKFingerprint(molexp) + morganfps3 = AllChem.GetMorganFingerprint(molexp, 3,useChirality=True) + morgan_tani3 = 
class RTDETRPostProcessor(nn.Module):
    """Post-processor for RT-DETR detection outputs.

    Converts raw model outputs (``pred_logits`` + ``pred_boxes`` in normalized
    cxcywh format) into per-image detection dicts with keys ``labels``,
    ``boxes`` (xyxy, scaled to the original image size) and ``scores``.

    Parameters
    ----------
    classes_dict : dict[int, str] | None
        Mapping from category id to label name. Defaults to the chemistry
        label map below (atoms, bond types, formal charges).
    use_focal_loss : bool
        If True, score with per-class sigmoid + flat top-k over all
        (query, class) pairs; otherwise softmax over classes with the last
        (background) class dropped.
    num_top_queries : int
        Number of detections kept per image.
    remap_mscoco_category : bool
        If True, remap contiguous training labels back to the category ids
        of ``classes_dict``.
    """

    __share__ = ['num_classes', 'use_focal_loss', 'num_top_queries', 'remap_mscoco_category']

    def __init__(self, classes_dict=None, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False) -> None:
        super().__init__()
        self.use_focal_loss = use_focal_loss
        if classes_dict is None:
            # Default label map. NOTE: 'wdge' (sic) is kept verbatim —
            # downstream code may match this exact string.
            classes_dict = {0: 'other', 1: 'C', 2: 'O', 3: 'N', 4: 'Cl', 5: 'Br', 6: 'S', 7: 'F', 8: 'B',
                            9: 'I', 10: 'P', 11: 'H', 12: 'Si',
                            # bond
                            13: 'single', 14: 'wdge', 15: 'dash',
                            16: '=', 17: '#', 18: ':',  # aromatic
                            # charge
                            19: '-4', 20: '-2',
                            21: '-1',  # -
                            22: '+1',  # +
                            23: '+2',
                            }
        num_classes = len(classes_dict)
        self.num_top_queries = num_top_queries
        self.num_classes = num_classes
        self.remap_mscoco_category = remap_mscoco_category
        self.deploy_mode = False

        # Bidirectional mapping between contiguous training labels (0..N-1)
        # and the (possibly non-contiguous) category ids in classes_dict.
        mscoco_category2label = {k: i for i, k in enumerate(classes_dict.keys())}
        mscoco_label2category = {v: k for k, v in mscoco_category2label.items()}
        self.mscoco_label2category = mscoco_label2category

    def extra_repr(self) -> str:
        return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'

    def forward(self, outputs, orig_target_sizes):
        """Return a list (one entry per image) of dicts with keys
        ``labels``, ``boxes`` (xyxy, absolute pixels) and ``scores``.

        ``orig_target_sizes`` is expected as (batch, 2) — presumably
        (width, height) per image, matching the cxcywh scaling below.
        """
        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']

        # Normalized cxcywh -> absolute xyxy in original-image coordinates.
        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
        bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

        if self.use_focal_loss:
            # Per-class sigmoid scores; flat top-k over (query, class) pairs,
            # then recover the class and query indices from the flat index.
            scores = torch.sigmoid(logits)
            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
            labels = index % self.num_classes
            index = index // self.num_classes
            boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))
        else:
            # FIX: softmax must run over the class dimension. The previous
            # `F.softmax(logits)` fell back to the deprecated implicit dim,
            # which for a 3-D tensor is NOT the last axis.
            scores = F.softmax(logits, dim=-1)[:, :, :-1]  # drop background class
            scores, labels = scores.max(dim=-1)
            boxes = bbox_pred
            if scores.shape[1] > self.num_top_queries:
                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
                labels = torch.gather(labels, dim=1, index=index)
                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))

        # TODO for onnx export
        if self.deploy_mode:
            return labels, boxes, scores

        if self.remap_mscoco_category:
            # Map contiguous labels back to original category ids.
            labels = torch.tensor([self.mscoco_label2category[int(x.item())] for x in labels.flatten()])\
                .to(boxes.device).reshape(labels.shape)

        results = []
        for lab, box, sco in zip(labels, boxes, scores):
            result = dict(labels=lab, boxes=box, scores=sco)
            results.append(result)

        return results

    def deploy(self, ):
        """Switch to deploy (export) mode: eval + raw-tuple forward output."""
        self.eval()
        self.deploy_mode = True
        return self

    @property
    def iou_types(self, ):
        return ('bbox', )