ApexOracle / compare_source_vs_hf.py
Kiria-Nozan's picture
solve same embedding bug
80ad4cd
import torch
from transformers import AutoTokenizer
import sys
import os
from hydra import compose, initialize_config_dir
from pathlib import Path
import numpy as np
# Make modules in the current working directory importable: DLM_emb_model is
# looked up relative to wherever the script is launched from.
sys.path.append(os.getcwd())
try:
    from DLM_emb_model import MolEmbDLM
except ImportError:
    print("Could not import MolEmbDLM. Make sure you are running from ApexOracle directory.")
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the `site` module for interactive use and is missing under `python -S`.
    sys.exit(1)
def load_source_model():
    """Build the reference MolEmbDLM from the raw checkpoint.

    The Hydra config named ``config`` is composed from the ``configs``
    directory under the current working directory, the tokenizer is fetched
    by its hub name, and the model is constructed from the config, the
    tokenizer's vocabulary size, the checkpoint path and the mask token id.

    Returns:
        A ``(model, tokenizer)`` tuple; ``eval()`` has been called on the model.
    """
    print("Loading Source Model...")

    # Same construction recipe as DLM_emb_model.py.
    config_dir = Path(os.getcwd()) / "configs"
    with initialize_config_dir(config_dir=str(config_dir), version_base=None):
        cfg = compose(config_name="config")

    tokenizer = AutoTokenizer.from_pretrained("ibm-research/materials.selfies-ted")
    vocab_size = len(tokenizer.get_vocab())
    checkpoint = '/data2/tianang/projects/mdlm/Checkpoints_fangping/1-255000-fine-tune.ckpt'

    source_model = MolEmbDLM(cfg, vocab_size, checkpoint, tokenizer.mask_token_id)
    source_model.eval()
    return source_model, tokenizer
def load_hf_model():
    """Load the exported MolEmbDLM and its tokenizer via ``from_pretrained``.

    The primary location is the absolute export directory below. If loading
    from it fails, the error is printed and the current directory (``"."``)
    is tried instead. A tokenizer that was already loaded from the primary
    location is kept; otherwise it is loaded from the fallback location as
    well, so the function never returns without a tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple; ``eval()`` has been called on the model.

    Raises:
        Whatever ``from_pretrained`` raises if the fallback location also
        cannot be loaded.
    """
    print("Loading HF Model...")
    model_path = "/data2/tianang/projects/mdlm/huggingface/huggingface_model"
    fallback_path = "."

    # Sentinel so the handler can tell whether the tokenizer or the model
    # load was the step that failed.
    tokenizer = None
    # We use the same class as the source model, but loaded via from_pretrained.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = MolEmbDLM.from_pretrained(model_path)
    except Exception as e:
        # Deliberately broad: this is a best-effort fallback for any load
        # failure, and the error is reported rather than swallowed.
        print(f"Failed to load HF model: {e}")
        print(f"Falling back to loading from '{fallback_path}'")
        if tokenizer is None:
            # The tokenizer load itself failed; without this the return
            # statement below would hit an unbound local variable.
            tokenizer = AutoTokenizer.from_pretrained(fallback_path)
        model = MolEmbDLM.from_pretrained(fallback_path)
    model.eval()
    return model, tokenizer
def _embed(model, encoded, device):
    """Run ``model`` on tokenizer output ``encoded`` with gradients disabled.

    Only the ``input_ids`` and ``attention_mask`` entries are forwarded, each
    moved to ``device`` first; any other tokenizer outputs are dropped.
    """
    model_inputs = {
        key: tensor.to(device)
        for key, tensor in encoded.items()
        if key in ("input_ids", "attention_mask")
    }
    with torch.no_grad():
        return model(**model_inputs)


def main():
    """Compare embeddings of the source model and its Hugging Face export.

    Both models are loaded, one SELFIES string is tokenized with each model's
    own tokenizer, and the two resulting embeddings are compared element-wise.
    Progress and the verdict are printed to stdout.

    Returns:
        0 if the summed absolute difference is below the tolerance, 1 if the
        embeddings differ or their shapes do not match.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load Source Model
    source_model, source_tokenizer = load_source_model()
    source_model.to(device)

    # Load HF Model
    hf_model, hf_tokenizer = load_hf_model()
    hf_model.to(device)

    # Test input (SELFIES): a small carbonyl-containing molecule. Note this
    # is not ethanol, which would be "[C][C][O]".
    selfies = "[C][C][=O][O]"
    # Put a space between adjacent SELFIES symbols before tokenizing.
    processed_selfies = selfies.replace('][', '] [')
    print(f"Testing with SELFIES: {processed_selfies}")

    # Each model is fed the ids produced by its own tokenizer. The HF export
    # folder ships its own tokenizer files while the source model uses
    # "ibm-research/materials.selfies-ted"; they should be the same, so the
    # resulting input ids are checked for equality below.
    inputs_source = source_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False)
    inputs_hf = hf_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False)
    print(f"Source Input IDs: {inputs_source['input_ids']}")
    print(f"HF Input IDs: {inputs_hf['input_ids']}")
    if not torch.equal(inputs_source['input_ids'], inputs_hf['input_ids']):
        print("WARNING: Tokenizers produced different input IDs!")

    # Run both models (assumes each returns a single embedding tensor --
    # TODO confirm against MolEmbDLM.forward).
    emb_source = _embed(source_model, inputs_source, device)
    emb_hf = _embed(hf_model, inputs_hf, device)
    print(f'Huggingface Embeddings: {emb_hf[0][0]}')
    print(f"Source Emb Shape: {emb_source.shape}")
    print(f"HF Emb Shape: {emb_hf.shape}")

    # Subtracting tensors of different shapes either raises or broadcasts
    # silently; neither gives a meaningful comparison, so report it directly.
    if emb_source.shape != emb_hf.shape:
        print("FAILURE: Embedding shapes differ; cannot compare element-wise.")
        return 1

    # Compare
    abs_diff = torch.abs(emb_source - emb_hf)
    diff = abs_diff.sum().item()
    max_diff = abs_diff.max().item()
    print(f"Sum of Absolute Differences: {diff}")
    print(f"Max Absolute Difference: {max_diff}")

    # Allow small floating point differences. NOTE: the criterion is the
    # *sum* over all elements, so it gets stricter as the embedding grows.
    tolerance = 1e-5
    if diff < tolerance:
        print("SUCCESS: Embeddings are identical (or extremely close).")
        return 0

    print("FAILURE: Embeddings differ significantly.")
    print(f"Source Mean: {emb_source.mean().item()}")
    print(f"HF Mean: {emb_hf.mean().item()}")
    return 1
if __name__ == "__main__":
    # Forward main()'s return value as the process exit status so callers
    # (shell scripts, CI) can detect a failed comparison; a None return maps
    # to exit code 0.
    sys.exit(main())