|
|
|
|
|
import torch |
|
|
from transformers import AutoTokenizer |
|
|
import sys |
|
|
import os |
|
|
from hydra import compose, initialize_config_dir |
|
|
from pathlib import Path |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
sys.path.append(os.getcwd()) |
|
|
|
|
|
try: |
|
|
from DLM_emb_model import MolEmbDLM |
|
|
except ImportError: |
|
|
print("Could not import MolEmbDLM. Make sure you are running from ApexOracle directory.") |
|
|
exit(1) |
|
|
|
|
|
def load_source_model(): |
|
|
print("Loading Source Model...") |
|
|
current_directory = Path(os.getcwd()) |
|
|
|
|
|
with initialize_config_dir(config_dir=str(current_directory/"configs"), version_base=None): |
|
|
config = compose(config_name="config") |
|
|
|
|
|
model_name = "ibm-research/materials.selfies-ted" |
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
|
|
|
|
DIT_ckpt_path = '/data2/tianang/projects/mdlm/Checkpoints_fangping/1-255000-fine-tune.ckpt' |
|
|
model = MolEmbDLM(config, len(tokenizer.get_vocab()), DIT_ckpt_path, tokenizer.mask_token_id) |
|
|
model.eval() |
|
|
return model, tokenizer |
|
|
|
|
|
def load_hf_model(): |
|
|
print("Loading HF Model...") |
|
|
model_path = "/data2/tianang/projects/mdlm/huggingface/huggingface_model" |
|
|
|
|
|
try: |
|
|
tokenizer = AutoTokenizer.from_pretrained(model_path) |
|
|
model = MolEmbDLM.from_pretrained(model_path) |
|
|
except Exception as e: |
|
|
print(f"Failed to load HF model: {e}") |
|
|
|
|
|
model = MolEmbDLM.from_pretrained(".") |
|
|
model.eval() |
|
|
return model, tokenizer |
|
|
|
|
|
def main(): |
|
|
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") |
|
|
|
|
|
|
|
|
source_model, source_tokenizer = load_source_model() |
|
|
source_model.to(device) |
|
|
|
|
|
|
|
|
hf_model, hf_tokenizer = load_hf_model() |
|
|
hf_model.to(device) |
|
|
|
|
|
|
|
|
selfies = "[C][C][=O][O]" |
|
|
processed_selfies = selfies.replace('][', '] [') |
|
|
|
|
|
print(f"Testing with SELFIES: {processed_selfies}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inputs_source = source_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False) |
|
|
inputs_hf = hf_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False) |
|
|
|
|
|
print(f"Source Input IDs: {inputs_source['input_ids']}") |
|
|
print(f"HF Input IDs: {inputs_hf['input_ids']}") |
|
|
|
|
|
if not torch.equal(inputs_source['input_ids'], inputs_hf['input_ids']): |
|
|
print("WARNING: Tokenizers produced different input IDs!") |
|
|
|
|
|
|
|
|
inputs_s = {k: v.to(device) for k, v in inputs_source.items() if k in ["input_ids", "attention_mask"]} |
|
|
with torch.no_grad(): |
|
|
emb_source = source_model(**inputs_s) |
|
|
|
|
|
|
|
|
inputs_h = {k: v.to(device) for k, v in inputs_hf.items() if k in ["input_ids", "attention_mask"]} |
|
|
with torch.no_grad(): |
|
|
emb_hf = hf_model(**inputs_h) |
|
|
|
|
|
print(f'Huggingface Embeddings: {emb_hf[0][0]}') |
|
|
|
|
|
print(f"Source Emb Shape: {emb_source.shape}") |
|
|
print(f"HF Emb Shape: {emb_hf.shape}") |
|
|
|
|
|
|
|
|
diff = torch.abs(emb_source - emb_hf).sum().item() |
|
|
max_diff = torch.abs(emb_source - emb_hf).max().item() |
|
|
|
|
|
print(f"Sum of Absolute Differences: {diff}") |
|
|
print(f"Max Absolute Difference: {max_diff}") |
|
|
|
|
|
if diff < 1e-5: |
|
|
print("SUCCESS: Embeddings are identical (or extremely close).") |
|
|
else: |
|
|
print("FAILURE: Embeddings differ significantly.") |
|
|
print(f"Source Mean: {emb_source.mean().item()}") |
|
|
print(f"HF Mean: {emb_hf.mean().item()}") |
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|