ApexOracle / compare_source_vs_hf.py

solve same embedding bug

80ad4cd about 2 months ago

4.03 kB


	import torch
	from transformers import AutoTokenizer
	import sys
	import os
	from hydra import compose, initialize_config_dir
	from pathlib import Path
	import numpy as np

	# Make modules in the current working directory importable; the script is
	# expected to be launched from the ApexOracle directory (see message below).
	sys.path.append(os.getcwd())

	try:
	    from DLM_emb_model import MolEmbDLM
	except ImportError:
	    print("Could not import MolEmbDLM. Make sure you are running from ApexOracle directory.")
	    # Use sys.exit instead of the bare exit() helper: exit() is injected by
	    # the `site` module for interactive use and is not guaranteed to exist
	    # (for example under `python -S`).
	    sys.exit(1)

	def load_source_model():
	    """Build the reference ("source") model directly from its checkpoint.

	    Composes the Hydra config named "config" from the ``configs`` directory
	    under the current working directory, loads the tokenizer with
	    ``AutoTokenizer.from_pretrained``, and constructs ``MolEmbDLM`` from the
	    config, the tokenizer's vocabulary size, the checkpoint path and the
	    tokenizer's mask token id.

	    Returns:
	        A ``(model, tokenizer)`` tuple; ``eval()`` has been called on the model.
	    """
	    print("Loading Source Model...")

	    # Same loading steps as DLM_emb_model.py.
	    config_dir = Path.cwd() / "configs"
	    with initialize_config_dir(config_dir=str(config_dir), version_base=None):
	        config = compose(config_name="config")

	    tokenizer = AutoTokenizer.from_pretrained("ibm-research/materials.selfies-ted")
	    vocab_size = len(tokenizer.get_vocab())

	    checkpoint = '/data2/tianang/projects/mdlm/Checkpoints_fangping/1-255000-fine-tune.ckpt'
	    source_model = MolEmbDLM(config, vocab_size, checkpoint, tokenizer.mask_token_id)
	    source_model.eval()
	    return source_model, tokenizer

	def load_hf_model():
	    """Load the Hugging Face export of the model together with its tokenizer.

	    Both are loaded with ``from_pretrained`` from the exported model
	    directory. If that fails, the model is reloaded from the current
	    directory; the tokenizer is reloaded from there as well, but only when
	    the primary tokenizer load did not succeed.

	    Returns:
	        A ``(model, tokenizer)`` tuple; ``eval()`` has been called on the model.

	    Raises:
	        Exception: whatever ``from_pretrained`` raises if the fallback load
	            from the current directory also fails.
	    """
	    print("Loading HF Model...")
	    model_path = "/data2/tianang/projects/mdlm/huggingface/huggingface_model"

	    # Pre-bind so the except branch can tell whether the tokenizer was
	    # obtained before the failure. Without this, a tokenizer load error left
	    # the name unbound and the final return raised UnboundLocalError.
	    tokenizer = None
	    try:
	        tokenizer = AutoTokenizer.from_pretrained(model_path)
	        # Same class as the source model, but loaded via from_pretrained.
	        model = MolEmbDLM.from_pretrained(model_path)
	    except Exception as e:
	        print(f"Failed to load HF model: {e}")
	        # Fall back to the current directory.
	        model = MolEmbDLM.from_pretrained(".")
	        if tokenizer is None:
	            tokenizer = AutoTokenizer.from_pretrained(".")
	    model.eval()
	    return model, tokenizer

	def main():
	    """Compare embeddings from the source model and its Hugging Face export.

	    Loads both models and their tokenizers, tokenizes one SELFIES string with
	    each tokenizer, runs both models under ``torch.no_grad()``, and prints the
	    summed and maximum absolute difference between the two outputs along with
	    a SUCCESS/FAILURE verdict. Nothing is returned; all results are printed.
	    """
	    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	    # Load Source Model
	    source_model, source_tokenizer = load_source_model()
	    source_model.to(device)

	    # Load HF Model
	    hf_model, hf_tokenizer = load_hf_model()
	    hf_model.to(device)

	    # Test input (SELFIES).
	    # NOTE(review): the previous comment called this molecule ethanol, but
	    # ethanol in SELFIES would be "[C][C][O]" - confirm the intended molecule.
	    selfies = "[C][C][=O][O]"
	    # Put a space between adjacent SELFIES symbols before tokenizing.
	    processed_selfies = selfies.replace('][', '] [')

	    print(f"Testing with SELFIES: {processed_selfies}")

	    # Each model is fed by its own tokenizer: the HF model folder ships its
	    # own tokenizer files while the source side uses the hub tokenizer. They
	    # should agree, so the resulting input ids are compared below.
	    inputs_source = source_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False)
	    inputs_hf = hf_tokenizer(processed_selfies, return_tensors="pt", padding=False, truncation=False)

	    print(f"Source Input IDs: {inputs_source['input_ids']}")
	    print(f"HF Input IDs: {inputs_hf['input_ids']}")

	    if not torch.equal(inputs_source['input_ids'], inputs_hf['input_ids']):
	        print("WARNING: Tokenizers produced different input IDs!")

	    # Only these two tokenizer outputs are forwarded to the models.
	    model_keys = ("input_ids", "attention_mask")

	    # Run Source Model
	    inputs_s = {k: v.to(device) for k, v in inputs_source.items() if k in model_keys}
	    with torch.no_grad():
	        emb_source = source_model(**inputs_s)

	    # Run HF Model
	    inputs_h = {k: v.to(device) for k, v in inputs_hf.items() if k in model_keys}
	    with torch.no_grad():
	        emb_hf = hf_model(**inputs_h)

	    print(f'Huggingface Embeddings: {emb_hf[0][0]}')

	    print(f"Source Emb Shape: {emb_source.shape}")
	    print(f"HF Emb Shape: {emb_hf.shape}")

	    # If the tokenizers disagreed, the outputs may differ in shape. An
	    # element-wise subtraction would then raise or silently broadcast, so
	    # report the mismatch instead of comparing.
	    if emb_source.shape != emb_hf.shape:
	        print("FAILURE: Embedding shapes differ; cannot compare element-wise.")
	        return

	    # Compare (the absolute difference is computed once and reused).
	    abs_diff = torch.abs(emb_source - emb_hf)
	    diff = abs_diff.sum().item()
	    max_diff = abs_diff.max().item()

	    print(f"Sum of Absolute Differences: {diff}")
	    print(f"Max Absolute Difference: {max_diff}")

	    # Allow small floating point differences. Note the threshold applies to
	    # the summed difference, so it tightens as the element count grows.
	    if diff < 1e-5:
	        print("SUCCESS: Embeddings are identical (or extremely close).")
	    else:
	        print("FAILURE: Embeddings differ significantly.")
	        print(f"Source Mean: {emb_source.mean().item()}")
	        print(f"HF Mean: {emb_hf.mean().item()}")

	# Script entry point: run the comparison only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	    main()