|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from DLM_emb_model import MolEmbDLM |
|
|
from transformers import AutoTokenizer |
|
|
import torch |
|
|
import selfies as sf |
|
|
|
|
|
# Hugging Face repo id for the ApexOracle molecular-embedding checkpoint.
MODEL_DIR = "Kiria-Nozan/ApexOracle"


# Tokenizer and DLM embedding model come from the same repo so the
# vocabulary and model weights match.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)


model = MolEmbDLM.from_pretrained(MODEL_DIR)


# Inference only: disable dropout / put norm layers in eval mode.
model.eval()


# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


model = model.to(device)
|
|
|
|
|
|
|
|
# Literature-scraped polymer table; each row lists up to six monomer SMILES.
df = pd.read_csv("temp_data/polymers_lit_scraped.csv")

monomer_columns = ["monomer A", "monomer B", "monomer C", "monomer D", "monomer E", "monomer F"]
all_monomers = set()

# Pool every distinct non-null monomer SMILES across whichever monomer
# columns are actually present in this CSV.
for col in monomer_columns:
    if col in df.columns:
        all_monomers.update(df[col].dropna().unique())

print(f"Total unique monomers: {len(all_monomers)}")
|
|
|
|
|
|
|
|
# Translate each SMILES string into its SELFIES representation; molecules
# that selfies cannot encode are reported and skipped.
monomer_selfies = {}
valid_monomers = []

for smiles in all_monomers:
    try:
        encoded = sf.encoder(smiles)
    except Exception as e:
        print(f"Error converting {smiles} to SELFIES: {e}")
    else:
        monomer_selfies[smiles] = encoded
        valid_monomers.append((smiles, encoded))

print(f"Valid monomers for embedding: {len(valid_monomers)}")
|
|
|
|
|
|
|
|
# Embed each valid monomer, one molecule per forward pass.
monomer_embeddings = {}

for smiles, selfies in valid_monomers:
    # The tokenizer expects whitespace-separated SELFIES symbols, so insert
    # a space between adjacent bracketed tokens ("][" -> "] [").
    batch = tokenizer(
        selfies.replace('][', '] ['),
        padding="max_length",
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    )

    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        embeddings = model(
            input_ids=batch["input_ids"],
            # The original expression `mask + 1 - mask` built a tensor of all
            # ones; torch.ones_like states that intent directly (same dtype,
            # device, and values).
            # NOTE(review): an all-ones mask makes the model attend to every
            # PAD position up to max_length=1024 as well as the real tokens.
            # For short molecules the padding dominates, which plausibly
            # explains the identical embeddings diagnosed below.  If padding
            # should be ignored, pass batch["attention_mask"] unchanged —
            # confirm against MolEmbDLM's expected masking convention.
            attention_mask=torch.ones_like(batch["attention_mask"]),
        )

    # Use the first token's vector of the single batch item as the
    # molecule-level representation; move it to CPU as a NumPy array.
    monomer_embeddings[smiles] = embeddings[0][0].cpu().numpy()

print(f"Generated embeddings for {len(monomer_embeddings)} monomers")
print(f"Embedding shape: {list(monomer_embeddings.values())[0].shape}")
|
|
|
|
|
|
|
|
print("\nChecking for identical embeddings...")
embedding_list = list(monomer_embeddings.items())
identical_pairs = []

# Compare every unordered pair of embeddings; a pair is "identical" when
# all components agree to within very tight tolerances.
for idx, (smiles_a, emb_a) in enumerate(embedding_list):
    for smiles_b, emb_b in embedding_list[idx + 1:]:
        if np.allclose(emb_a, emb_b, rtol=1e-09, atol=1e-09):
            identical_pairs.append((smiles_a, smiles_b))
|
|
|
|
|
if identical_pairs:
    print(f"Found {len(identical_pairs)} pairs of identical embeddings:")
    for smiles1, smiles2 in identical_pairs:
        print(f"  {smiles1} <-> {smiles2}")

    print("\nAnalyzing identical embedding groups...")

    # Merge the pairs into connected components: groups of molecules that
    # all share one embedding.
    identical_groups = {}
    processed = set()

    for smiles1, smiles2 in identical_pairs:
        if smiles1 not in processed and smiles2 not in processed:
            group = {smiles1, smiles2}
            # Grow the group until no pair adds a new member.  Fixed: the
            # original made a single pass over identical_pairs, which misses
            # transitive links when pairs arrive in an unlucky order — e.g.
            # (A,B), (C,D), (B,C) left D out of A's group.  Iterating to a
            # fixed point is correct for any pair ordering.
            changed = True
            while changed:
                changed = False
                for other_smiles1, other_smiles2 in identical_pairs:
                    if other_smiles1 in group and other_smiles2 not in group:
                        group.add(other_smiles2)
                        changed = True
                    elif other_smiles2 in group and other_smiles1 not in group:
                        group.add(other_smiles1)
                        changed = True

            # frozenset key de-duplicates groups discovered more than once.
            group_key = frozenset(group)
            if group_key not in identical_groups:
                identical_groups[group_key] = group
            processed.update(group)

    print(f"Found {len(identical_groups)} groups of molecules with identical embeddings:")
    for i, group in enumerate(identical_groups.values(), 1):
        print(f"\nGroup {i} ({len(group)} molecules):")
        for smiles in sorted(group):
            selfies_repr = monomer_selfies.get(smiles, "N/A")
            print(f"  SMILES: {smiles}")
            print(f"  SELFIES: {selfies_repr}")
            print()
else:
    print("No identical embeddings found.")
|
|
|
|
|
|
|
|
# Persist the SMILES -> embedding dict.  np.save pickles a dict into a 0-d
# object array, so it must be read back with np.load(..., allow_pickle=True).
np.save("temp_data/monomer_embeddings.npy", monomer_embeddings)
# Fixed: the original message omitted the temp_data/ directory that the
# file is actually written to.
print("Embeddings saved to temp_data/monomer_embeddings.npy")