gabrielbianchin committed on
Commit
02a8f7d
·
1 Parent(s): 117e99b

update readme

Browse files
Files changed (1) hide show
  1. README.md +15 -13
README.md CHANGED
@@ -40,21 +40,24 @@ SeqScreen computes cosine similarities between protein and molecule embeddings,
40
 
41
  ```python
42
  from transformers import AutoTokenizer, AutoModel
 
43
  import torch
44
 
45
- # proteins
46
  tokenizer_prot = AutoTokenizer.from_pretrained('facebook/esm2_t36_3B_UR50D')
47
- encoder_prot = AutoModel.from_pretrained('facebook/esm2_t36_3B_UR50D').eval()
 
 
 
 
48
 
49
- proteins = ["MKTFFVLLL", "ABCDE"]
50
- proteins = [" ".join(i) for i in proteins]
51
  inputs_prot = tokenizer_prot(proteins, return_tensors="pt", padding=True)
52
 
53
  with torch.no_grad():
54
- outputs = encoder_prot(**inputs_prot)
55
- hidden = outputs.last_hidden_state[:, :]
56
- mask = inputs_prot['attention_mask'].unsqueeze(-1).float()
57
- prot_rep = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-8)
58
 
59
  # molecules
60
  tokenizer_mol = AutoTokenizer.from_pretrained('SaeedLab/MolDeBERTa-base-123M-mlc')
@@ -64,16 +67,15 @@ molecules = ["NCCc1nc(-c2ccccc2)cs1", "CC(=O)OCC(C)C"]
64
  inputs_mol = tokenizer_mol(molecules, return_tensors="pt", padding=True)
65
 
66
  with torch.no_grad():
67
- outputs = encoder_mol(**inputs_mol)
68
- hidden = outputs.last_hidden_state[:, :]
69
- mask = inputs_mol['attention_mask'].unsqueeze(-1).float()
70
- mol_rep = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-8)
71
 
72
  # seqscreen
73
  seqscreen = AutoModel.from_pretrained('SaeedLab/SeqScreen-Finetuning', trust_remote_code=True).eval()
74
 
75
  with torch.no_grad():
76
- outputs = seqscreen(prot=prot_rep, mol=mol_rep)
77
 
78
  print('Protein embeddings projected:', outputs.prot_rep)
79
  print('Molecule embeddings projected:', outputs.mol_rep)
 
40
 
41
  ```python
42
  from transformers import AutoTokenizer, AutoModel
43
+ from peft import PeftModel
44
  import torch
45
 
46
+ # proteins: ESM2 + LoRA adapter
47
  tokenizer_prot = AutoTokenizer.from_pretrained('facebook/esm2_t36_3B_UR50D')
48
+ backbone = AutoModel.from_pretrained(
49
+ 'facebook/esm2_t36_3B_UR50D',
50
+ torch_dtype=torch.bfloat16
51
+ )
52
+ backbone = PeftModel.from_pretrained(backbone, 'SaeedLab/SeqScreen-lora').eval()
53
 
54
+ proteins = ["MKTFFVLLL", "ACDEFGHIKLM"]
 
55
  inputs_prot = tokenizer_prot(proteins, return_tensors="pt", padding=True)
56
 
57
  with torch.no_grad():
58
+ hidden = backbone(**inputs_prot).last_hidden_state
59
+ mask = inputs_prot['attention_mask'].unsqueeze(-1).float()
60
+ prot_emb = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-8)
 
61
 
62
  # molecules
63
  tokenizer_mol = AutoTokenizer.from_pretrained('SaeedLab/MolDeBERTa-base-123M-mlc')
 
67
  inputs_mol = tokenizer_mol(molecules, return_tensors="pt", padding=True)
68
 
69
  with torch.no_grad():
70
+ hidden = encoder_mol(**inputs_mol).last_hidden_state
71
+ mask = inputs_mol['attention_mask'].unsqueeze(-1).float()
72
+ mol_emb = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-8)
 
73
 
74
  # seqscreen
75
  seqscreen = AutoModel.from_pretrained('SaeedLab/SeqScreen-Finetuning', trust_remote_code=True).eval()
76
 
77
  with torch.no_grad():
78
+ outputs = seqscreen(prot=prot_emb, mol=mol_emb)
79
 
80
  print('Protein embeddings projected:', outputs.prot_rep)
81
  print('Molecule embeddings projected:', outputs.mol_rep)