yinuozhang committed
Commit 3e730f5 · 1 Parent(s): fbb7304

lfs upload

embeddings/binding/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9b08ce28b452e9767dfc7c60bd6285421bdc6b791150a5f55158da89c7bda4f
+ size 15746448
embeddings/fast_embedding_generation.py ADDED
@@ -0,0 +1,115 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ from transformers import AutoModelForMaskedLM
+ from datasets import Dataset
+ from tqdm import tqdm
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+
+ # Configuration
+ MAX_LENGTH = 768
+ BATCH_SIZE = 128  # Adjust based on your GPU memory
+
+ # Setup device
+ if torch.cuda.is_available():
+     device = torch.device('cuda:6')
+     print(f"Using device: {device}")
+ else:
+     device = torch.device('cpu')
+     print(f"CUDA not available. Using device: {device}")
+     print("To use GPU, reinstall PyTorch with CUDA support.")
+
+ # Load tokenizer and model
+ print("Loading tokenizer and model...")
+ tokenizer = SMILES_SPE_Tokenizer(
+     '/scratch/pranamlab/sophtang/home/scoring/PeptideCLM/tokenizer/new_vocab.txt',
+     '/scratch/pranamlab/sophtang/home/scoring/PeptideCLM/tokenizer/new_splits.txt'
+ )
+
+ # Keep only the RoFormer encoder; the masked-LM head is not needed for embeddings.
+ embedding_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+ embedding_model.to(device)
+ embedding_model.eval()
+
+ # Load CSV file
+ print("Loading CSV file...")
+ csv_path = "/scratch/pranamlab/sophtang/home/scoring/functions/nonfouling/combined_nonfouling.csv"
+ df = pd.read_csv(csv_path)
+
+ sequences = df['SMILES'].tolist()
+ labels = df['LABEL'].tolist()
+ print(f"Total sequences: {len(sequences)}")
+ print(f"First sequence: {sequences[0]}")
+
+ # Filter out non-string entries and sequences longer than MAX_LENGTH tokens.
+ # Tokenize without truncation here; with truncation enabled, the length check
+ # would always pass and nothing would be filtered.
+ print("Filtering sequences by length...")
+ valid_data = []
+ for seq, label in zip(sequences, labels):
+     if not isinstance(seq, str):
+         continue
+     tokenized = tokenizer(seq, return_tensors='pt')
+     if tokenized['input_ids'].shape[1] <= MAX_LENGTH:
+         valid_data.append((seq, label))
+
+ filtered_sequences = [item[0] for item in valid_data]
+ filtered_labels = [item[1] for item in valid_data]
+ print(f"Filtered sequences: {len(filtered_sequences)}")
+
+ # Generate embeddings in batches
+ print("Generating embeddings...")
+ def generate_embeddings_batched(sequences, batch_size=BATCH_SIZE):
+     embeddings = []
+
+     for i in tqdm(range(0, len(sequences), batch_size), desc="Processing batches"):
+         batch_sequences = sequences[i:i + batch_size]
+
+         # Tokenize batch
+         tokenized = tokenizer(
+             batch_sequences,
+             return_tensors='pt',
+             padding=True,
+             max_length=MAX_LENGTH,
+             truncation=True
+         )
+
+         # Move to device
+         input_ids = tokenized['input_ids'].to(device)
+         attention_mask = tokenized['attention_mask'].to(device)
+
+         # Generate embeddings
+         with torch.no_grad():
+             outputs = embedding_model(input_ids=input_ids, attention_mask=attention_mask)
+             last_hidden_state = outputs.last_hidden_state
+
+             # Mean pooling over real tokens only: the attention mask zeroes out
+             # padding positions before averaging.
+             mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+             sum_embeddings = torch.sum(last_hidden_state * mask_expanded, dim=1)
+             sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
+             batch_embeddings = (sum_embeddings / sum_mask).cpu().numpy()
+
+         embeddings.append(batch_embeddings)
+
+     return np.vstack(embeddings)
+
+ embeddings = generate_embeddings_batched(filtered_sequences)
+ print(f"Embeddings shape: {embeddings.shape}")
+
+ # Create and save dataset
+ print("Creating dataset...")
+ data = {
+     "sequence": filtered_sequences,
+     "labels": filtered_labels,
+     "embedding": embeddings
+ }
+ dataset = Dataset.from_dict(data)
+
+ output_path = '/scratch/pranamlab/sophtang/home/scoring/data/nonfouling'
+ print(f"Saving dataset to {output_path}...")
+ dataset.save_to_disk(output_path)
+
+ print("✓ Dataset saved successfully!")
+ print(f"  Total samples: {len(dataset)}")
+ print(f"  Embedding dimension: {embeddings.shape[1]}")
embeddings/hemolysis/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bef85bc99bc3c81c99fe290c0b2ef6b0d43f50c0089c59be7bf24219dd428d05
+ size 20965576
embeddings/permeability/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82e749eafb2e903ef2dc47255dbe4e489e6db8055b3ba6af4c876d9b1a0f1b38
+ size 22250496
embeddings/solubility/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36ac428037f8d09d1f45fcd6a61517428c4409638d63230b3ff1d375bdd0e5cb
+ size 106655176
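Note that the `.arrow` files in this commit are Git LFS pointers: a checkout without LFS fetches only the three-line stub (version / oid / size) shown above, not the Arrow data. A small sketch for guarding against that before loading (the `is_lfs_pointer` helper is ours, not part of this repo):

    from pathlib import Path

    LFS_SPEC = 'version https://git-lfs.github.com/spec/v1'

    def is_lfs_pointer(path: Path) -> bool:
        # A pointer stub is a tiny text file whose first line names the LFS spec;
        # real Arrow data is binary and will not decode as UTF-8 text.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return f.readline().strip() == LFS_SPEC
        except (UnicodeDecodeError, OSError):
            return False

    if is_lfs_pointer(Path('embeddings/binding/data-00000-of-00001.arrow')):
        print('Pointer stub only; run `git lfs pull` to fetch the real data.')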