gabboud committed on
Commit
ecb4e6c
·
1 Parent(s): 48ea20b

introduce approximate PPL through mask batching

Browse files
Files changed (2) hide show
  1. app.py +12 -2
  2. utils/pipelines.py +123 -7
app.py CHANGED
@@ -87,7 +87,9 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
87
  file_count="multiple"
88
  )
89
  with gr.TabItem("Calculate Pseudo-Perplexity scores"):
90
- ppl_button = gr.Button("Calculate Pseudo-Perplexity", variant="primary", size="lg")
 
 
91
  ppl_status = gr.Textbox(
92
  label="Waiting for pseudo-perplexity calculation...",
93
  interactive=False,
@@ -108,7 +110,9 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
108
  if task == "embedding":
109
  return full_embedding_pipeline(fasta_files, model, tokenizer, batch_size_value)
110
  elif task == "ppl":
111
- return full_ppl_pipeline(fasta_files, model, tokenizer, batch_size_value)
 
 
112
 
113
  submit_btn.click(
114
  fn=run_pipeline_with_selected_model,
@@ -122,6 +126,12 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
122
  outputs=[ppl_download, ppl_status]
123
  )
124
 
 
 
 
 
 
 
125
 
126
 
127
  gr.Markdown("""
 
87
  file_count="multiple"
88
  )
89
  with gr.TabItem("Calculate Pseudo-Perplexity scores"):
90
+ with gr.Row():
91
+ ppl_button = gr.Button("Calculate Exact Pseudo-Perplexity", variant="primary", size="lg")
92
+ ppl_approx_button = gr.Button("Calculate Approximate Pseudo-Perplexity", variant="primary", size="lg")
93
  ppl_status = gr.Textbox(
94
  label="Waiting for pseudo-perplexity calculation...",
95
  interactive=False,
 
110
  if task == "embedding":
111
  return full_embedding_pipeline(fasta_files, model, tokenizer, batch_size_value)
112
  elif task == "ppl":
113
+ return full_ppl_pipeline(fasta_files, model, tokenizer, batch_size_value, mask_percentage=None)
114
+ elif task == "ppl-approx":
115
+ return full_ppl_pipeline(fasta_files, model, tokenizer, batch_size_value, mask_percentage=0.1)
116
 
117
  submit_btn.click(
118
  fn=run_pipeline_with_selected_model,
 
126
  outputs=[ppl_download, ppl_status]
127
  )
128
 
129
+ ppl_approx_button.click(
130
+ fn=run_pipeline_with_selected_model,
131
+ inputs=[input_files, model_dropdown, batch_size, gr.State("ppl-approx")],
132
+ outputs=[ppl_download, ppl_status]
133
+ )
134
+
135
 
136
 
137
  gr.Markdown("""
utils/pipelines.py CHANGED
@@ -146,6 +146,119 @@ def generate_ppl_scores(sequences_batch, model, tokenizer):
146
 
147
  return ppl_scores
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
150
  """Full pipeline to process FASTA files and generate embeddings from desired model.
151
 
@@ -202,7 +315,7 @@ def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
202
 
203
  return all_file_paths, status_string
204
 
205
- def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
206
  """Full pipeline to process FASTA files and generate embeddings from desired model.
207
 
208
  Parameters:
@@ -215,6 +328,8 @@ def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
215
  The pre-loaded tokenizer corresponding to the ESM model.
216
  batch_size : int
217
  The number of sequences to process in each batch when generating embeddings.
 
 
218
 
219
  Returns:
220
  --------
@@ -222,6 +337,7 @@ def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
222
  List of file paths where the per-file embeddings were saved. To be passed to gradio for download.
223
  status_string : str
224
  A string summarizing the processing steps and output files generated, to be displayed in the gradio interface.
 
225
  """
226
  # Parse FASTA files
227
  sequences_info, file_info = parse_fasta_files(fasta_files)
@@ -234,9 +350,12 @@ def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
234
  for i in range(0, len(sequences_info), batch_size):
235
  batch = sequences_info[i:i + batch_size]
236
  batch_sequences = [seq for _, seq, _ in batch]
237
-
238
- ppl_scores = generate_ppl_scores(batch_sequences, model, tokenizer)
239
- status_string += f"Generated {len(ppl_scores)} pseudo-perplexity scores for batch {i // batch_size + 1}/{n_batches}\n"
 
 
 
240
  all_ppl.extend(ppl_scores)
241
 
242
  status_string += f"Generated scores for all {len(sequences_info)} sequences.\n"
@@ -269,6 +388,3 @@ def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
269
  status_string += f">{sequences_info[all_ppl.index(lowest_ppl)][0]}\n"
270
  status_string += f"{sequences_info[all_ppl.index(lowest_ppl)][1]}\n"
271
 
272
-
273
-
274
- return all_file_paths, status_string
 
146
 
147
  return ppl_scores
148
 
149
@spaces.GPU(duration=240)
def generate_ppl_scores_approx(sequences_batch, model, tokenizer, mask_percentage=0.15):
    """Generate approximate pseudo-perplexity scores for ESM models using chunked masking.

    Instead of masking one position per forward pass (exact pseudo-perplexity),
    this masks a chunk of positions simultaneously, trading accuracy for speed:
    roughly 1 / mask_percentage forward passes per batch instead of one per token.

    Parameters:
    -----------
    sequences_batch : list of str
        A batch of sequences for which to compute pseudo-perplexity scores.
    model : AutoModel
        The pre-loaded ESM model. Must already be on the correct device (CPU or GPU).
    tokenizer : AutoTokenizer
        The pre-loaded tokenizer corresponding to the ESM model.
    mask_percentage : float, default=0.15
        Fraction of positions to mask in each forward pass (0 < mask_percentage <= 1).

    Returns:
    --------
    ppl_scores : list of float
        A list of approximate pseudo-perplexity scores for each input sequence.
        Sequences with no scorable positions receive float("inf").

    Raises:
    -------
    ValueError
        If the tokenizer does not define a mask token.
    """
    device = model.device
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("Tokenizer does not define a mask token; cannot compute pseudo-perplexity.")

    tokens = tokenizer(
        sequences_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True
    ).to(device)

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    batch_size = input_ids.size(0)

    # Per-sequence accumulators: summed log-probs and number of scored tokens.
    log_prob_sums = torch.zeros(batch_size, device=device)
    token_counts = torch.zeros(batch_size, device=device)

    # Positions to score per sequence: all attended positions except the first
    # and last (special tokens). Stored as sets for O(1) membership tests in
    # the chunk loop below (a list here made that test O(seq_len) per token).
    position_sets = []
    for i in range(batch_size):
        valid_positions = torch.nonzero(attention_mask[i], as_tuple=False).squeeze(-1)
        if valid_positions.numel() < 3:
            # Too short to contain any real residue between the special tokens.
            position_sets.append(set())
        else:
            position_sets.append(set(valid_positions[1:-1].tolist()))

    max_positions = max((len(s) for s in position_sets), default=0)
    if max_positions == 0:
        return [float("inf")] * batch_size

    # Number of positions masked together per forward pass.
    chunk_size = max(1, int(max_positions * mask_percentage))

    # Union of all scorable positions across the batch, processed in order.
    all_positions = sorted(set().union(*position_sets))

    with torch.no_grad():
        for chunk_start in range(0, len(all_positions), chunk_size):
            chunk_positions = all_positions[chunk_start:chunk_start + chunk_size]

            # Clone the inputs and mask, per sequence, every chunk position
            # that is scorable for that sequence.
            masked_batch = input_ids.clone()
            seq_positions = {i: [] for i in range(batch_size)}
            for pos in chunk_positions:
                for seq_idx in range(batch_size):
                    if pos in position_sets[seq_idx]:
                        seq_positions[seq_idx].append(pos)
                        masked_batch[seq_idx, pos] = mask_token_id

            # Skip forward pass if no sequence has a token in this chunk.
            active_sequences = [i for i, pos_list in seq_positions.items() if pos_list]
            if not active_sequences:
                continue

            # One forward pass scores every masked position in the chunk.
            outputs = model(masked_batch, attention_mask=attention_mask)
            logits = outputs.logits  # (batch_size, seq_len, vocab_size)

            for seq_idx in active_sequences:
                pos_tensor = torch.tensor(seq_positions[seq_idx], device=device)
                true_tokens = input_ids[seq_idx, pos_tensor]
                # Log-probs for all masked positions of this sequence at once,
                # then gather each true token's log-prob at its own position.
                log_probs = torch.log_softmax(logits[seq_idx, pos_tensor], dim=-1)
                true_log_probs = log_probs.gather(-1, true_tokens.unsqueeze(-1)).squeeze(-1)
                log_prob_sums[seq_idx] += true_log_probs.sum()
                token_counts[seq_idx] += pos_tensor.numel()

    # Pseudo-perplexity = exp(mean negative log-likelihood over scored tokens).
    ppl_scores = []
    for i in range(batch_size):
        if token_counts[i] == 0:
            ppl_scores.append(float("inf"))
        else:
            avg_neg_log_prob = -log_prob_sums[i] / token_counts[i]
            ppl_scores.append(torch.exp(avg_neg_log_prob).item())

    return ppl_scores
260
+
261
+
262
  def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
263
  """Full pipeline to process FASTA files and generate embeddings from desired model.
264
 
 
315
 
316
  return all_file_paths, status_string
317
 
318
+ def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size, mask_percentage=None):
319
  """Full pipeline to process FASTA files and generate embeddings from desired model.
320
 
321
  Parameters:
 
328
  The pre-loaded tokenizer corresponding to the ESM model.
329
  batch_size : int
330
  The number of sequences to process in each batch when generating embeddings.
331
+ mask_percentage : float or None
332
+ If None, use the exact PPL calculation (masking one token at a time). If a float between 0 and 1, use the approximate chunked masking method with the specified percentage of tokens masked per forward pass.
333
 
334
  Returns:
335
  --------
 
337
  List of file paths where the per-file embeddings were saved. To be passed to gradio for download.
338
  status_string : str
339
  A string summarizing the processing steps and output files generated, to be displayed in the gradio interface.
340
+
341
  """
342
  # Parse FASTA files
343
  sequences_info, file_info = parse_fasta_files(fasta_files)
 
350
  for i in range(0, len(sequences_info), batch_size):
351
  batch = sequences_info[i:i + batch_size]
352
  batch_sequences = [seq for _, seq, _ in batch]
353
+ if mask_percentage is None:
354
+ ppl_scores = generate_ppl_scores(batch_sequences, model, tokenizer)
355
+ status_string += f"Generated {len(ppl_scores)} pseudo-perplexity scores for batch {i // batch_size + 1}/{n_batches}\n"
356
+ else:
357
+ ppl_scores = generate_ppl_scores_approx(batch_sequences, model, tokenizer, mask_percentage=mask_percentage)
358
+ status_string += f"Generated {len(ppl_scores)} approximate pseudo-perplexity scores for batch {i // batch_size + 1}/{n_batches} with mask percentage {mask_percentage*100:.1f}%\n"
359
  all_ppl.extend(ppl_scores)
360
 
361
  status_string += f"Generated scores for all {len(sequences_info)} sequences.\n"
 
388
  status_string += f">{sequences_info[all_ppl.index(lowest_ppl)][0]}\n"
389
  status_string += f"{sequences_info[all_ppl.index(lowest_ppl)][1]}\n"
390