gabboud committed on
Commit
ae38197
·
1 Parent(s): a749e8f

fix locally and implement PPL

Browse files
Files changed (4) hide show
  1. app.py +57 -42
  2. requirements.txt +1 -0
  3. utils/download_models.py +4 -3
  4. utils/pipelines.py +164 -3
app.py CHANGED
@@ -10,7 +10,7 @@ import zipfile
10
  import spaces
11
  from utils.download_models import *
12
  from utils.handle_files import parse_fasta_files
13
- from utils.pipelines import generate_embeddings, full_embedding_pipeline
14
 
15
  print("Downloading ESM2 models...")
16
 
@@ -49,32 +49,10 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
49
  - `embeddings_[filename].npz`: Per-file embeddings
50
  """)
51
 
52
- with gr.Row():
53
- with gr.Column():
54
- input_files = gr.File(
55
- label="Upload FASTA files",
56
- file_count="multiple",
57
- file_types=[".fasta", ".fa", ".faa"]
58
- )
59
- submit_btn = gr.Button("Generate Embeddings", variant="primary", size="lg")
60
-
61
- with gr.Column():
62
- status_output = gr.Textbox(
63
- label="Processing Status",
64
- interactive=False,
65
- lines=6
66
- )
67
-
68
- with gr.Row():
69
- download_output = gr.File(
70
- label="Download Output Files",
71
- file_count="multiple"
72
- )
73
-
74
  with gr.Row():
75
  model_dropdown = gr.Dropdown(
76
- choices=list(MODELS.values()),
77
- value=list(MODELS.values())[0],
78
  label="Select Model"
79
  )
80
  batch_size = gr.Slider(
@@ -84,29 +62,66 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
84
  value=32,
85
  label="Batch Size"
86
  )
 
 
 
 
 
 
 
 
87
 
88
-
89
- current_key = [key for key, value in MODELS.items() if value == model_dropdown.value][0]
90
- model_to_use = gr.State(value=models_and_tokenizers[current_key][0])
91
- tokenizer_to_use = gr.State(value=models_and_tokenizers[current_key][1])
92
-
93
- def pick_model(model_name):
94
- model_key = [key for key, value in MODELS.items() if value == model_name][0]
95
- print(f"Selected model: {model_name} ({model_key})")
96
- return models_and_tokenizers[model_key]
97
-
98
- model_dropdown.change(
99
- fn=pick_model,
100
- inputs=model_dropdown,
101
- outputs=[model_to_use, tokenizer_to_use]
102
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  submit_btn.click(
105
- fn=full_embedding_pipeline,
106
- inputs=[input_files, model_to_use, tokenizer_to_use, batch_size],
107
  outputs=[download_output, status_output]
108
  )
109
 
 
 
 
 
 
 
110
 
111
 
112
  gr.Markdown("""
 
10
  import spaces
11
  from utils.download_models import *
12
  from utils.handle_files import parse_fasta_files
13
+ from utils.pipelines import generate_embeddings, full_embedding_pipeline, full_ppl_pipeline
14
 
15
  print("Downloading ESM2 models...")
16
 
 
49
  - `embeddings_[filename].npz`: Per-file embeddings
50
  """)
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  with gr.Row():
53
  model_dropdown = gr.Dropdown(
54
+ choices=list(MODELS.keys()),
55
+ value=list(MODELS.keys())[0],
56
  label="Select Model"
57
  )
58
  batch_size = gr.Slider(
 
62
  value=32,
63
  label="Batch Size"
64
  )
65
+
66
+ with gr.Row():
67
+ with gr.Column():
68
+ input_files = gr.File(
69
+ label="Upload FASTA files",
70
+ file_count="multiple",
71
+ file_types=[".fasta", ".fa", ".faa"]
72
+ )
73
 
74
+
75
+ with gr.Column():
76
+ with gr.Tabs():
77
+ with gr.TabItem("Generate Embeddings"):
78
+ submit_btn = gr.Button("Generate Embeddings", variant="primary", size="lg")
79
+ status_output = gr.Textbox(
80
+ label="Waiting for embeddings generation...",
81
+ interactive=False,
82
+ lines=6
83
+ )
84
+
85
+ download_output = gr.File(
86
+ label="Download Output Files",
87
+ file_count="multiple"
88
+ )
89
+ with gr.TabItem("Calculate Pseudo-Perplexity scores"):
90
+ ppl_button = gr.Button("Calculate Pseudo-Perplexity", variant="primary", size="lg")
91
+ ppl_status = gr.Textbox(
92
+ label="Waiting for pseudo-perplexity calculation...",
93
+ interactive=False,
94
+ lines=6
95
+ )
96
+ ppl_download = gr.File(
97
+ label="Download Pseudo-Perplexity Output",
98
+ file_count="multiple"
99
+ )
100
+
101
+
102
def run_pipeline_with_selected_model(fasta_files, model_key, batch_size_value, task="embedding"):
    """Run the selected pipeline with the model chosen in the dropdown.

    Parameters
    ----------
    fasta_files : list or None
        FASTA file paths from the gradio file input; may be None/empty.
    model_key : str
        Dropdown value; key into the module-level ``models_and_tokenizers`` dict.
    batch_size_value : int
        Number of sequences to process per batch.
    task : str, optional
        Which pipeline to run: "embedding" (default) or "ppl".

    Returns
    -------
    tuple
        ``(files, status_string)`` suitable for the gradio File/Textbox outputs.
    """
    if not fasta_files:
        return gr.update(), "No FASTA files uploaded. Please upload at least one FASTA file for inference."
    model, tokenizer = models_and_tokenizers[model_key]
    if task == "embedding":
        return full_embedding_pipeline(fasta_files, model, tokenizer, batch_size_value)
    if task == "ppl":
        return full_ppl_pipeline(fasta_files, model, tokenizer, batch_size_value)
    # Bug fix: an unknown task previously fell through and returned None,
    # which gradio cannot unpack into the (files, status) output pair.
    return gr.update(), f"Unknown task '{task}'. Expected 'embedding' or 'ppl'."
112
 
113
  submit_btn.click(
114
+ fn=run_pipeline_with_selected_model,
115
+ inputs=[input_files, model_dropdown, batch_size, gr.State("embedding")],
116
  outputs=[download_output, status_output]
117
  )
118
 
119
+ ppl_button.click(
120
+ fn=run_pipeline_with_selected_model,
121
+ inputs=[input_files, model_dropdown, batch_size, gr.State("ppl")],
122
+ outputs=[ppl_download, ppl_status]
123
+ )
124
+
125
 
126
 
127
  gr.Markdown("""
requirements.txt CHANGED
@@ -3,4 +3,5 @@ biopython>=1.81
3
  numpy>=1.21.0
4
  huggingface_hub
5
  transformers
 
6
 
 
3
  numpy>=1.21.0
4
  huggingface_hub
5
  transformers
6
+ pandas
7
 
utils/download_models.py CHANGED
@@ -1,6 +1,6 @@
1
  import torch
2
  import huggingface_hub
3
- from transformers import AutoTokenizer, AutoModel
4
 
5
  def cache_model_weights(model_id):
6
  """
@@ -54,8 +54,8 @@ def load_model(model_id):
54
  """
55
  try:
56
  print(f"Loading {model_id} from local cache...")
57
- tokenizer = AutoTokenizer.from_pretrained(model_id)
58
- model = AutoModel.from_pretrained(
59
  model_id,
60
  output_hidden_states=True,
61
  )
@@ -87,6 +87,7 @@ def load_all_models(models):
87
  return loaded_models
88
 
89
 
 
90
  #def cache_models(models):
91
  # """
92
  # Download weights to ESM models in cache to be loaded later.
 
1
  import torch
2
  import huggingface_hub
3
+ from transformers import AutoTokenizer, AutoModel, EsmForMaskedLM, EsmTokenizer
4
 
5
  def cache_model_weights(model_id):
6
  """
 
54
  """
55
  try:
56
  print(f"Loading {model_id} from local cache...")
57
+ tokenizer = EsmTokenizer.from_pretrained(model_id)
58
+ model = EsmForMaskedLM.from_pretrained(
59
  model_id,
60
  output_hidden_states=True,
61
  )
 
87
  return loaded_models
88
 
89
 
90
+
91
  #def cache_models(models):
92
  # """
93
  # Download weights to ESM models in cache to be loaded later.
utils/pipelines.py CHANGED
@@ -1,12 +1,12 @@
1
  import spaces
2
  import torch
3
- import spaces
4
  import numpy as np
5
  from utils.handle_files import parse_fasta_files
6
  import gradio as gr
7
  import time
8
  import random
9
  import os
 
10
 
11
  @spaces.GPU(duration=240)
12
  def generate_embeddings(sequences_batch, model, tokenizer):
@@ -55,6 +55,97 @@ def generate_embeddings(sequences_batch, model, tokenizer):
55
 
56
  return np.array(sequence_embeddings)
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
59
  """Full pipeline to process FASTA files and generate embeddings from desired model.
60
 
@@ -96,7 +187,7 @@ def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
96
  unique_files = file_info.keys()
97
  session_hash = random.getrandbits(128) # Generate a random hash for this session
98
  time_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
99
- out_dir = f"./outputs/unconditional_generation/session_{session_hash}_{time_stamp}"
100
  os.makedirs(out_dir, exist_ok=True)
101
  all_file_paths = []
102
  for file_name in unique_files:
@@ -109,5 +200,75 @@ def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
109
  all_file_paths.append(file_path)
110
 
111
 
112
- return all_file_paths, all_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
 
 
1
  import spaces
2
  import torch
 
3
  import numpy as np
4
  from utils.handle_files import parse_fasta_files
5
  import gradio as gr
6
  import time
7
  import random
8
  import os
9
+ import pandas as pd
10
 
11
  @spaces.GPU(duration=240)
12
  def generate_embeddings(sequences_batch, model, tokenizer):
 
55
 
56
  return np.array(sequence_embeddings)
57
 
58
@spaces.GPU(duration=240)
def generate_ppl_scores(sequences_batch, model, tokenizer):
    """Compute pseudo-perplexity (PPL) scores for a batch of sequences.

    Every non-special position of every sequence is masked in turn; the
    model's log-probability of the true residue at that position is
    accumulated, and the pseudo-perplexity of a sequence is
    exp(mean negative log-probability). Positions are swept one at a time
    across the whole batch so each forward pass scores all sequences that
    still have a residue at that position.

    Parameters
    ----------
    sequences_batch : list of str
        A batch of sequences to score.
    model : EsmForMaskedLM
        The pre-loaded ESM masked-LM model; must already be on the correct
        device (CPU or GPU) and expose ``.logits`` on its output.
    tokenizer : EsmTokenizer
        The pre-loaded tokenizer corresponding to the ESM model.

    Returns
    -------
    ppl_scores : list of float
        One pseudo-perplexity score per input sequence; ``inf`` for
        sequences too short to have any scoreable position.

    Raises
    ------
    ValueError
        If the tokenizer defines no mask token.
    """
    device = model.device
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("Tokenizer does not define a mask token; cannot compute pseudo-perplexity.")

    tokens = tokenizer(
        sequences_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True
    ).to(device)

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    batch_size = input_ids.size(0)
    seq_len = input_ids.size(1)

    # Per-sequence accumulators: summed log-probs and scored-token counts.
    log_prob_sums = torch.zeros(batch_size, device=device)
    token_counts = torch.zeros(batch_size, device=device)

    # Boolean mask of positions to score: real tokens minus the first and
    # last valid positions (the CLS/EOS special tokens the tokenizer adds).
    # Vectorized replacement for the original per-sequence Python sets.
    score_mask = attention_mask.bool().clone()
    lengths = attention_mask.sum(dim=1)
    score_mask[:, 0] = False
    score_mask[torch.arange(batch_size, device=device), (lengths - 1).clamp(min=0)] = False
    # Fewer than 3 valid tokens means no real residue remains to score.
    score_mask[lengths < 3] = False

    with torch.no_grad():
        # Sweep one masked position at a time across the whole batch.
        for pos in range(1, seq_len - 1):
            active = torch.nonzero(score_mask[:, pos], as_tuple=False).squeeze(-1)
            if active.numel() == 0:
                continue

            # Clone input_ids and mask the current position for the
            # sequences that have a real token there.
            masked_batch = input_ids.clone()
            true_token_ids = masked_batch[active, pos].clone()
            masked_batch[active, pos] = mask_token_id

            # Single forward pass for all sequences at this position.
            outputs = model(masked_batch, attention_mask=attention_mask)
            logits = outputs.logits  # (batch_size, seq_len, vocab_size)

            # Log-probabilities of the true tokens at the masked position.
            log_probs = torch.log_softmax(logits[active, pos], dim=-1)
            true_log_probs = log_probs.gather(1, true_token_ids.unsqueeze(-1)).squeeze(-1)

            # Vectorized accumulation (replaces the per-sequence Python loop).
            log_prob_sums[active] += true_log_probs
            token_counts[active] += 1

    # Pseudo-perplexity = exp(mean negative log-likelihood); inf when a
    # sequence had no scoreable positions.
    ppl_scores = []
    for i in range(batch_size):
        if token_counts[i] == 0:
            ppl_scores.append(float("inf"))
        else:
            avg_neg_log_prob = -log_prob_sums[i] / token_counts[i]
            ppl_scores.append(float(torch.exp(avg_neg_log_prob).item()))

    return ppl_scores
148
+
149
  def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size):
150
  """Full pipeline to process FASTA files and generate embeddings from desired model.
151
 
 
187
  unique_files = file_info.keys()
188
  session_hash = random.getrandbits(128) # Generate a random hash for this session
189
  time_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
190
+ out_dir = f"./outputs/session_{session_hash}_{time_stamp}"
191
  os.makedirs(out_dir, exist_ok=True)
192
  all_file_paths = []
193
  for file_name in unique_files:
 
200
  all_file_paths.append(file_path)
201
 
202
 
203
+ return all_file_paths, status_string
204
+
205
def full_ppl_pipeline(fasta_files, model, tokenizer, batch_size):
    """Full pipeline: parse FASTA files and compute pseudo-perplexity scores.

    Runs ``generate_ppl_scores`` over the parsed sequences in batches and
    writes one ``[filename]_ppl.csv`` per input file (columns: description,
    sequence, ppl_score).

    Parameters
    ----------
    fasta_files : list of str, obtained from gradio file input
        List of paths to FASTA files to be parsed.
    model : EsmForMaskedLM
        The pre-loaded ESM masked-LM model. Must already be on the correct
        device (CPU or GPU).
    tokenizer : EsmTokenizer
        The pre-loaded tokenizer corresponding to the ESM model.
    batch_size : int
        The number of sequences to process in each batch.

    Returns
    -------
    all_file_paths : list of str
        Paths of the per-file PPL CSVs, to be passed to gradio for download.
    status_string : str
        Summary of the processing steps and output files, shown in the UI.
    """
    # Parse FASTA files into (description, sequence, source_file) triples.
    sequences_info, file_info = parse_fasta_files(fasta_files)

    # Score sequences in batches.
    all_ppl = []
    n_batches = (len(sequences_info) + batch_size - 1) // batch_size
    status_string = f"Processing {len(sequences_info)} sequences from {len(file_info)} file(s) in {n_batches} batches of {batch_size} sequences...\n"

    for start in range(0, len(sequences_info), batch_size):
        batch = sequences_info[start:start + batch_size]
        batch_sequences = [seq for _, seq, _ in batch]
        ppl_scores = generate_ppl_scores(batch_sequences, model, tokenizer)
        status_string += f"Generated {len(ppl_scores)} pseudo-perplexity scores for batch {start // batch_size + 1}/{n_batches}\n"
        all_ppl.extend(ppl_scores)

    status_string += f"Generated scores for all {len(sequences_info)} sequences.\n"

    # Unique per-session output directory.
    session_hash = random.getrandbits(128)  # Random hash for this session
    time_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    out_dir = f"./outputs/session_{session_hash}_{time_stamp}"
    os.makedirs(out_dir, exist_ok=True)

    # One CSV per input file, preserving each file's sequence order.
    all_file_paths = []
    for file_name in file_info.keys():
        indices = [i for i, (_, _, f) in enumerate(sequences_info) if f == file_name]
        file_path = os.path.join(out_dir, f"{file_name}_ppl.csv")
        rows = [
            {
                "description": sequences_info[idx][0],
                "sequence": sequences_info[idx][1],
                "ppl_score": all_ppl[idx],
            }
            for idx in indices
        ]
        pd.DataFrame(rows).to_csv(file_path, index=False)
        status_string += f"Saved PPL scores to {file_name}_ppl.csv\n"
        all_file_paths.append(file_path)

    # Bug fix: min() on an empty list raised ValueError when the FASTA
    # files parsed to zero sequences. Also compute the argmin once instead
    # of three list scans with all_ppl.index().
    if all_ppl:
        best_idx = min(range(len(all_ppl)), key=all_ppl.__getitem__)
        best_desc, best_seq, best_file = sequences_info[best_idx]
        status_string += f"Lowest PPL score across all sequences: {all_ppl[best_idx]:.4f}\nfor sequence in file {best_file}:\n"
        status_string += f">{best_desc}\n"
        status_string += f"{best_seq}\n"

    return all_file_paths, status_string