gabboud committed on
Commit
4dcb469
·
1 Parent(s): c3ad370

replace fair-esm model access with huggingface hub, modularize and simplify post-processing

Browse files
Files changed (5) hide show
  1. app.py +37 -157
  2. requirements.txt +3 -1
  3. utils/download_models.py +157 -0
  4. utils/handle_files.py +90 -0
  5. utils/pipelines.py +113 -0
app.py CHANGED
@@ -8,164 +8,20 @@ import json
8
  from pathlib import Path
9
  import zipfile
10
  import spaces
 
 
 
11
 
12
- # Load ESM2 model
13
- print("Loading ESM2 model...")
14
- import esm
15
 
16
- # Load the model and alphabet
17
- model_name = "esm2_t33_650M_UR50D"
18
- try:
19
- model, alphabet = esm.pretrained.load_model_and_alphabet_local(model_name)
20
- except:
21
- print(f"Loading {model_name} from HuggingFace...")
22
- model, alphabet = esm.pretrained.load_model_and_alphabet_hub(model_name)
23
 
24
- model = model.eval()
25
- device = "cuda" if torch.cuda.is_available() else "cpu"
26
- model = model.to(device)
27
- batch_converter = alphabet.get_batch_converter()
28
-
29
- print(f"Model loaded on {device}")
30
-
31
-
32
- def parse_fasta_files(fasta_files):
33
- """Parse one or multiple FASTA files and return sequences."""
34
- sequences = []
35
- file_info = {}
36
-
37
- for fasta_file in fasta_files:
38
- file_name = Path(fasta_file.name).stem
39
- file_seqs = []
40
-
41
- try:
42
- for record in SeqIO.parse(fasta_file, "fasta"):
43
- sequences.append((record.id, str(record.seq), file_name))
44
- file_seqs.append(record.id)
45
- file_info[file_name] = file_seqs
46
- except Exception as e:
47
- raise ValueError(f"Error parsing {fasta_file.name}: {str(e)}")
48
-
49
- if not sequences:
50
- raise ValueError("No sequences found in the provided FASTA files.")
51
-
52
- return sequences, file_info
53
-
54
- @spaces.GPU(duration=240)
55
- def generate_embeddings(sequences_batch):
56
- """Generate embeddings for a batch of sequences."""
57
- # Prepare batch for ESM2
58
- batch_labels, batch_strs, batch_tokens = batch_converter(sequences_batch)
59
-
60
- # Move to device
61
- batch_tokens = batch_tokens.to(device)
62
-
63
- # Generate embeddings
64
- with torch.no_grad():
65
- results = model(batch_tokens, repr_layers=[33], return_contacts=False)
66
-
67
- # Extract embeddings (token representations from layer 33)
68
- token_embeddings = results["representations"][33]
69
-
70
- # Get sequence-level embeddings (mean pooling of token embeddings, excluding special tokens)
71
- sequence_embeddings = []
72
- for i, (label, seq) in enumerate(zip(batch_labels, batch_strs)):
73
- # Remove special tokens (first and last)
74
- seq_embedding = token_embeddings[i, 1:len(seq) + 1].mean(dim=0)
75
- sequence_embeddings.append(seq_embedding.cpu().numpy())
76
-
77
- return sequence_embeddings
78
-
79
-
80
- def process_embeddings(fasta_files):
81
- """Main function to process FASTA files and generate embeddings."""
82
- try:
83
- # Parse FASTA files
84
- sequences, file_info = parse_fasta_files(fasta_files)
85
-
86
- # Generate embeddings in batches
87
- batch_size = 8
88
- all_embeddings = {}
89
- status_updates = [f"Processing {len(sequences)} sequences from {len(file_info)} file(s)..."]
90
-
91
- for i in range(0, len(sequences), batch_size):
92
- batch = sequences[i:i + batch_size]
93
- batch_labels = [(seq_id, seq, file_name) for seq_id, seq, file_name in batch]
94
-
95
- status_updates.append(f"Generating embeddings for sequences {i + 1}-{min(i + batch_size, len(sequences))}...")
96
-
97
- # Generate embeddings
98
- embeddings = generate_embeddings([(label, seq) for label, seq, _ in batch_labels])
99
-
100
- # Store embeddings
101
- for (seq_id, seq, file_name), embedding in zip(batch_labels, embeddings):
102
- key = f"{file_name}_{seq_id}"
103
- all_embeddings[key] = {
104
- "sequence_id": seq_id,
105
- "file": file_name,
106
- "sequence_length": len(seq),
107
- "embedding": embedding.tolist()
108
- }
109
-
110
- # Create output files
111
- output_files = []
112
- temp_dir = tempfile.mkdtemp()
113
-
114
- # Save embeddings as NPZ (numpy compressed format)
115
- npz_path = os.path.join(temp_dir, "embeddings.npz")
116
- embeddings_array = {k: np.array(v["embedding"]) for k, v in all_embeddings.items()}
117
- np.savez_compressed(npz_path, **embeddings_array)
118
- output_files.append(npz_path)
119
- status_updates.append(f"Saved compressed embeddings to embeddings.npz")
120
-
121
- # Save metadata as JSON
122
- metadata_path = os.path.join(temp_dir, "metadata.json")
123
- metadata = {
124
- "num_sequences": len(all_embeddings),
125
- "embedding_dim": 1280, # ESM2-650M has 1280-dimensional embeddings
126
- "model": model_name,
127
- "sequences": {k: {
128
- "sequence_id": v["sequence_id"],
129
- "file": v["file"],
130
- "sequence_length": v["sequence_length"]
131
- } for k, v in all_embeddings.items()}
132
- }
133
- with open(metadata_path, 'w') as f:
134
- json.dump(metadata, f, indent=2)
135
- output_files.append(metadata_path)
136
- status_updates.append(f"Saved metadata to metadata.json")
137
-
138
- # Create per-file embedding files
139
- for file_name in file_info.keys():
140
- file_embeddings = {k: v for k, v in embeddings_array.items() if k.startswith(file_name)}
141
- if file_embeddings:
142
- file_npz_path = os.path.join(temp_dir, f"embeddings_{file_name}.npz")
143
- np.savez_compressed(file_npz_path, **file_embeddings)
144
- output_files.append(file_npz_path)
145
- status_updates.append(f"Saved {len(file_embeddings)} embeddings for {file_name}")
146
-
147
- # Create a summary report
148
- summary_path = os.path.join(temp_dir, "summary.txt")
149
- with open(summary_path, 'w') as f:
150
- f.write("ESM2 Protein Sequence Embedding Summary\n")
151
- f.write("=" * 50 + "\n\n")
152
- f.write(f"Model: {model_name}\n")
153
- f.write(f"Device: {device}\n")
154
- f.write(f"Embedding Dimension: 1280\n\n")
155
- f.write(f"Input Files: {len(file_info)}\n")
156
- f.write(f"Total Sequences: {len(all_embeddings)}\n\n")
157
- f.write("Sequences per file:\n")
158
- for file_name, seq_ids in file_info.items():
159
- f.write(f" - {file_name}: {len(seq_ids)} sequences\n")
160
- output_files.append(summary_path)
161
-
162
- status_message = "\n".join(status_updates)
163
- status_message += f"\n\nSuccessfully generated embeddings for {len(all_embeddings)} sequences!"
164
-
165
- return output_files, status_message
166
-
167
- except Exception as e:
168
- raise gr.Error(f"Error processing sequences: {str(e)}")
169
 
170
 
171
  # Create Gradio interface
@@ -214,12 +70,36 @@ with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
214
  label="Download Output Files",
215
  file_count="multiple"
216
  )
217
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  submit_btn.click(
219
- fn=process_embeddings,
220
  inputs=[input_files],
221
  outputs=[download_output, status_output]
222
  )
 
 
223
 
224
  gr.Markdown("""
225
  ### How to use the embeddings:
 
8
  from pathlib import Path
9
  import zipfile
10
  import spaces
11
# Explicit imports instead of the original wildcard import: only these two
# helpers from download_models are used, and star-imports hide provenance.
from utils.download_models import cache_all_models, load_all_models
from utils.handle_files import parse_fasta_files
from utils.pipelines import generate_embeddings, full_embedding_pipeline

print("Downloading ESM2 models...")

# Model id (HF hub repo) -> human-readable name shown in the UI dropdown.
MODELS = {
    "facebook/esm2_t6_8M_UR50D": "ESM2-8M",
    "facebook/esm2_t12_35M_UR50D": "ESM2-35M",
    # "facebook/esm2_t33_650M_UR50D": "ESM2-650M",  # enable when memory allows
}

# Pre-download weights, then load every model into memory keyed by model id.
cache_dirs = cache_all_models(MODELS)
models_and_tokenizers = load_all_models(MODELS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
 
27
  # Create Gradio interface
 
70
  label="Download Output Files",
71
  file_count="multiple"
72
  )
73
+
74
+ with gr.Row():
75
+ model_dropdown = gr.Dropdown(
76
+ choices=list(MODELS.values()),
77
+ value=list(MODELS.values())[0],
78
+ label="Select Model"
79
+ )
80
+
81
+
82
+ model_to_use = gr.State(value=models_and_tokenizers[model_dropdown.value][0])
83
+ tokenizer_to_use = gr.State(value=models_and_tokenizers[model_dropdown.value][1])
84
+
85
+ def pick_model(model_name):
86
+ model_key = [key for key, value in MODELS.items() if value == model_name][0]
87
+ print(f"Selected model: {model_name} ({model_key})")
88
+ return models_and_tokenizers[model_key]
89
+
90
+ model_dropdown.change(
91
+ fn=pick_model,
92
+ inputs=model_dropdown,
93
+ outputs=[model_to_use, tokenizer_to_use]
94
+ )
95
+
96
  submit_btn.click(
97
+ fn=full_embedding_pipeline,
98
  inputs=[input_files],
99
  outputs=[download_output, status_output]
100
  )
101
+
102
+
103
 
104
  gr.Markdown("""
105
  ### How to use the embeddings:
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  torch>=2.0.0
2
- fair-esm>=2.0.0
3
  biopython>=1.81
4
  numpy>=1.21.0
 
 
 
 
1
  torch>=2.0.0
 
2
  biopython>=1.81
3
  numpy>=1.21.0
4
+ huggingface_hub
5
+ transformers
6
+
utils/download_models.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import esm
2
+ import torch
3
+ import huggingface_hub
4
+ from transformers import AutoTokenizer, AutoModel
5
+
6
def cache_model_weights(model_id):
    """
    Download ESM2 model weights to the local HF cache without loading them into memory.

    Called on Space (re)start so the weights are already on disk when inference
    is first requested, without consuming memory for models that may never be
    selected in the current session.

    Parameters:
    -----------
    model_id : str
        Model identifier (e.g., "facebook/esm2_t6_8M_UR50D")

    Returns:
    --------
    str : Path to cached model directory
    """
    cache_dir = huggingface_hub.snapshot_download(model_id)
    print(f"Model {model_id} cached at: {cache_dir}")
    return cache_dir
23
+
24
def cache_all_models(models):
    """
    Pre-download the weights of every model in *models* to the local cache.

    Parameters:
    -----------
    models : dict
        Maps model identifiers (e.g., "facebook/esm2_t6_8M_UR50D") to
        human-readable model names (e.g., "ESM2-8M").

    Returns:
    --------
    dict : model identifier -> cache directory path.
    """
    # One cache_model_weights call per configured model, keyed by model id.
    return {model_id: cache_model_weights(model_id) for model_id in models}
41
+
42
def load_model(model_id):
    """
    Load an ESM model and tokenizer via ``from_pretrained``.

    Initializes from the default HF cache directory, downloading any missing
    files. Intended to be used after ``cache_model_weights`` for control over
    when the (slow) downloads actually happen.

    Parameters:
    -----------
    model_id : str
        Model identifier (e.g., "facebook/esm2_t6_8M_UR50D")

    Returns:
    --------
    tuple : (model, tokenizer), with the model in eval mode on the best device

    Raises:
    -------
    RuntimeError
        If the model or tokenizer cannot be loaded (original cause chained).
    """
    try:
        print(f"Loading {model_id}...")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # output_hidden_states=True so embedding code can read hidden_states[-1].
        model = AutoModel.from_pretrained(
            model_id,
            output_hidden_states=True,
        )
    except Exception as e:
        # Chain the original exception so the root cause stays visible; the
        # old message claimed "from cache" although from_pretrained may download.
        raise RuntimeError(f"Failed to load model {model_id}: {e}") from e

    model = model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    print(f"{model_id} loaded on {device}")
    return model, tokenizer
71
+
72
def load_all_models(models):
    """
    Load every model listed in *models* into memory.

    Parameters:
    -----------
    models : dict
        Maps model identifiers (e.g., "facebook/esm2_t6_8M_UR50D") to
        human-readable model names (e.g., "ESM2-8M").

    Returns:
    --------
    dict : model identifier -> (model, tokenizer) tuple.
    """
    # Delegate to load_model for each configured model id.
    return {model_id: load_model(model_id) for model_id in models}
89
+
90
+
91
+ #def cache_models(models):
92
+ # """
93
+ # Download weights to ESM models in cache to be loaded later.
94
+ # We do not load the models into memory at this stage to avoid using GPU memory for models that are not used in the current session.
95
+ #
96
+ # Parameters:
97
+ # ----------
98
+ # models: dict
99
+ # A dictionary where keys are model identifiers (e.g., "esm2_t6_8M_UR50D") and values are human-readable model names (e.g., "ESM2-8M").
100
+ #
101
+ # Returns:
102
+ # -------
103
+ #
104
+ # """
105
+ # loaded_models = {}
106
+ # for model_id, model_name in models.items():
107
+ # print(f"Loading {model_name}...")
108
+ # try:
109
+ # #load from local cache if avilable, upon startup of space will fail and load from HF
110
+ # model, alphabet = esm.pretrained.load_model_and_alphabet_local(model_id)
111
+ # except:
112
+ # print(f"Loading {model_name} from HuggingFace...")
113
+ # model, alphabet = esm.pretrained.load_model_and_alphabet_hub(model_id)
114
+ #
115
+ # model = model.eval()
116
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
117
+ # model = model.to(device)
118
+ # loaded_models[model_id] = {
119
+ # "model": model,
120
+ # "alphabet": alphabet,
121
+ # "batch_converter": alphabet.get_batch_converter()
122
+ # }
123
+ # print(f"{model_name} loaded on {device}")
124
+ #
125
+ #def download_models(models):
126
+ # """
127
+ # Download weights to ESM models in cache to be loaded later.
128
+ # We do not load the models into memory at this stage to avoid using GPU memory for models that are not used in the current session.
129
+ #
130
+ # Parameters:
131
+ # ----------
132
+ # models: dict
133
+ # A dictionary where keys are model identifiers (e.g., "esm2_t6_8M_UR50D") and values are human-readable model names (e.g., "ESM2-8M").
134
+ #
135
+ # Returns:
136
+ # -------
137
+ #
138
+ # """
139
+ # loaded_models = {}
140
+ # for model_id, model_name in models.items():
141
+ # print(f"Loading {model_name}...")
142
+ # try:
143
+ # #load from local cache if avilable, upon startup of space will fail and load from HF
144
+ # model, alphabet = esm.pretrained.load_model_and_alphabet_local(model_id)
145
+ # except:
146
+ # print(f"Loading {model_name} from HuggingFace...")
147
+ # model, alphabet = esm.pretrained.load_model_and_alphabet_hub(model_id)
148
+ #
149
+ # model = model.eval()
150
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
151
+ # model = model.to(device)
152
+ # loaded_models[model_id] = {
153
+ # "model": model,
154
+ # "alphabet": alphabet,
155
+ # "batch_converter": alphabet.get_batch_converter()
156
+ # }
157
+ # print(f"{model_name} loaded on {device}")
utils/handle_files.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from Bio import SeqIO
3
+
4
+
5
def parse_fasta_files(fasta_files):
    """Parse one or multiple FASTA files and return sequences.

    The entire header line is used as the sequence_id to cope with LigandMPNN's
    omission of a unique sequence ID at the beginning of the header.

    Parameters:
    -----------
    fasta_files : list of str
        Paths to FASTA files to be parsed. Files without a .fasta/.fa
        extension are silently skipped.

    Returns:
    --------
    sequences : list of tuples
        (sequence_id, sequence, file_name) for each sequence found.
    file_info : dict
        Maps file names to the list of sequence IDs contained in each file
        (only files that yielded at least one sequence appear).

    Raises:
    -------
    ValueError
        If a file cannot be parsed, or if no sequences are found at all.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)
        if fasta_file.endswith('.fasta') or fasta_file.endswith('.fa'):
            file_name = Path(fasta_file).stem
            file_seqs = []

            try:
                for record in SeqIO.parse(fasta_file, "fasta"):
                    # Use the entire header line (without '>') as the sequence ID.
                    full_header = record.description
                    sequences.append((full_header, str(record.seq), file_name))
                    file_seqs.append(full_header)
                if file_seqs:
                    file_info[file_name] = file_seqs
            except Exception as e:
                # Bug fix: fasta_file is a plain path string, so the original
                # `fasta_file.name` raised AttributeError and masked the real error.
                raise ValueError(f"Error parsing {fasta_file}: {str(e)}") from e

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info
43
+
44
def parse_fasta_files_from_ligandmpnn(fasta_files):
    """Parse FASTA files in the format generated by LigandMPNN.

    LigandMPNN headers carry no leading sequence ID; instead the header holds
    the source file name, quality metrics, and a design counter such as
    ``id=0``. This parser extracts the value of the ``id=`` field as the
    sequence ID, falling back to ``record.id`` when no such field is present.

    Parameters:
    -----------
    fasta_files : list of str
        Paths to FASTA files to be parsed. Files without a .fasta/.fa
        extension are silently skipped.

    Returns:
    --------
    sequences : list of tuples
        (sequence_id, sequence, file_name) for each sequence found.
    file_info : dict
        Maps file names to the list of sequence IDs contained in each file
        (only files that yielded at least one sequence appear).

    Raises:
    -------
    ValueError
        If a file cannot be parsed, or if no sequences are found at all.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)
        if fasta_file.endswith('.fasta') or fasta_file.endswith('.fa'):
            file_name = Path(fasta_file).stem
            file_seqs = []

            try:
                for record in SeqIO.parse(fasta_file, "fasta"):
                    # Default to Biopython's record.id; override with the
                    # id=<n> field from the header when present.
                    seq_id = record.id
                    if "id=" in record.description:
                        for part in record.description.split():
                            if part.startswith("id="):
                                seq_id = part[3:]  # strip the "id=" prefix
                                break

                    sequences.append((seq_id, str(record.seq), file_name))
                    file_seqs.append(seq_id)
                if file_seqs:
                    file_info[file_name] = file_seqs
            except Exception as e:
                # Bug fix: fasta_file is a plain path string, so the original
                # `fasta_file.name` raised AttributeError and masked the real error.
                raise ValueError(f"Error parsing {fasta_file}: {str(e)}") from e

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info
utils/pipelines.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import torch
3
+ import spaces
4
+ import numpy as np
5
+ from utils.handle_files import parse_fasta_files
6
+ import gradio as gr
7
+ import time
8
+ import random
9
+ import os
10
+
11
@spaces.GPU(duration=240)
def generate_embeddings(sequences_batch, model, tokenizer):
    """Generate embeddings for ESM models using the transformers library.

    Parameters:
    -----------
    sequences_batch : list of str
        A batch of protein sequences for which to generate embeddings.
    model : AutoModel
        Pre-loaded ESM model; must already be on the correct device and have
        been created with ``output_hidden_states=True``.
    tokenizer : AutoTokenizer
        Pre-loaded tokenizer corresponding to the ESM model.

    Returns:
    --------
    np.ndarray of shape (batch_size, embedding_dim)
        Mean-pooled sequence-level embedding for each input sequence.
    """
    # Tokenize the whole batch; padding/truncation make a rectangular tensor.
    device = model.device
    tokens = tokenizer(
        sequences_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True
    ).to(device)

    # Forward pass without gradients — inference only.
    with torch.no_grad():
        results = model(**tokens)

    # Per-token embeddings from the last layer (needs output_hidden_states=True).
    token_embeddings = results.hidden_states[-1]

    # Mean-pool each sequence individually, slicing off the leading special
    # token and everything after the residues. Per-sequence slicing (rather
    # than one vectorized mean) is needed so padding and special tokens of
    # shorter sequences never enter the average.
    # NOTE(review): assumes the tokenizer emits exactly one token per residue
    # so positions 1..len(seq) are the residue tokens — confirm for ESM vocab.
    sequence_embeddings = []
    for i, seq in enumerate(sequences_batch):
        seq_embedding = token_embeddings[i, 1:len(seq) + 1].mean(dim=0)
        sequence_embeddings.append(seq_embedding.cpu().numpy())

    return np.array(sequence_embeddings)
57
+
58
def full_embedding_pipeline(fasta_files, model, tokenizer, batch_size=8):
    """Full pipeline: parse FASTA files and generate embeddings with the given model.

    Parameters:
    -----------
    fasta_files : list of str
        Paths to FASTA files, as provided by the gradio file input.
    model : AutoModel
        Pre-loaded ESM model; must already be on the correct device.
    tokenizer : AutoTokenizer
        Pre-loaded tokenizer corresponding to the ESM model.
    batch_size : int, optional (default 8)
        Number of sequences to process per embedding batch.

    Returns:
    --------
    all_file_paths : list of str
        Paths of the per-file .npz embedding archives, passed to gradio for download.
    status_string : str
        Human-readable processing log for the gradio status box.
    """
    # Parse FASTA files into (sequence_id, sequence, file_name) tuples.
    sequences_info, file_info = parse_fasta_files(fasta_files)

    # Generate embeddings in batches, accumulating a progress log.
    all_embeddings = []
    n_batches = (len(sequences_info) + batch_size - 1) // batch_size
    status_string = f"Processing {len(sequences_info)} sequences from {len(file_info)} file(s) in {n_batches} batches of {batch_size} sequences...\n"

    for start in range(0, len(sequences_info), batch_size):
        batch = sequences_info[start:start + batch_size]
        batch_sequences = [seq for _, seq, _ in batch]

        embeddings = generate_embeddings(batch_sequences, model, tokenizer)
        status_string += f"Generated {len(embeddings)} embeddings for batch {start // batch_size + 1}/{n_batches}\n"
        all_embeddings.extend(embeddings)

    status_string += f"Generated embeddings for all {len(sequences_info)} sequences.\n"

    # Unique per-session output directory so concurrent users don't collide.
    session_hash = random.getrandbits(128)
    time_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    out_dir = f"./outputs/unconditional_generation/session_{session_hash}_{time_stamp}"
    os.makedirs(out_dir, exist_ok=True)

    # One compressed .npz per input file: its embeddings plus their sequence IDs.
    all_file_paths = []
    for file_name in file_info.keys():
        indices = [i for i, (_, _, f) in enumerate(sequences_info) if f == file_name]
        file_embeddings = np.array([all_embeddings[i] for i in indices])
        sequence_ids = [sequences_info[i][0] for i in indices]
        file_path = os.path.join(out_dir, f"{file_name}_embeddings.npz")
        np.savez_compressed(file_path, embeddings=file_embeddings, sequence_ids=sequence_ids)
        status_string += f"Saved compressed embeddings to {file_name}_embeddings.npz\n"
        all_file_paths.append(file_path)

    # Bug fix: return the status log (the documented contract, and what the
    # gradio status output expects) rather than the raw embeddings list.
    return all_file_paths, status_string
113
+