jeanq1 committed on
Commit
e3b7ccf
·
verified ·
1 Parent(s): 9d8a569

Upload 3 files

Browse files
Files changed (3) hide show
  1. README(1).md +52 -0
  2. config(2).json +29 -0
  3. model(1).safetensors +3 -0
README(1).md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: feature-extraction
4
+ model_name: InstaDeepAI/IDP-ESM2-8M
5
+ ---
6
+
7
+ # IDP-ESM2-8M
8
+
9
+ **IDP-ESM2-8M** is an ESM2-style encoder for intrinsically disordered protein sequence representation learning, trained on [IDP-Euka-90](https://huggingface.co/datasets/jeanq1/IDP-Euka-90).
10
+ This repository provides a Transformer encoder suitable for extracting **sequence embeddings**.
11
+
12
+ ---
13
+
14
+ ## Quick start: generate embeddings
15
+
16
+ The snippet below loads the tokenizer and model, runs a forward pass on a couple of sequences, and extracts per-token embeddings for each sequence.
17
+
18
+ ```python
19
+ from transformers import AutoTokenizer, AutoModel
20
+ import torch
21
+
22
+ # --- Config ---
23
+ model_name = "InstaDeepAI/IDP-ESM2-8M"
24
+
25
+ # --- Load model and tokenizer ---
26
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
27
+ model = AutoModel.from_pretrained(model_name)
28
+ model.eval()
29
+
30
+ # (optional) use GPU if available
31
+ device = "cuda" if torch.cuda.is_available() else "cpu"
32
+ model.to(device)
33
+
34
+ # --- Input sequences ---
35
+ sequences = [
36
+ "MDDNHYPHHHHNHHNHHSTSGGCGESQFTTKLSVNTFARTHPMIQNDLIDLDLISGSAFTMKSKSQQ",
37
+ "PADRDLSSPFGSTVPGVGPNAAAASNAAAAAAAAATAGSNKHQTPPTTFR",
38
+ ]
39
+
40
+ # --- Tokenize ---
41
+ inputs = tokenizer(
42
+ sequences,
43
+ return_tensors="pt",
44
+ padding=True,
45
+ truncation=True,
46
+ )
47
+ inputs = {k: v.to(device) for k, v in inputs.items()}
48
+
49
+ # --- Forward pass ---
50
+ with torch.no_grad():
51
+ outputs = model(**inputs)
52
+ embeddings = outputs.last_hidden_state # shape: (batch, seq_len, hidden_dim)
config(2).json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "EsmForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "classifier_dropout": null,
7
+ "emb_layer_norm_before": false,
8
+ "esmfold_config": null,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 320,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1280,
14
+ "is_folding_model": false,
15
+ "layer_norm_eps": 1e-05,
16
+ "mask_token_id": 32,
17
+ "max_position_embeddings": 1026,
18
+ "model_type": "esm",
19
+ "num_attention_heads": 20,
20
+ "num_hidden_layers": 6,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "rotary",
23
+ "token_dropout": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.54.1",
26
+ "use_cache": true,
27
+ "vocab_list": null,
28
+ "vocab_size": 33
29
+ }
model(1).safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03234e27ad0c9a7f3f423d0ad391ae2f73c3900da0643c91a64b7f1d42729762
3
+ size 30062544