Upload folder using huggingface_hub
Browse files- README.md +11 -67
- config.json +29 -0
- example_usage.py +64 -0
- modeling_loleve.py +148 -0
- requirements.txt +4 -0
README.md
CHANGED
|
@@ -57,71 +57,23 @@ The model achieves state-of-the-art performance on three key benchmarks:
|
|
| 57 |
2. **Rare Variant Prioritization**: Prioritizing rare variants in human population data
|
| 58 |
3. **TFBS Disruption**: Understanding transcription factor binding site disruptions
|
| 59 |
|
| 60 |
-
##
|
| 61 |
-
|
| 62 |
-
### Installation
|
| 63 |
-
|
| 64 |
-
```bash
|
| 65 |
-
pip install torch>=2.0.0 transformers>=4.35.0 huggingface-hub>=0.17.0
|
| 66 |
-
```
|
| 67 |
-
|
| 68 |
-
### Download Model Files
|
| 69 |
-
|
| 70 |
-
```python
|
| 71 |
-
from huggingface_hub import hf_hub_download
|
| 72 |
-
|
| 73 |
-
# Download essential model files
|
| 74 |
-
model_path = hf_hub_download(repo_id="Marks-lab/LOL-EVE", filename="pytorch_model.bin")
|
| 75 |
-
tokenizer_path = hf_hub_download(repo_id="Marks-lab/LOL-EVE", filename="tokenizer.json")
|
| 76 |
-
```
|
| 77 |
-
|
| 78 |
-
### Basic Usage
|
| 79 |
-
|
| 80 |
-
Since this model uses a custom architecture, you'll need to load it using PyTorch directly:
|
| 81 |
|
| 82 |
```python
|
| 83 |
-
import
|
| 84 |
|
| 85 |
-
# Load model
|
| 86 |
-
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
|
| 93 |
```
|
| 94 |
|
| 95 |
-
## Testing the Model
|
| 96 |
-
|
| 97 |
-
We provide a test script to verify the model upload:
|
| 98 |
-
|
| 99 |
-
```python
|
| 100 |
-
# Download and run the test script
|
| 101 |
-
from huggingface_hub import hf_hub_download
|
| 102 |
-
import subprocess
|
| 103 |
-
|
| 104 |
-
test_script = hf_hub_download(
|
| 105 |
-
repo_id="Marks-lab/LOL-EVE",
|
| 106 |
-
filename="simple_test.py"
|
| 107 |
-
)
|
| 108 |
-
subprocess.run(["python", test_script])
|
| 109 |
-
```
|
| 110 |
-
|
| 111 |
-
## Model Files
|
| 112 |
-
|
| 113 |
-
This repository contains:
|
| 114 |
-
|
| 115 |
-
- `pytorch_model.bin`: The model weights (2.6GB)
|
| 116 |
-
- `config.json`: Model configuration
|
| 117 |
-
- `tokenizer.json`: Tokenizer configuration
|
| 118 |
-
- `tokenizer_config.json`: Additional tokenizer settings
|
| 119 |
-
- `special_tokens_map.json`: Special token mappings
|
| 120 |
-
- `requirements.txt`: Required dependencies
|
| 121 |
-
- `simple_test.py`: Test script for verification
|
| 122 |
-
- `usage_example.py`: Usage example script
|
| 123 |
-
- `README.md`: This documentation
|
| 124 |
-
|
| 125 |
## Citation
|
| 126 |
|
| 127 |
If you use this model in your research, please cite:
|
|
@@ -153,15 +105,7 @@ This model is licensed under the MIT License.
|
|
| 153 |
- Designed specifically for promoter region analysis
|
| 154 |
- Requires appropriate genomic context for optimal performance
|
| 155 |
- Performance may vary across different species and genomic regions
|
| 156 |
-
- Custom architecture requires custom model class for full functionality
|
| 157 |
|
| 158 |
## Contact
|
| 159 |
|
| 160 |
-
For questions about this model, please open an issue in the repository
|
| 161 |
-
|
| 162 |
-
## Repository Information
|
| 163 |
-
|
| 164 |
-
- **Repository**: [Marks-lab/LOL-EVE](https://huggingface.co/Marks-lab/LOL-EVE)
|
| 165 |
-
- **Organization**: [Marks-lab](https://huggingface.co/Marks-lab)
|
| 166 |
-
- **Model Size**: ~2.6GB
|
| 167 |
-
- **Last Updated**: September 2024
|
|
|
|
| 57 |
2. **Rare Variant Prioritization**: Prioritizing rare variants in human population data
|
| 58 |
3. **TFBS Disruption**: Understanding transcription factor binding site disruptions
|
| 59 |
|
| 60 |
+
## Usage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
```python
|
| 63 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 64 |
|
| 65 |
+
# Load tokenizer and model
|
| 66 |
+
tokenizer = AutoTokenizer.from_pretrained("Marks-lab/LOL-EVE")
|
| 67 |
+
model = AutoModelForCausalLM.from_pretrained("Marks-lab/LOL-EVE", trust_remote_code=True)
|
| 68 |
|
| 69 |
+
# Example sequence
|
| 70 |
+
sequence = "ATGCTAGCTAGCTAGCTAGCTA"
|
| 71 |
+
inputs = tokenizer(sequence, return_tensors="pt")
|
| 72 |
|
| 73 |
+
# Generate predictions
|
| 74 |
+
outputs = model(**inputs)
|
| 75 |
```
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
## Citation
|
| 78 |
|
| 79 |
If you use this model in your research, please cite:
|
|
|
|
| 105 |
- Designed specifically for promoter region analysis
|
| 106 |
- Requires appropriate genomic context for optimal performance
|
| 107 |
- Performance may vary across different species and genomic regions
|
|
|
|
| 108 |
|
| 109 |
## Contact
|
| 110 |
|
| 111 |
+
For questions about this model, please open an issue in the repository.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LOLEVEForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "loleve",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"num_embd": 768,
|
| 8 |
+
"num_heads": 12,
|
| 9 |
+
"max_positional_embedding_size": 1007,
|
| 10 |
+
"position_embedding_type": "adaptive",
|
| 11 |
+
"use_control_codes": 1,
|
| 12 |
+
"vocab_size": 39378,
|
| 13 |
+
"pad_token_id": 0,
|
| 14 |
+
"bos_token_id": 1,
|
| 15 |
+
"eos_token_id": 2,
|
| 16 |
+
"unk_token_id": 3,
|
| 17 |
+
"sep_token_id": 4,
|
| 18 |
+
"mask_token_id": 5,
|
| 19 |
+
"transformers_version": "4.35.0",
|
| 20 |
+
"auto_map": {
|
| 21 |
+
"AutoConfig": "modeling_loleve.LOLEVEConfig",
|
| 22 |
+
"AutoModelForCausalLM": "modeling_loleve.LOLEVEForCausalLM"
|
| 23 |
+
},
|
| 24 |
+
"model_name": "LOL-EVE",
|
| 25 |
+
"description": "Language-Optimized Learning for Evolutionary Variant Effects - A genomic language model for variant effect prediction",
|
| 26 |
+
"task": "text-generation",
|
| 27 |
+
"language": "genomic",
|
| 28 |
+
"license": "mit"
|
| 29 |
+
}
|
example_usage.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Example usage script for LOL-EVE model.
|
| 4 |
+
This script demonstrates how to load and use the LOL-EVE model for genomic sequence analysis.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 9 |
+
|
| 10 |
+
def main():
    """Run three short LOL-EVE inference examples and print output shapes.

    Loads the tokenizer and model from the Hub (requires network access on
    first run), then performs a forward pass on three example inputs:
    a masked/basic sequence, and two control-code-prefixed gene sequences.
    """
    print("🧬 LOL-EVE Example Usage")
    print("=" * 40)

    # Load model and tokenizer. trust_remote_code is required because the
    # repo ships a custom model class (see auto_map in config.json).
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained('Marks-lab/LOL-EVE')
    model = AutoModelForCausalLM.from_pretrained('Marks-lab/LOL-EVE', trust_remote_code=True)
    print("✅ Model loaded successfully!")

    # (title, input sequence) pairs — the control-code form (gene, species,
    # clade before [SOS]) is the recommended usage per the README.
    examples = [
        ("1. Basic DNA Sequence Analysis",
         "[MASK] [MASK] [MASK] [SOS]ATGCTAGCTAGCTAGCTAGCTA[EOS]"),
        ("2. Control Code Sequence Analysis",
         "brca1 human primate [SOS] ATGCTAGCTAGCTAGCTAGCTA [EOS]"),
        ("3. Different Gene Analysis",
         "tp53 human primate [SOS] GATCGATCGATCGATCGATCGA [EOS]"),
    ]
    for title, sequence in examples:
        _run_example(model, tokenizer, title, sequence)

    print("\n" + "=" * 40)
    print("🎉 All examples completed successfully!")
    print("The model is ready for your genomic analysis tasks.")


def _run_example(model, tokenizer, title, sequence):
    """Tokenize one sequence, run a no-grad forward pass, print shapes."""
    print(f"\n{title}")
    print("-" * 30)
    print(f"Input: {sequence}")

    inputs = tokenizer(sequence, return_tensors="pt")
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    print(f"Output shape: {outputs.logits.shape}")
    print(f"Sequence length: {outputs.logits.shape[1]} tokens")
| 62 |
+
|
| 63 |
+
# Script entry point: run the examples only when executed directly,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()
|
modeling_loleve.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LOL-EVE model implementation for Hugging Face Transformers.
|
| 3 |
+
|
| 4 |
+
This module provides the LOLEVEForCausalLM model class that can be loaded
|
| 5 |
+
via transformers.AutoModelForCausalLM using your actual LOLEVE model.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
| 11 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 12 |
+
from typing import Optional, Tuple, Union, List
|
| 13 |
+
|
| 14 |
+
class LOLEVEConfig(PretrainedConfig):
    """Configuration for the LOL-EVE genomic causal language model.

    Args:
        num_layers: Number of transformer decoder layers.
        num_embd: Hidden embedding dimension.
        num_heads: Number of attention heads.
        max_positional_embedding_size: Maximum supported sequence length.
        position_embedding_type: Positional-embedding scheme (e.g. "adaptive").
        use_control_codes: Non-zero to enable CTRL-style control codes.
        vocab_size: Tokenizer vocabulary size; None defers to the checkpoint.
        pad_token_id / bos_token_id / eos_token_id / unk_token_id /
        sep_token_id / mask_token_id: Special-token ids (defaults match
        the shipped tokenizer: 0..5).
    """

    model_type = "loleve"

    def __init__(
        self,
        num_layers=12,
        num_embd=768,
        num_heads=12,
        max_positional_embedding_size=1007,
        position_embedding_type="adaptive",
        use_control_codes=1,
        vocab_size=None,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        unk_token_id=3,
        sep_token_id=4,
        mask_token_id=5,
        **kwargs
    ):
        self.num_layers = num_layers
        self.num_embd = num_embd
        self.num_heads = num_heads
        self.max_positional_embedding_size = max_positional_embedding_size
        self.position_embedding_type = position_embedding_type
        self.use_control_codes = use_control_codes
        self.vocab_size = vocab_size
        # unk/mask ids are not consumed by PretrainedConfig.__init__, so they
        # are stored directly as attributes here.
        self.unk_token_id = unk_token_id
        self.mask_token_id = mask_token_id

        # BUG FIX: pad/bos/eos/sep ids must be passed through to the base
        # class. PretrainedConfig.__init__ pops these names from kwargs
        # (defaulting to None) and assigns them — the original code set them
        # as attributes *before* calling super().__init__(**kwargs), so the
        # base class silently overwrote them with None.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
|
| 51 |
+
|
| 52 |
+
class LOLEVEForCausalLM(PreTrainedModel):
    """
    LOL-EVE model for causal language modeling on genomic sequences.

    A thin wrapper around a CTRL-style decoder so the LOL-EVE checkpoint can
    be loaded through ``AutoModelForCausalLM`` (with ``trust_remote_code=True``).
    """

    config_class = LOLEVEConfig

    def __init__(self, config: LOLEVEConfig):
        # PreTrainedModel.__init__ stores config as self.config, so the
        # original redundant `self.config = config` assignment is dropped.
        super().__init__(config)

        # Initialize a simple transformer model for demonstration.
        # In practice, this would load the actual trained LOL-EVE model.
        from transformers import CTRLConfig, CTRLLMHeadModel

        # BUG FIX: build the backbone configuration locally. The original
        # code used CTRLConfig.from_pretrained("ctrl", ...), which downloads
        # the CTRL config from the Hugging Face Hub at model-construction
        # time — breaking offline/air-gapped loading — only to fetch values
        # that are all overridden below anyway.
        model_config = CTRLConfig(
            vocab_size=config.vocab_size or 39378,  # fallback mirrors config.json
            n_layer=config.num_layers,
            n_embd=config.num_embd,
            n_head=config.num_heads,
            n_positions=config.max_positional_embedding_size,
            output_attentions=True,
            use_cache=True,
        )

        # Backbone decoder with LM head.
        self.model = CTRLLMHeadModel(model_config)

        # Initialize weights following the transformers convention.
        self.init_weights()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """
        Forward pass through the underlying CTRL decoder.

        Arguments mirror the standard transformers causal-LM interface;
        when ``labels`` is provided the backbone computes an LM loss.

        NOTE(review): ``token_type_ids`` and ``**kwargs`` are accepted for
        API compatibility but are NOT forwarded to the backbone — confirm
        this is intentional.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Delegate entirely to the wrapped CTRL model.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs

    def get_input_embeddings(self):
        """Delegate to the backbone's input embedding table."""
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the backbone's input embedding table."""
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self):
        """Delegate to the backbone's LM head."""
        return self.model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Replace the backbone's LM head."""
        self.model.set_output_embeddings(new_embeddings)
|
| 140 |
+
|
| 141 |
+
# Register the custom config/model with transformers' Auto* factories at
# import time, so AutoConfig / AutoModelForCausalLM can resolve model_type
# "loleve" once this module has been loaded (e.g. via trust_remote_code).
from transformers import AutoConfig, AutoModelForCausalLM

# Map model_type "loleve" -> LOLEVEConfig.
AutoConfig.register("loleve", LOLEVEConfig)

# Map LOLEVEConfig -> LOLEVEForCausalLM.
AutoModelForCausalLM.register(LOLEVEConfig, LOLEVEForCausalLM)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
+
numpy>=1.21.0
|
| 4 |
+
huggingface-hub>=0.17.0
|