# src/ml/download_models.py
"""Download pretrained sequence models and store local copies on disk.

Running this module as a script fetches the ESM-2 (protein) and DNABERT-2
(DNA) checkpoints through ``transformers`` and writes each tokenizer/model
pair into its own directory under ``models/pretrained/``.
"""

from transformers import AutoTokenizer, AutoModel
# NOTE(review): torch is not referenced directly in this file; kept because
# other parts of the project may rely on it being imported here -- confirm.
import torch

# Start with the smaller model for testing (35M parameters).
# Later upgrade to "facebook/esm2_t33_650M_UR50D" (650M parameters) by
# passing it as ``model_name`` to ``download_esm2_model``.
ESM2_MODEL_NAME = "facebook/esm2_t12_35M_UR50D"
ESM2_OUTPUT_DIR = "models/pretrained/esm2"

DNABERT2_MODEL_NAME = "zhihan1996/DNABERT-2-117M"
DNABERT2_OUTPUT_DIR = "models/pretrained/dnabert2"


def _download_and_save(model_name, output_dir, **load_kwargs):
    """Load a tokenizer and model by name and save both into *output_dir*.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        output_dir: Directory passed to ``save_pretrained`` for both the
            tokenizer and the model.
        **load_kwargs: Extra keyword arguments forwarded unchanged to both
            ``from_pretrained`` calls (for example ``trust_remote_code``).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, **load_kwargs)
    model = AutoModel.from_pretrained(model_name, **load_kwargs)

    # Save locally; tokenizer first, then model, matching the original order.
    tokenizer.save_pretrained(output_dir)
    model.save_pretrained(output_dir)


def download_esm2_model(model_name=ESM2_MODEL_NAME, output_dir=ESM2_OUTPUT_DIR):
    """Download ESM-2 model for protein sequences.

    Args:
        model_name: Checkpoint identifier to download. Defaults to the
            35M-parameter ESM-2 checkpoint.
        output_dir: Directory the tokenizer and model are saved into.
    """
    print("Downloading ESM-2 model...")
    _download_and_save(model_name, output_dir)
    print("ESM-2 downloaded successfully!")


def download_dnabert2_model(
    model_name=DNABERT2_MODEL_NAME, output_dir=DNABERT2_OUTPUT_DIR
):
    """Download DNABERT-2 for DNA sequences.

    Args:
        model_name: Checkpoint identifier to download.
        output_dir: Directory the tokenizer and model are saved into.
    """
    print("Downloading DNABERT-2 model...")
    # SECURITY: trust_remote_code=True lets the checkpoint repository supply
    # Python code that is executed locally during loading. Only point
    # ``model_name`` at repositories you trust.
    _download_and_save(model_name, output_dir, trust_remote_code=True)
    print("DNABERT-2 downloaded successfully!")


if __name__ == "__main__":
    download_esm2_model()
    download_dnabert2_model()