from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the ProtGPT2 protein language model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")

# GPT-2-style tokenizers define no pad token by default; reuse the EOS
# token so padding during batched sampling works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

# One-letter codes for the 20 canonical amino acids.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"


def generate_binders(fusion_context, strategy='low_shot', num_candidates=10):
    """Sample candidate binder sequences from ProtGPT2.

    The first ten components of fusion_context['embedding_vector'] are
    mapped to a short amino-acid seed that conditions generation. The
    strategy argument is accepted for interface compatibility but is not
    used by this sampling path.
    """
    # Bucket each embedding component into one of the 20 canonical
    # residues, so the seed contains only valid amino-acid letters.
    seed_values = fusion_context['embedding_vector'][:10]
    seed = ''.join(AMINO_ACIDS[abs(int(x * 10)) % 20] for x in seed_values)
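    # Worked example of the bucketing: x = 0.37 gives abs(int(3.7)) = 3 and
    # AMINO_ACIDS[3] == 'E'; x = -0.53 gives abs(int(-5.3)) = 5 and
    # AMINO_ACIDS[5] == 'G'.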

    # Tokenize the seed. Padding is a no-op for a single sequence but
    # keeps the call safe if seeds are ever batched.
    inputs = tokenizer(seed, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Stochastically sample num_candidates continuations of the seed;
    # torch.no_grad() avoids tracking gradients during inference.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            top_k=950,
            top_p=0.96,
            temperature=1.0,
            max_length=200,
            num_return_sequences=num_candidates,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode each sample and keep only canonical residues; this also strips
    # the newlines ProtGPT2 emits every 60 characters.
    binders = []
    for output in outputs:
        sequence = tokenizer.decode(output, skip_special_tokens=True)
        sequence = ''.join(aa for aa in sequence if aa in AMINO_ACIDS)
        if len(sequence) > 30:  # drop fragments too short to be plausible binders
            binders.append({"sequence": sequence})

    return {"generated_binders": binders}
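

# Example invocation, as a sketch: the real fusion_context would come from
# an upstream embedding step, and the 'embedding_vector' values below are
# made-up placeholders with the shape this function expects.
if __name__ == "__main__":
    example_context = {
        "embedding_vector": [0.12, -0.53, 0.98, 0.04, -0.77,
                             0.31, 0.66, -0.21, 0.45, -0.09],
    }
    result = generate_binders(example_context, num_candidates=5)
    for binder in result["generated_binders"]:
        print(len(binder["sequence"]), binder["sequence"][:60])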