import math

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def calculate_perplexity(sequence, model, tokenizer):
    """Return the perplexity of a sequence under a causal language model."""
    # Wrap the sequence in end-of-text tokens so it is scored as a complete document
    sequence = "<|endoftext|>" + sequence + "<|endoftext|>"
    input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)
    input_ids = input_ids.to(model.device)
    with torch.no_grad():
        # With labels=input_ids, the forward pass returns the average cross-entropy loss
        outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    return math.exp(loss.item())
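# Note: lower perplexity means the model assigns the sequence higher likelihood;
# the appended <|endoftext|> tokens are included in the score.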
|
|
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    path = "/workspace/sg666/MDpLM/benchmarks/Generation/ProtGPT2"
|
|
    # Load the fine-tuned ProtGPT2 checkpoint and its tokenizer
    model_path = path + "/finetuned_models/checkpoint-4510"
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
    # Sample 100 sequences de novo; eos_token_id=0 is ProtGPT2's <|endoftext|> token.
    # Reuse the already-loaded model and tokenizer instead of reloading from disk.
    protgpt2 = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
    sequences = protgpt2("", max_length=100, do_sample=True, top_k=950,
                         repetition_penalty=1.5, num_return_sequences=100, eos_token_id=0)
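    # Each result is a dict whose 'generated_text' field holds the raw sequence, e.g.
    # [{'generated_text': 'MKVL...\nGAST...'}, ...] (illustrative; ProtGPT2 emits
    # FASTA-style output with a newline every 60 residues)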
|
|
    generated_sequences = []
    perplexities = []

    # Score each generated sequence with the same fine-tuned model
    for item in sequences:
        raw_sequence = item["generated_text"]
        ppl = calculate_perplexity(raw_sequence, model, tokenizer)
        generated_sequences.append(raw_sequence)
        perplexities.append(ppl)
|
|
    # Strip newlines and end-of-text markers so only the amino-acid string remains
    cleaned_sequences = [seq.replace("\n", "").replace("<|endoftext|>", "") for seq in generated_sequences]
|
|
    # Rank sequences from lowest (best) to highest perplexity
    df = pd.DataFrame({"Sequence": cleaned_sequences, "Perplexity": perplexities})
    df.sort_values(by="Perplexity", inplace=True)
|
|
    df.to_csv(path + "/protgpt2_generated_sequences.csv", index=False)
|
|
    avg_generation_ppl = df["Perplexity"].mean()
    print(f"Average de novo generation perplexity: {avg_generation_ppl}")
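    # Note: this averages per-sequence perplexities; pooling token-level losses
    # before exponentiating would give a (different) corpus-level perplexity.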
|
|