File size: 3,140 Bytes
ffcfc75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import os
import random
import sys
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Any, Dict, List, Tuple

from bioreason.dataset.utils import torch_to_hf_dataset
from bioreason.models.dl.processing_dl import DLProcessor
from trl.data_utils import maybe_apply_chat_template


def get_format_variant_effect_function(model_name: str) -> Any:
    """
    Resolve the example-formatting function for a model type.

    Args:
        model_name: Either "llm" or "dna-llm" (case-insensitive).

    Returns:
        The matching ``format_variant_effect_for_*`` callable.

    Raises:
        ValueError: If ``model_name`` is not a supported model type.
    """
    key = model_name.lower()
    if key == "llm":
        return format_variant_effect_for_llm
    if key == "dna-llm":
        return format_variant_effect_for_dna_llm
    raise ValueError(f"Unsupported model name: {model_name}")
    

def clean_variant_effect_example(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a variant effect example's answer in place.

    Keeps only the first ';'-separated label, trimmed and lowercased.

    Args:
        example: Example dict with a string under ``"answer"``.

    Returns:
        The same dict, mutated with the cleaned ``"answer"``.
    """
    first_label, _, _ = example["answer"].partition(";")
    example["answer"] = first_label.strip().lower()
    return example


def clean_variant_effect_non_snv_example(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a variant effect non-SNV example's answer in place.

    Strips list-literal artifacts ("[", "]", "'"), converts underscores
    to spaces, and trims surrounding whitespace — one C-level pass via
    ``str.translate`` instead of chained ``replace`` calls.

    Args:
        example: Example dict with a string under ``"answer"``.

    Returns:
        The same dict, mutated with the cleaned ``"answer"``.
    """
    cleanup = str.maketrans({"[": "", "]": "", "'": "", "_": " "})
    example["answer"] = example["answer"].translate(cleanup).strip()
    return example


def format_variant_effect_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Format a VEP example into the required chat format for DNA-LLM.

    The user turn carries two DNA placeholder entries (reference and
    variant sequences are passed separately via ``dna_sequences``)
    followed by the question text; the assistant turn carries the answer.

    Args:
        example: Dict with "question", "answer", "reference_sequence",
            and "variant_sequence" string fields.

    Returns:
        A dict with "prompt" (chat turns), "dna_sequences", and "answer".
    """
    answer = example["answer"].strip()
    answer_text = f"Answer: {answer}"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "dna", "text": None},
            {"type": "dna", "text": None},
            {"type": "text", "text": example["question"].strip()},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "reasoning_content": answer_text,
        "content": [{"type": "text", "text": answer_text}],
    }

    return {
        "prompt": [user_turn, assistant_turn],
        "dna_sequences": [
            example["reference_sequence"],
            example["variant_sequence"],
        ],
        "answer": answer,
    }


def format_variant_effect_for_llm(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Format a VEP example into the required chat format for LLM.

    Unlike the DNA-LLM variant, the reference and variant sequences are
    inlined into the question text and ``dna_sequences`` holds two empty
    strings.

    NOTE(review): the two DNA placeholder entries are kept in the user
    content even though no sequences are attached — presumably so the
    processor sees the same structure as the DNA-LLM format; confirm
    against DLProcessor.

    Args:
        example: Dict with "question", "answer", "reference_sequence",
            and "variant_sequence" string fields.

    Returns:
        A dict with "prompt" (chat turns), "dna_sequences" (two empty
        strings), and "answer".
    """
    inlined_question = (
        f"Reference sequence: {example['reference_sequence']}\n"
        f"Variant sequence: {example['variant_sequence']}\n"
        f"Question: {example['question']}"
    )
    answer = example["answer"].strip()
    answer_text = f"Answer: {answer}"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "dna", "text": None},
            {"type": "dna", "text": None},
            {"type": "text", "text": inlined_question.strip()},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "reasoning_content": answer_text,
        "content": [{"type": "text", "text": answer_text}],
    }

    return {
        "prompt": [user_turn, assistant_turn],
        "dna_sequences": ["", ""],
        "answer": answer,
    }