"""Cleaning and chat-formatting helpers for variant effect prediction (VEP) examples."""

import json
import os
import random
import sys
from typing import Any, Dict, List, Tuple

import torch
from torch.utils.data import Dataset, DataLoader

from bioreason.dataset.utils import torch_to_hf_dataset
from bioreason.models.dl.processing_dl import DLProcessor
from trl.data_utils import maybe_apply_chat_template


def get_format_variant_effect_function(model_name: str) -> Any:
    """
    Look up the formatting function that matches a model name.

    The comparison is case-insensitive: "llm" selects
    ``format_variant_effect_for_llm`` and "dna-llm" selects
    ``format_variant_effect_for_dna_llm``.

    Raises:
        ValueError: If the lowercased name is neither "llm" nor "dna-llm".
    """
    # Built at call time because the formatters are defined further down
    # in this module.
    formatters = {
        "llm": format_variant_effect_for_llm,
        "dna-llm": format_variant_effect_for_dna_llm,
    }
    formatter = formatters.get(model_name.lower())
    if formatter is None:
        raise ValueError(f"Unsupported model name: {model_name}")
    return formatter


def clean_variant_effect_example(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize the answer of a variant effect example in place.

    Only the text before the first ";" is kept; it is then stripped of
    surrounding whitespace and lowercased. The same (mutated) example
    object is returned.
    """
    leading_part, _, _ = example['answer'].partition(";")
    example['answer'] = leading_part.strip().lower()
    return example


def clean_variant_effect_non_snv_example(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize the answer of a variant effect non-SNV example in place.

    Square brackets and single quotes are removed, underscores become
    spaces, and the result is stripped of surrounding whitespace. The same
    (mutated) example object is returned.
    """
    cleaned = example['answer']
    # Applied in this exact order, mirroring a chain of str.replace calls.
    for old, new in (("[", ""), ("]", ""), ("'", ""), ("_", " ")):
        cleaned = cleaned.replace(old, new)
    example['answer'] = cleaned.strip()
    return example


def _build_chat_record(
    question_text: str,
    answer_text: str,
    dna_sequences: List[str],
) -> Dict[str, Any]:
    """
    Assemble the record layout shared by both VEP formatters.

    The user turn carries two DNA placeholder entries followed by the
    question text; the assistant turn carries "Answer: <answer>" both as
    ``reasoning_content`` and as its single text content entry.
    """
    answer_line = f"Answer: {answer_text}"

    # Two independent placeholder dicts, one per DNA sequence slot.
    user_content = [{"type": "dna", "text": None} for _ in range(2)]
    user_content.append({"type": "text", "text": question_text})

    user_turn = {
        "role": "user",
        "content": user_content,
    }
    assistant_turn = {
        "role": "assistant",
        "reasoning_content": answer_line,
        "content": [
            {"type": "text", "text": answer_line},
        ],
    }

    return {
        "prompt": [user_turn, assistant_turn],
        "dna_sequences": dna_sequences,
        "answer": answer_text,
    }


def format_variant_effect_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Format a VEP example into the required chat format for DNA-LLM.

    The stripped question is used as the user text, and the reference and
    variant sequences are passed through (in that order) as
    ``dna_sequences``.
    """
    question_text = example["question"].strip()
    answer_text = example["answer"].strip()
    sequences = [
        example["reference_sequence"],
        example["variant_sequence"],
    ]
    return _build_chat_record(question_text, answer_text, sequences)


def format_variant_effect_for_llm(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Format a VEP example into the required chat format for LLM.

    The reference sequence, variant sequence and question are inlined into
    the user text; ``dna_sequences`` holds two empty strings and the two
    DNA placeholder entries are still emitted in the user content.
    """
    reference = example['reference_sequence']
    variant = example['variant_sequence']
    raw_question = example['question']
    question = (
        f"Reference sequence: {reference}\n"
        f"Variant sequence: {variant}\n"
        f"Question: {raw_question}"
    )
    answer_text = example["answer"].strip()
    return _build_chat_record(question.strip(), answer_text, ["", ""])