+Made with ❤️ at University of Toronto, Vector Institute, and University Health Network +
diff --git a/BioReason-main/bioreason.egg-info/PKG-INFO b/BioReason-main/bioreason.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..1b0f156459433672bd6d5f2976f0fe642559e568 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/PKG-INFO @@ -0,0 +1,181 @@ +Metadata-Version: 2.4 +Name: bioreason +Version: 0.1.0 +Summary: Bio-related Reasoning with Language Models +License: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.11 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: torch +Requires-Dist: torchvision +Requires-Dist: transformers +Requires-Dist: accelerate +Requires-Dist: qwen-vl-utils +Requires-Dist: jupyter +Requires-Dist: datasets +Requires-Dist: peft +Requires-Dist: pytorch_lightning +Requires-Dist: wandb +Requires-Dist: trl[vllm] +Requires-Dist: bitsandbytes +Requires-Dist: deepspeed +Provides-Extra: dev +Requires-Dist: pytest; extra == "dev" +Requires-Dist: black; extra == "dev" +Requires-Dist: isort; extra == "dev" +Requires-Dist: mypy; extra == "dev" +Dynamic: license-file + ++Made with ❤️ at University of Toronto, Vector Institute, and University Health Network +
diff --git a/BioReason-main/bioreason.egg-info/SOURCES.txt b/BioReason-main/bioreason.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..391fc97522cd46e17ef8ed932a77e26cd6373052 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/SOURCES.txt @@ -0,0 +1,9 @@ +LICENSE +README.md +pyproject.toml +bioreason/__init__.py +bioreason.egg-info/PKG-INFO +bioreason.egg-info/SOURCES.txt +bioreason.egg-info/dependency_links.txt +bioreason.egg-info/requires.txt +bioreason.egg-info/top_level.txt \ No newline at end of file diff --git a/BioReason-main/bioreason.egg-info/dependency_links.txt b/BioReason-main/bioreason.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/BioReason-main/bioreason.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/BioReason-main/bioreason.egg-info/requires.txt b/BioReason-main/bioreason.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dee50c1b476cc5e7f85f7b8e996c57311f348d7 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/requires.txt @@ -0,0 +1,19 @@ +torch +torchvision +transformers +accelerate +qwen-vl-utils +jupyter +datasets +peft +pytorch_lightning +wandb +trl[vllm] +bitsandbytes +deepspeed + +[dev] +pytest +black +isort +mypy diff --git a/BioReason-main/bioreason.egg-info/top_level.txt b/BioReason-main/bioreason.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b9995054ed2dec4763dced82d34176d2c514f30 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/top_level.txt @@ -0,0 +1 @@ +bioreason diff --git a/BioReason-main/bioreason/__init__.py b/BioReason-main/bioreason/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/BioReason-main/bioreason/dataset/__init__.py b/BioReason-main/bioreason/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a318a901b205faa943fd38d702d1001effb1cdd0 --- /dev/null +++ b/BioReason-main/bioreason/dataset/__init__.py @@ -0,0 +1,11 @@ +from .kegg import KEGGDataset, split_kegg_dataset +from .utils import torch_to_hf_dataset, truncate_dna +from .variant_effect import get_format_variant_effect_function + +__all__ = [ + "KEGGDataset", + "split_kegg_dataset", + "torch_to_hf_dataset", + "truncate_dna", + "get_format_variant_effect_function", +] diff --git a/BioReason-main/bioreason/dataset/kegg.py b/BioReason-main/bioreason/dataset/kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..d721e79c9d06bfa3b713d0c0223ebc482ec43fee --- /dev/null +++ b/BioReason-main/bioreason/dataset/kegg.py @@ -0,0 +1,382 @@ +import json +import os +import random +import sys +import torch +from torch.utils.data import Dataset, DataLoader +from typing import Any, Dict, List, Tuple + +from bioreason.dataset.utils import torch_to_hf_dataset +from bioreason.models.dl.processing_dl import DLProcessor +from trl.data_utils import maybe_apply_chat_template + + +class KEGGDataset(Dataset): + """Dataset for KEGG data.""" + + def __init__(self, data_dir: str): + """ + Initialize the dataset by loading all JSON files from the given directory. + + Args: + data_dir: Path to the directory containing JSON files + """ + self.data_dir = data_dir + self.data = [] + + # Load all JSON files + json_files = sorted([f for f in os.listdir(data_dir) if f.endswith(".json")]) + + # Process each file + for filename in json_files: + file_path = os.path.join(data_dir, filename) + kegg_id = filename.split("_")[1] + + with open(file_path, "r", encoding="utf-8") as f: + item = json.load(f) + item["kegg_id"] = kegg_id + processed_item = self._process_item(item) + self.data.append(processed_item) + + def _process_item(self, item: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single data item to format fields as required. + + Args: + item: Original data item from JSON + + Returns: + Processed data item + """ + # Extract question as is + question = item.get("question", "") + + # Convert answer to lowercase and strip whitespace + answer = item.get("answer", "").lower().strip() + + # Combine reasoning steps into a single paragraph with newlines + reasoning_steps = item.get("reasoning", {}).get("reasoning_steps", []) + reasoning = "\n".join(reasoning_steps) + + # Convert sequences to uppercase and strip whitespace + reference_sequence = item.get("reference_sequence", "").upper().strip() + variant_sequence = item.get("variant_sequence", "").upper().strip() + + return { + "question": question, + "answer": answer, + "reasoning": reasoning, + "reference_sequence": reference_sequence, + "variant_sequence": variant_sequence, + } + + def __len__(self) -> int: + """Return the number of items in the dataset.""" + return len(self.data) + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Return a specific item from the dataset.""" + return self.data[idx] + + +def split_kegg_dataset( + dataset: KEGGDataset, + train_ratio: float = 0.8, + val_ratio: float = 0.1, + test_ratio: float = 0.1, + seed: int = 42, +) -> Tuple[KEGGDataset, KEGGDataset, KEGGDataset]: + """ + Split a KEGG dataset into train, validation, and test sets. + + Args: + dataset: The dataset to split + train_ratio: Proportion of data for training + val_ratio: Proportion of data for validation + test_ratio: Proportion of data for testing + batch_size: Batch size for the dataloaders + seed: Random seed for reproducibility + + Returns: + Tuple of (train_dataset, val_dataset, test_dataset) + """ + # Calculate the size of each split + dataset_size = len(dataset) + train_size = int(train_ratio * dataset_size) + val_size = int(val_ratio * dataset_size) + test_size = dataset_size - train_size - val_size + assert train_ratio + val_ratio + test_ratio == 1.0, "Ratios must sum to 1" + + # Set the random seed + torch.manual_seed(seed) + random.seed(seed) + + # Split the dataset + train_dataset, val_dataset, test_dataset = torch.utils.data.random_split( + dataset, [train_size, val_size, test_size] + ) + + return train_dataset, val_dataset, test_dataset + + +def create_kegg_dataloader( + data_dir: str, + batch_size: int = 2, + shuffle: bool = True, + num_workers: int = 2, + pin_memory: bool = True, +) -> DataLoader: + """ + Create a DataLoader for the KEGG dataset. + + Args: + data_dir: Path to the directory containing JSON files + batch_size: Batch size for the dataloader + shuffle: Whether to shuffle the data + num_workers: Number of worker processes for loading data + pin_memory: Whether to pin memory for faster data transfer + + Returns: + DataLoader for the KEGG dataset + """ + dataset = KEGGDataset(data_dir) + return DataLoader( + dataset, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + pin_memory=pin_memory, + ) + + +def get_format_kegg_function(model_name: str) -> Any: + """ + Get the appropriate format function for a given model name. + """ + if model_name.lower() == "llm": + return format_kegg_for_llm + elif model_name.lower() == "dna-llm": + return format_kegg_for_dna_llm + else: + raise ValueError(f"Unsupported model name: {model_name}") + + +def format_kegg_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a KEGG example into the required chat format for DNA-LLM. + """ + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": example["question"].strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": example["reasoning"].strip(), + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + example["reference_sequence"], + example["variant_sequence"], + ], + "answer": example["answer"], + } + + +def format_kegg_for_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a KEGG example into the required chat format for LLM. + """ + question = f"Reference sequence: {example['reference_sequence']}\nVariant sequence: {example['variant_sequence']}\nQuestion: {example['question']}" + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": question.strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": example["reasoning"].strip(), + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + "", + "", + ], + "answer": example["answer"], + } + + +def qwen_dna_collate_fn( + examples: List[Dict], + processor: DLProcessor, + max_length_text: int, + max_length_dna: int, + return_answer_in_batch: bool = False, +) -> Dict: + """ + Custom collate function for Qwen DNA models. + + Creates a batch with proper labels for supervised fine-tuning where only + the assistant responses contribute to the loss calculation. + """ + prompts_text = [ + maybe_apply_chat_template(example, processor)["prompt"] for example in examples + ] + batch_dna_sequences = [example["dna_sequences"] for example in examples] + + batch = processor( + text=prompts_text, + batch_dna_sequences=batch_dna_sequences, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + max_length_text=max_length_text, + max_length_dna=max_length_dna, + ) + + # Create labels tensor filled with -100 (ignored in loss calculation) + labels = torch.full_like(batch["input_ids"], -100) + + # Get token IDs for special markers + assistant_start_marker = "<|im_start|>assistant\n" + im_end_marker = "<|im_end|>" + + assistant_start_token_ids = processor.tokenizer.encode( + assistant_start_marker, add_special_tokens=False + ) + im_end_token_ids = processor.tokenizer.encode( + im_end_marker, add_special_tokens=False + ) + + # Convert token arrays to tensors for faster comparison + assistant_marker_tensor = torch.tensor( + assistant_start_token_ids, device=batch["input_ids"].device + ) + im_end_marker_tensor = torch.tensor( + im_end_token_ids, device=batch["input_ids"].device + ) + + # Get dimensions for easier reference + assistant_marker_len = len(assistant_start_token_ids) + im_end_marker_len = len(im_end_token_ids) + + # For each sequence in the batch + for i in range(batch["input_ids"].shape[0]): + input_ids = batch["input_ids"][i] + seq_len = input_ids.size(0) + + # Track assistant sections + assistant_sections = [] + + # Find all assistant start markers + start_positions = [] + for pos in range(seq_len - assistant_marker_len + 1): + if torch.all( + input_ids[pos : pos + assistant_marker_len] == assistant_marker_tensor + ): + start_positions.append( + pos + assistant_marker_len + ) # Store position after marker + + # Find all end markers + end_positions = [] + for pos in range(seq_len - im_end_marker_len + 1): + if torch.all( + input_ids[pos : pos + im_end_marker_len] == im_end_marker_tensor + ): + end_positions.append(pos) # Store position at start of end marker + + # Match start and end markers to create sections + for start_pos in start_positions: + # Find the next end marker after this start position + valid_ends = [pos for pos in end_positions if pos > start_pos] + if valid_ends: + end_pos = min(valid_ends) # Take the first end marker after start + # Only include content between markers (not the markers themselves) + if start_pos < end_pos: + assistant_sections.append((start_pos, end_pos)) + else: + # If no end marker, assume the section runs to the end of the sequence + assistant_sections.append((start_pos, seq_len)) + + # Set labels for all identified assistant sections + for start_pos, end_pos in assistant_sections: + if start_pos < end_pos and start_pos < seq_len: + end_pos = min(end_pos, seq_len) # Safety check + labels[i, start_pos:end_pos] = input_ids[start_pos:end_pos] + + # Also mask padding tokens + labels[batch["input_ids"] == processor.tokenizer.pad_token_id] = -100 + + # Add labels to batch + batch["labels"] = labels + + # Add answer to batch + if return_answer_in_batch: + batch["answer"] = [example["answer"].strip() for example in examples] + + return batch + + +def dna_collate_fn( + batch: List[Dict[str, Any]], + dna_tokenizer: Any, + label2id: Dict[str, int], + max_length: int = 2048, +) -> Dict[str, Any]: + """ + Custom collate function for DNA models. + """ + ref_sequences = [item["reference_sequence"] for item in batch] + alt_sequences = [item["variant_sequence"] for item in batch] + + # Tokenize DNA sequences separately + tokenized_ref = dna_tokenizer( + ref_sequences, + padding=True, + truncation=True, + max_length=max_length, + return_tensors="pt", + ) + + tokenized_alt = dna_tokenizer( + alt_sequences, + padding=True, + truncation=True, + max_length=max_length, + return_tensors="pt", + ) + + # Get labels + labels = [] + for item in batch: + label = label2id[item["answer"]] + labels.append(label) + + # Create labels tensor + labels_tensor = torch.tensor(labels, dtype=torch.long) + + tokenized_batch = { + "ref_ids": tokenized_ref.input_ids, + "ref_attention_mask": tokenized_ref.attention_mask, + "alt_ids": tokenized_alt.input_ids, + "alt_attention_mask": tokenized_alt.attention_mask, + "labels": labels_tensor, + } + + return tokenized_batch diff --git a/BioReason-main/bioreason/dataset/utils.py b/BioReason-main/bioreason/dataset/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..25488fe729ff5b34be0488ddc5fdf2dbd6cb9993 --- /dev/null +++ b/BioReason-main/bioreason/dataset/utils.py @@ -0,0 +1,59 @@ +from datasets import Dataset as HFDataset +from torch.utils.data import Dataset as TorchDataset +from typing import Dict, Any, Union, List + + +def truncate_dna( + example: Dict[str, Any], truncate_dna_per_side: int = 1024 +) -> Dict[str, Any]: + """ + Truncate DNA sequences by removing a specified number of base pairs from both ends. + If the sequence is too short, it will return the middle portion. + """ + for key in ["reference_sequence", "variant_sequence"]: + sequence = example[key] + seq_len = len(sequence) + + if seq_len > 2 * truncate_dna_per_side + 8: + example[key] = sequence[truncate_dna_per_side:-truncate_dna_per_side] + + return example + + +def torch_to_hf_dataset(torch_dataset: TorchDataset) -> HFDataset: + """ + Convert a PyTorch Dataset to a Hugging Face Dataset. + + This function takes a PyTorch Dataset and converts it to a Hugging Face Dataset + by extracting all items and organizing them into a dictionary structure that + can be used to create a Hugging Face Dataset. + + Args: + torch_dataset: A PyTorch Dataset object to be converted + + Returns: + A Hugging Face Dataset containing the same data as the input PyTorch Dataset + """ + # Get first item to determine structure + if len(torch_dataset) == 0: + return HFDataset.from_dict({}) + + first_item = torch_dataset[0] + + # Initialize dictionary based on first item's keys + data_dict = ( + {k: [] for k in first_item.keys()} + if isinstance(first_item, dict) + else {"data": []} + ) + + # Populate dictionary + for i in range(len(torch_dataset)): + item = torch_dataset[i] + if isinstance(item, dict): + for k in data_dict: + data_dict[k].append(item[k]) + else: + data_dict["data"].append(item) + + return HFDataset.from_dict(data_dict) diff --git a/BioReason-main/bioreason/dataset/variant_effect.py b/BioReason-main/bioreason/dataset/variant_effect.py new file mode 100644 index 0000000000000000000000000000000000000000..f36b4a29943b9026cadfd2916fa4dc0e70f1722c --- /dev/null +++ b/BioReason-main/bioreason/dataset/variant_effect.py @@ -0,0 +1,98 @@ +import json +import os +import random +import sys +import torch +from torch.utils.data import Dataset, DataLoader +from typing import Any, Dict, List, Tuple + +from bioreason.dataset.utils import torch_to_hf_dataset +from bioreason.models.dl.processing_dl import DLProcessor +from trl.data_utils import maybe_apply_chat_template + + +def get_format_variant_effect_function(model_name: str) -> Any: + """ + Get the appropriate format function for a given model name. + """ + if model_name.lower() == "llm": + return format_variant_effect_for_llm + elif model_name.lower() == "dna-llm": + return format_variant_effect_for_dna_llm + else: + raise ValueError(f"Unsupported model name: {model_name}") + + +def clean_variant_effect_example(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Clean a variant effect example. + """ + example['answer'] = example['answer'].split(";")[0].strip().lower() + return example + + +def clean_variant_effect_non_snv_example(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Clean a variant effect non-SNV example. + """ + example['answer'] = example['answer'].replace("[", "").replace("]", "").replace("'", "").replace("_", " ").strip() + return example + + +def format_variant_effect_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a VEP example into the required chat format for DNA-LLM. + """ + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": example["question"].strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": f"Answer: {example['answer'].strip()}", + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + example["reference_sequence"], + example["variant_sequence"], + ], + "answer": example["answer"].strip(), + } + + +def format_variant_effect_for_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a VEP example into the required chat format for LLM. + """ + question = f"Reference sequence: {example['reference_sequence']}\nVariant sequence: {example['variant_sequence']}\nQuestion: {example['question']}" + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": question.strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": f"Answer: {example['answer'].strip()}", + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + "", + "", + ], + "answer": example["answer"].strip(), + } \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/__init__.py b/BioReason-main/bioreason/dna_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2b166dfc42880b3737646adb004e43873633d9 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/__init__.py @@ -0,0 +1,4 @@ +from .dna_module import DNABaseModule +from .nucleotide_module import NucleotideDNAModule + +__all__ = ["DNABaseModule", "NucleotideDNAModule"] \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/dna_module.py b/BioReason-main/bioreason/dna_modules/dna_module.py new file mode 100644 index 0000000000000000000000000000000000000000..679d92745fec46687e73d99e5ade6f50a54c4811 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/dna_module.py @@ -0,0 +1,49 @@ +from abc import ABC, abstractmethod +from typing import Dict, Any, Union +import torch + +class DNABaseModule(ABC): + def __init__(self): + super().__init__() + + @abstractmethod + def get_dnallm_key(self): + pass + + @abstractmethod + def get_model_class(self, model_id: str, model_init_kwargs: dict): + pass + + def post_model_init(self, model, processing_class): + pass + + def is_embeds_input(self): + return False + + @abstractmethod + def get_processing_class(self): + pass + + @abstractmethod + def get_dnallm_modules_keywords(self): + pass + + @abstractmethod + def get_custom_multimodal_keywords(self): + pass + + @abstractmethod + def get_non_generate_params(self): + pass + + @abstractmethod + def get_custom_processing_keywords(self): + pass + + @abstractmethod + def prepare_prompt(self, processing_class, inputs: dict[str, Union[torch.Tensor, Any]]): + pass + + @abstractmethod + def prepare_model_inputs(self, processing_class, prompts_text, images, return_tensors, padding, padding_side, add_special_tokens): + pass \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/nucleotide_module.py b/BioReason-main/bioreason/dna_modules/nucleotide_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ef40652ddbbfd817435460ae2d9033e823cd5096 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/nucleotide_module.py @@ -0,0 +1,263 @@ +from transformers import ( + Qwen2_5_VLForConditionalGeneration, + Qwen2VLForConditionalGeneration, + AutoProcessor, +) +from typing import Dict, Any, Union, List, Optional, Callable, Type +from trl.data_utils import maybe_apply_chat_template +from trl import SFTTrainer +import torch + +from bioreason.dna_modules.dna_module import DNABaseModule +from bioreason.models.dna_llm import DNALLMModel +from bioreason.models.dl.processing_dl import DLProcessor + + +class NucleotideDNAModule(DNABaseModule): + """ + DNA module implementation for NucleotideTransformer-based models. + + This module provides the interface between DNA-LLM models and the training + infrastructure, handling model loading, processing setup, and reward functions. + """ + + def __init__(self): + """Initialize the NucleotideDNAModule.""" + super().__init__() + + def get_dnallm_key(self) -> str: + """ + Get the key identifier for this DNA-LLM implementation. + + Returns: + String identifier for this module type + """ + return "qwen" + + def get_model_class(self, model_id: str, model_init_kwargs: Dict[str, Any]) -> Type: + """ + Return the appropriate model class based on model ID. + + Args: + model_id: Identifier for the model + model_init_kwargs: Initialization arguments for the model + + Returns: + The model class to instantiate + + Raises: + ValueError: If the model is not supported + """ + if "DNALLM" in model_id: + model_cls = DNALLMModel + else: + raise ValueError(f"Unsupported model: {model_id}") + return model_cls + + def post_model_init(self, model: Any, processing_class: Any) -> None: + """ + Perform any post-initialization setup on the model. + + Args: + model: The initialized model + processing_class: The processor for the model + """ + # No post-init needed for this implementation + pass + + def get_processing_class(self) -> Type: + """ + Get the processing class to use with this DNA-LLM model. + + Returns: + The processing class + """ + return DLProcessor + + def get_dnallm_modules_keywords(self) -> List[str]: + """ + Get keywords to identify DNA-specific modules in the model. + + Used to exclude DNA modules from LoRA adaptation during training. + + Returns: + List of keywords that identify DNA modules + """ + return ["dna"] + + def get_custom_multimodal_keywords(self) -> List[str]: + """ + Get keywords for multimodal inputs that should be passed to the model. + + Returns: + List of input keywords for multimodal processing + """ + return ["dna_tokenized", "batch_idx_map"] + + def get_non_generate_params(self) -> List[str]: + """ + Get parameter names that should be excluded from generation. + + Returns: + List of parameter names to exclude from generation calls + """ + return [] + + def get_custom_processing_keywords(self) -> List[tuple]: + """ + Get custom processing keywords for the processor. + + Returns: + List of (component, parameter) tuples for custom processing + """ + return [("dna_tokenizer", "max_length")] + + def prepare_prompt( + self, processing_class: Any, inputs: List[Dict[str, Union[torch.Tensor, Any]]] + ) -> List[str]: + """ + Prepare prompts from input examples. + + Args: + processing_class: The processor to use + inputs: List of input examples + + Returns: + List of prepared prompts + """ + prompts_text = [ + maybe_apply_chat_template(example, processing_class)["prompt"] + for example in inputs + ] + return prompts_text + + def prepare_model_inputs( + self, + processing_class: Any, + model: Any, + prompts_text: List[str], + batch_dna_sequences: List[List[str]], + return_tensors: str = "pt", + padding: bool = True, + padding_side: str = "left", + add_special_tokens: bool = False, + ) -> Dict[str, Any]: + """ + Prepare inputs for the model. + + Args: + processing_class: The processor to use + model: The model to prepare inputs for + prompts_text: List of text prompts + batch_dna_sequences: List of lists of DNA sequences + return_tensors: Return format for tensors + padding: Whether to pad inputs + padding_side: Side to pad on + add_special_tokens: Whether to add special tokens + + Returns: + Processed inputs for the model + """ + # Handle DataParallel wrapped models by accessing the module attribute if needed + max_length_text = model.max_length_text if not hasattr(model, 'module') else model.module.max_length_text + max_length_dna = model.max_length_dna if not hasattr(model, 'module') else model.module.max_length_dna + + prompt_inputs = processing_class( + text=prompts_text, + batch_dna_sequences=batch_dna_sequences, + return_tensors=return_tensors, + padding=padding, + padding_side=padding_side, + add_special_tokens=add_special_tokens, + max_length_text=max_length_text, + max_length_dna=max_length_dna, + ) + + return prompt_inputs + + def is_embeds_input(self) -> bool: + """ + Whether the model uses embeddings as input (instead of token IDs). + + Returns: + Boolean indicating if the model takes embedding inputs + """ + return True + + @staticmethod + def get_question_template() -> str: + """ + Get the template for formatting questions. + + Returns: + String template for questions + """ + return "{Question}" + + @staticmethod + def format_reward_rec(completions: List[Dict[str, Any]], **kwargs) -> List[float]: + """ + Check if the Qwen model output matches a specific format. + + Args: + completions: List of model completions + **kwargs: Additional arguments + + Returns: + List of reward scores (1.0 for match, 0.0 for no match) + """ + import re + import os + from datetime import datetime + + # Pattern to match the expected output format + pattern = r"| \n", + " | CHROM | \n", + "POS | \n", + "REF | \n", + "ALT | \n", + "LABEL | \n", + "SOURCE | \n", + "CONSEQUENCE | \n", + "ID | \n", + "REVIEW_STATUS | \n", + "GENE | \n", + "split | \n", + "INT_LABEL | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 1 | \n", + "chr1 | \n", + "1050449 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1284257 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 2 | \n", + "chr1 | \n", + "1050575 | \n", + "G | \n", + "C | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "18241 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 3 | \n", + "chr1 | \n", + "1213738 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "96692 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 4 | \n", + "chr1 | \n", + "1232279 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "initiatior_codon_variant,missense_variant | \n", + "60484 | \n", + "criteria_provided,_multiple_submitters,_no_con... | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 22249 | \n", + "chrY | \n", + "2787412 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9747 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 22250 | \n", + "chrY | \n", + "2787426 | \n", + "C | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9739 | \n", + "criteria_provided,_single_submitter | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 22251 | \n", + "chrY | \n", + "2787515 | \n", + "C | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "492908 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 22252 | \n", + "chrY | \n", + "2787551 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9754 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
| 22253 | \n", + "chrY | \n", + "7063898 | \n", + "A | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "625467 | \n", + "no_assertion_criteria_provided | \n", + "NaN | \n", + "train | \n", + "1 | \n", + "
22254 rows × 12 columns
\n", + "| \n", + " | CHROM | \n", + "POS | \n", + "REF | \n", + "ALT | \n", + "LABEL | \n", + "SOURCE | \n", + "CONSEQUENCE | \n", + "ID | \n", + "REVIEW_STATUS | \n", + "GENE | \n", + "split | \n", + "INT_LABEL | \n", + "GENE_ID | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 1 | \n", + "chr1 | \n", + "1050449 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1284257 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 2 | \n", + "chr1 | \n", + "1050575 | \n", + "G | \n", + "C | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "18241 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 3 | \n", + "chr1 | \n", + "1213738 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "96692 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 4 | \n", + "chr1 | \n", + "1232279 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "initiatior_codon_variant,missense_variant | \n", + "60484 | \n", + "criteria_provided,_multiple_submitters,_no_con... | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 22249 | \n", + "chrY | \n", + "2787412 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9747 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 22250 | \n", + "chrY | \n", + "2787426 | \n", + "C | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9739 | \n", + "criteria_provided,_single_submitter | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 22251 | \n", + "chrY | \n", + "2787515 | \n", + "C | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "492908 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 22252 | \n", + "chrY | \n", + "2787551 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9754 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
| 22253 | \n", + "chrY | \n", + "7063898 | \n", + "A | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "625467 | \n", + "no_assertion_criteria_provided | \n", + "\n", + " | train | \n", + "1 | \n", + "\n", + " |
22254 rows × 13 columns
\n", + "| \n", + " | CHROM | \n", + "POS | \n", + "REF | \n", + "ALT | \n", + "LABEL | \n", + "SOURCE | \n", + "CONSEQUENCE | \n", + "ID | \n", + "REVIEW_STATUS | \n", + "GENE | \n", + "split | \n", + "INT_LABEL | \n", + "GENE_ID | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "PERM1 | \n", + "train | \n", + "1 | \n", + "84808 | \n", + "
| 1 | \n", + "chr1 | \n", + "1050449 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1284257 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "
| 2 | \n", + "chr1 | \n", + "1050575 | \n", + "G | \n", + "C | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "18241 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "
| 3 | \n", + "chr1 | \n", + "1213738 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "96692 | \n", + "no_assertion_criteria_provided | \n", + "TNFRSF4 | \n", + "train | \n", + "1 | \n", + "7293 | \n", + "
| 4 | \n", + "chr1 | \n", + "1232279 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "initiatior_codon_variant,missense_variant | \n", + "60484 | \n", + "criteria_provided,_multiple_submitters,_no_con... | \n", + "B3GALT6 | \n", + "train | \n", + "1 | \n", + "126792 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 22249 | \n", + "chrY | \n", + "2787412 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9747 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "
| 22250 | \n", + "chrY | \n", + "2787426 | \n", + "C | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9739 | \n", + "criteria_provided,_single_submitter | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "
| 22251 | \n", + "chrY | \n", + "2787515 | \n", + "C | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "492908 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "
| 22252 | \n", + "chrY | \n", + "2787551 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9754 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "
| 22253 | \n", + "chrY | \n", + "7063898 | \n", + "A | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "625467 | \n", + "no_assertion_criteria_provided | \n", + "LOC126057105, TBL1Y | \n", + "train | \n", + "1 | \n", + "126057105, 90665 | \n", + "
22150 rows × 13 columns
\n", + "| \n", + " | CHROM | \n", + "POS | \n", + "REF | \n", + "ALT | \n", + "LABEL | \n", + "SOURCE | \n", + "CONSEQUENCE | \n", + "ID | \n", + "REVIEW_STATUS | \n", + "GENE | \n", + "split | \n", + "INT_LABEL | \n", + "GENE_ID | \n", + "Disease | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "PERM1 | \n", + "train | \n", + "1 | \n", + "84808 | \n", + "Renal tubular epithelial cell apoptosis | \n", + "
| 1 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "PERM1 | \n", + "train | \n", + "1 | \n", + "84808 | \n", + "Neutrophil inclusion bodies | \n", + "
| 2 | \n", + "chr1 | \n", + "1050449 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1284257 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "Congenital myasthenic syndrome 8 | \n", + "
| 3 | \n", + "chr1 | \n", + "1050575 | \n", + "G | \n", + "C | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "18241 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "Congenital myasthenic syndrome 8 | \n", + "
| 4 | \n", + "chr1 | \n", + "1213738 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "96692 | \n", + "no_assertion_criteria_provided | \n", + "TNFRSF4 | \n", + "train | \n", + "1 | \n", + "7293 | \n", + "Combined immunodeficiency due to OX40 deficiency | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 32680 | \n", + "chrY | \n", + "2787412 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9747 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "
| 32681 | \n", + "chrY | \n", + "2787426 | \n", + "C | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9739 | \n", + "criteria_provided,_single_submitter | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "not provided | \n", + "
| 32682 | \n", + "chrY | \n", + "2787515 | \n", + "C | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "492908 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "
| 32683 | \n", + "chrY | \n", + "2787551 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9754 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "
| 32684 | \n", + "chrY | \n", + "7063898 | \n", + "A | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "625467 | \n", + "no_assertion_criteria_provided | \n", + "LOC126057105, TBL1Y | \n", + "train | \n", + "1 | \n", + "126057105, 90665 | \n", + "Deafness, Y-linked 2 | \n", + "
32685 rows × 14 columns
\n", + "| \n", + " | CHROM | \n", + "POS | \n", + "REF | \n", + "ALT | \n", + "LABEL | \n", + "SOURCE | \n", + "CONSEQUENCE | \n", + "ID | \n", + "REVIEW_STATUS | \n", + "GENE | \n", + "split | \n", + "INT_LABEL | \n", + "GENE_ID | \n", + "Disease | \n", + "GENE_Name | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "PERM1 | \n", + "train | \n", + "1 | \n", + "84808 | \n", + "Renal tubular epithelial cell apoptosis | \n", + "PPARGC1 and ESRR induced regulator, muscle 1 | \n", + "
| 1 | \n", + "chr1 | \n", + "976215 | \n", + "A | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1320032 | \n", + "no_assertion_criteria_provided | \n", + "PERM1 | \n", + "train | \n", + "1 | \n", + "84808 | \n", + "Neutrophil inclusion bodies | \n", + "PPARGC1 and ESRR induced regulator, muscle 1 | \n", + "
| 2 | \n", + "chr1 | \n", + "1050449 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "1284257 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "Congenital myasthenic syndrome 8 | \n", + "agrin | \n", + "
| 3 | \n", + "chr1 | \n", + "1050575 | \n", + "G | \n", + "C | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "18241 | \n", + "no_assertion_criteria_provided | \n", + "AGRN | \n", + "train | \n", + "1 | \n", + "375790 | \n", + "Congenital myasthenic syndrome 8 | \n", + "agrin | \n", + "
| 4 | \n", + "chr1 | \n", + "1213738 | \n", + "G | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "96692 | \n", + "no_assertion_criteria_provided | \n", + "TNFRSF4 | \n", + "train | \n", + "1 | \n", + "7293 | \n", + "Combined immunodeficiency due to OX40 deficiency | \n", + "TNF receptor superfamily member 4 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 32680 | \n", + "chrY | \n", + "2787412 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9747 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "sex determining region Y | \n", + "
| 32681 | \n", + "chrY | \n", + "2787426 | \n", + "C | \n", + "G | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9739 | \n", + "criteria_provided,_single_submitter | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "not provided | \n", + "sex determining region Y | \n", + "
| 32682 | \n", + "chrY | \n", + "2787515 | \n", + "C | \n", + "A | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "492908 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "sex determining region Y | \n", + "
| 32683 | \n", + "chrY | \n", + "2787551 | \n", + "C | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "9754 | \n", + "no_assertion_criteria_provided | \n", + "SRY | \n", + "train | \n", + "1 | \n", + "6736 | \n", + "46,XY sex reversal 1 | \n", + "sex determining region Y | \n", + "
| 32684 | \n", + "chrY | \n", + "7063898 | \n", + "A | \n", + "T | \n", + "Pathogenic | \n", + "ClinVar | \n", + "missense_variant | \n", + "625467 | \n", + "no_assertion_criteria_provided | \n", + "LOC126057105, TBL1Y | \n", + "train | \n", + "1 | \n", + "126057105, 90665 | \n", + "Deafness, Y-linked 2 | \n", + "P300/CBP strongly-dependent group 1 enhancer G... | \n", + "
32685 rows × 15 columns
\n", + "