diff --git a/BioReason-0813/=1.18.1 b/BioReason-0813/=1.18.1 new file mode 100644 index 0000000000000000000000000000000000000000..11dc674dbfbb3db168638eb651c43d0755aea8d8 --- /dev/null +++ b/BioReason-0813/=1.18.1 @@ -0,0 +1,10 @@ +Looking in indexes: https://mirrors.aliyun.com/pypi/simple/ +Requirement already satisfied: modelscope in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (1.29.1) +Requirement already satisfied: filelock in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from modelscope) (3.18.0) +Requirement already satisfied: requests>=2.25 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from modelscope) (2.32.4) +Requirement already satisfied: setuptools in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from modelscope) (78.1.1) +Requirement already satisfied: tqdm>=4.64.0 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from modelscope) (4.67.1) +Requirement already satisfied: urllib3>=1.26 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from modelscope) (2.5.0) +Requirement already satisfied: charset_normalizer<4,>=2 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from requests>=2.25->modelscope) (3.4.3) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from requests>=2.25->modelscope) (3.10) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/bioreason/lib/python3.11/site-packages (from requests>=2.25->modelscope) (2025.8.3) diff --git a/BioReason-0813/__pycache__/blip2_dna_module.cpython-310.pyc b/BioReason-0813/__pycache__/blip2_dna_module.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5ddd5d280c3c2e5715fd592728a4bcd4a2b1f0d Binary files /dev/null and b/BioReason-0813/__pycache__/blip2_dna_module.cpython-310.pyc differ diff --git a/BioReason-0813/__pycache__/blip2_grpo_trainer.cpython-310.pyc 
class DNABaseModule(ABC):
    """Abstract interface implemented by every DNA-LLM integration.

    Nine hooks are abstract and must be overridden before a subclass can be
    instantiated. Two hooks ship with defaults: ``post_model_init`` does
    nothing and ``is_embeds_input`` returns ``False``.
    """

    def __init__(self):
        super().__init__()

    # ------------------------------------------------------------------
    # Hooks with a default implementation
    # ------------------------------------------------------------------
    def post_model_init(self, model, processing_class):
        """Optional hook invoked with the model and its processor; no-op by default."""

    def is_embeds_input(self):
        """Report whether the model is fed embeddings; the base answer is ``False``."""
        return False

    # ------------------------------------------------------------------
    # Hooks every concrete module has to provide
    # ------------------------------------------------------------------
    @abstractmethod
    def get_dnallm_key(self):
        """Return the identifier of this DNA-LLM implementation."""

    @abstractmethod
    def get_model_class(self, model_id: str, model_init_kwargs: dict):
        """Return the model class to use for ``model_id``."""

    @abstractmethod
    def get_processing_class(self):
        """Return the processing class paired with the model."""

    @abstractmethod
    def get_dnallm_modules_keywords(self):
        """Return the keywords that identify DNA-specific sub-modules."""

    @abstractmethod
    def get_custom_multimodal_keywords(self):
        """Return the names of the extra multimodal model inputs."""

    @abstractmethod
    def get_non_generate_params(self):
        """Return the parameter names that must not reach generation calls."""

    @abstractmethod
    def get_custom_processing_keywords(self):
        """Return the custom keywords understood by the processor."""

    @abstractmethod
    def prepare_prompt(self, processing_class, inputs: dict[str, Union[torch.Tensor, Any]]):
        """Turn raw ``inputs`` into prompt text for ``processing_class``."""

    @abstractmethod
    def prepare_model_inputs(
        self,
        processing_class,
        prompts_text,
        images,
        return_tensors,
        padding,
        padding_side,
        add_special_tokens,
    ):
        """Build the model inputs for the given prompts.

        NOTE(review): the parameter is still called ``images`` here, while the
        BLIP2 implementation in this change takes ``model`` and
        ``batch_dna_sequences`` instead -- confirm which signature callers use.
        """
+ + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `8`): + Number of generations per prompt to sample. The global batch size (num_processes * per_device_batch_size) + must be divisible by this value. + max_completion_length (`int` or `None`, *optional*, defaults to `800`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + + > Parameters that control generation + + temperature (`float`, defaults to `0.6`): + Temperature for sampling. The higher the temperature, the more random the completions. + top_p (`float`, *optional*, defaults to `0.95`): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to + `1.0` to consider all tokens. + top_k (`int` or `None`, *optional*, defaults to `20`): + Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is + disabled. + min_p (`float` or `None`, *optional*, defaults to `None`): + Minimum token probability, which will be scaled by the probability of the most likely token. It must be a + value between `0.0` and `1.0`. 
Typical values are in the `0.01-0.2` range. + repetition_penalty (`float`, *optional*, defaults to `1.0`): + Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. + Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat + tokens. + cache_implementation (`str` or `None`, *optional*, defaults to `None`): + Implementation of the cache method for faster generation when use_vllm is set to False. + + > Parameters that control generation acceleration powered by vLLM + + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept unused for + training, as vLLM will require one for generation. vLLM must be installed (`pip install vllm`). + vllm_device (`str`, *optional*, defaults to `"auto"`): + Device where vLLM generation will run, e.g. `"cuda:1"`. If set to `"auto"` (default), the system will + automatically select the next available GPU after the last one used for training. This assumes that + training has not already occupied all available GPUs. If only one device is available, the device will be + shared between both training and vLLM. + vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`): + Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the + device dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus + improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors + during initialization. + vllm_dtype (`str`, *optional*, defaults to `"auto"`): + Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined + based on the model configuration. Find the supported values in the vLLM documentation. 
+ vllm_max_model_len (`int` or `None`, *optional*, defaults to `None`): + If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced + `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model + context size, which might be much larger than the KV cache, leading to inefficiencies. + vllm_enable_prefix_caching (`bool`, *optional*, defaults to `True`): + Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and the hardware + support this feature. + vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`): + Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled. + + > Parameters that control the training + + learning_rate (`float`, *optional*, defaults to `1e-6`): + Initial learning rate for [`AdamW`] optimizer. The default value replaces that of + [`~transformers.TrainingArguments`]. + beta (`float`, *optional*, defaults to `0.04`): + KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training + speed, but may be numerically unstable for long training runs. + num_iterations (`int`, *optional*, defaults to `1`): + Number of iterations per batch (denoted as μ in the algorithm). + epsilon (`float`, *optional*, defaults to `0.2`): + Epsilon value for clipping. + epsilon_high (`float` or `None`, *optional*, defaults to `None`): + Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound + specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. + reward_weights (`list[float]` or `None`, *optional*, defaults to `None`): + Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are + weighted equally with weight `1.0`. 
+ sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + + > Parameters that control the logging + + log_completions (`bool`, *optional*, defaults to `True`): + Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is + installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. + """ + + # Parameters that control the model and reference model + model_init_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` " + "argument of the `GRPOTrainer` is provided as a string." + }, + ) + + # Parameters that control the data preprocessing + # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on + # additional columns to compute the reward + remove_unused_columns: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to only keep the column 'prompt' in the dataset. 
If you use a custom reward function " + "that requires any column other than 'prompts' and 'completions', you should keep this to `False`." + }, + ) + max_prompt_length: Optional[int] = field( + default=512, + metadata={ + "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left." + }, + ) + num_generations: Optional[int] = field( + default=8, + metadata={ + "help": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) " + "must be divisible by this value." + }, + ) + max_completion_length: Optional[int] = field( + default=800, + metadata={"help": "Maximum length of the generated completion."}, + ) + ds3_gather_for_generation: bool = field( + default=True, + metadata={ + "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for " + "generation, improving generation speed. However, disabling this option allows training models that " + "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option " + "is not compatible with vLLM generation." + }, + ) + + # Parameters that control generation + temperature: float = field( + default=0.6, + metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."}, + ) + top_p: float = field( + default=0.95, + metadata={ + "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. " + "Set to 1.0 to consider all tokens." + }, + ) + top_k: Optional[int] = field( + default=20, + metadata={ + "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, " + "top-k-filtering is disabled." + }, + ) + min_p: Optional[float] = field( + default=None, + metadata={ + "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It " + "must be a value between 0.0 and 1.0. 
Typical values are in the 0.01-0.2 range." + }, + ) + repetition_penalty: float = field( + default=1.0, + metadata={ + "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated " + "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model " + "to repeat tokens." + }, + ) + cache_implementation: Optional[str] = field( + default=None, + metadata={"help": "Implementation of the cache method for faster generation when use_vllm is set to False."}, + ) + + # Parameters that control generation acceleration powered by vLLM + use_vllm: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept " + "unused for training, as vLLM will require one for generation. vLLM must be installed " + "(`pip install vllm`)." + }, + ) + vllm_device: Optional[str] = field( + default="auto", + metadata={ + "help": "Device where vLLM generation will run, e.g. 'cuda:1'. If set to 'auto' (default), the system " + "will automatically select the next available GPU after the last one used for training. This assumes " + "that training has not already occupied all available GPUs." + }, + ) + vllm_gpu_memory_utilization: float = field( + default=0.9, + metadata={ + "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV " + "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache " + "size and thus improve the model's throughput. However, if the value is too high, it may cause " + "out-of-memory (OOM) errors during initialization." + }, + ) + vllm_dtype: Optional[str] = field( + default="auto", + metadata={ + "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically " + "determined based on the model configuration. Find the supported values in the vLLM documentation." 
+ }, + ) + vllm_max_model_len: Optional[int] = field( + default=None, + metadata={ + "help": "If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced " + "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model " + "context size, which might be much larger than the KV cache, leading to inefficiencies." + }, + ) + vllm_enable_prefix_caching: Optional[bool] = field( + default=True, + metadata={ + "help": "Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and " + "the hardware support this feature." + }, + ) + vllm_guided_decoding_regex: Optional[str] = field( + default=None, + metadata={"help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."}, + ) + + # Parameters that control the training + learning_rate: float = field( + default=1e-6, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`transformers.TrainingArguments`." + }, + ) + beta: float = field( + default=0.04, + metadata={ + "help": "KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving " + "training speed, but may be numerically unstable for long training runs." + }, + ) + num_iterations: int = field( + default=1, + metadata={"help": "Number of iterations per batch (denoted as μ in the algorithm)."}, + ) + epsilon: float = field( + default=0.2, + metadata={"help": "Epsilon value for clipping."}, + ) + epsilon_high: Optional[float] = field( + default=None, + metadata={ + "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the " + "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`." + }, + ) + reward_weights: Optional[list[float]] = field( + default=None, + metadata={ + "help": "Weights for each reward function. Must match the number of reward functions. 
If `None`, all " + "rewards are weighted equally with weight `1.0`." + }, + ) + sync_ref_model: bool = field( + default=False, + metadata={ + "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` " + "steps, using the `ref_model_mixup_alpha` parameter." + }, + ) + ref_model_mixup_alpha: float = field( + default=0.6, + metadata={ + "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the " + "previous reference policy during updates. The reference policy is updated according to the equation: " + "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`." + }, + ) + ref_model_sync_steps: int = field( + default=512, + metadata={ + "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is " + "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`." + }, + ) + + # Parameters that control the logging + log_completions: bool = field( + default=True, + metadata={ + "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is " + "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`." + }, + ) + + report_to: Union[None, str, list[str]] = field( + default="wandb", metadata={"help": "The list of integrations to report the results and logs to."} + ) + + logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) + logging_steps: float = field( + default=2, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`. " + "If smaller than 1, will be interpreted as ratio of total training steps." 
class Blip2DNAModule(DNABaseModule):
    """
    DNA module implementation for BLIP2-based models.

    This module provides the interface between BLIP2 models and the GRPO training
    infrastructure, handling model loading, processing setup, and reward functions.
    """

    def __init__(self):
        """Initialize the Blip2DNAModule."""
        super().__init__()

    def get_dnallm_key(self) -> str:
        """
        Get the key identifier for this DNA-LLM implementation.

        Returns:
            String identifier for this module type
        """
        return "blip2"

    def get_model_class(self, model_id: str, model_init_kwargs: Dict[str, Any]) -> Type:
        """
        Return the appropriate model class based on model ID.

        Args:
            model_id: Identifier for the model; matched case-insensitively
                against the substrings "blip2" and "stage2"
            model_init_kwargs: Initialization arguments for the model (not
                inspected here)

        Returns:
            The model class to instantiate

        Raises:
            ValueError: If the model is not supported
        """
        lowered = model_id.lower()
        if "blip2" in lowered or "stage2" in lowered:
            return Blip2Stage2
        raise ValueError(f"Unsupported model: {model_id}")

    def post_model_init(self, model: Any, processing_class: Any) -> None:
        """
        Perform any post-initialization setup on the model.

        If the model exposes ``model.blip2.llm_tokenizer`` and that tokenizer
        has no pad token, the EOS token is reused as the pad token.

        Args:
            model: The initialized model
            processing_class: The processor for the model (unused here)
        """
        if not (hasattr(model, "blip2") and hasattr(model.blip2, "llm_tokenizer")):
            return
        tokenizer = model.blip2.llm_tokenizer
        if tokenizer is None:
            # Nothing to configure; avoids dereferencing a missing tokenizer.
            return
        if getattr(tokenizer, "pad_token", None) is None:
            tokenizer.pad_token = tokenizer.eos_token

    def get_processing_class(self) -> Type:
        """
        Get the processing class to use with this BLIP2 model.

        Returns:
            The processing class
        """
        return Blip2Processor

    def get_dnallm_modules_keywords(self) -> List[str]:
        """
        Get keywords to identify DNA-specific modules in the model.

        Used to exclude DNA modules from LoRA adaptation during training.

        Returns:
            List of keywords that identify DNA modules
        """
        return ["plm", "qformer", "opt_proj"]

    def get_custom_multimodal_keywords(self) -> List[str]:
        """
        Get keywords for multimodal inputs that should be passed to the model.

        Returns:
            List of input keywords for multimodal processing
        """
        return ["prot_batch", "prompt_batch"]

    def get_non_generate_params(self) -> List[str]:
        """
        Get parameter names that should be excluded from generation.

        Returns:
            List of parameter names to exclude from generation calls
        """
        return ["prot_batch"]

    def get_custom_processing_keywords(self) -> List[tuple]:
        """
        Get custom processing keywords for the processor.

        Returns:
            List of (component, parameter) tuples for custom processing
        """
        return [("plm_tokenizer", "max_length"), ("llm_tokenizer", "max_length")]

    def prepare_prompt(
        self, processing_class: Any, inputs: List[Dict[str, Union[torch.Tensor, Any]]]
    ) -> List[str]:
        """
        Prepare prompts from input examples.

        For a conversational prompt (a non-empty list) only the FIRST message
        is used: its ``content`` is either stringified, or, when it is a list
        of parts, the ``text`` of every part with ``type == "text"`` is joined
        with single spaces. Any other prompt value is stringified, and an
        example without a ``"prompt"`` key yields an empty string.

        Args:
            processing_class: The processor to use (unused here)
            inputs: List of input examples

        Returns:
            List of prepared prompts, one per example
        """
        prompts_text = []
        for example in inputs:
            if "prompt" not in example:
                prompts_text.append("")
                continue

            prompt = example["prompt"]
            if not (isinstance(prompt, list) and len(prompt) > 0):
                prompts_text.append(str(prompt))
                continue

            # Extract text content from conversational format
            user_content = prompt[0].get("content", "")
            if isinstance(user_content, list):
                # Extract text from multimodal content
                text_parts = [item.get("text", "") for item in user_content if item.get("type") == "text"]
                prompts_text.append(" ".join(text_parts))
            else:
                prompts_text.append(str(user_content))
        return prompts_text

    def prepare_model_inputs(
        self,
        processing_class: Any,
        model: Any,
        prompts_text: List[str],
        batch_dna_sequences: List[List[str]],
        return_tensors: str = "pt",
        padding: bool = True,
        padding_side: str = "left",
        add_special_tokens: bool = False,
    ) -> Dict[str, Any]:
        """
        Prepare inputs for the BLIP2 model.

        Args:
            processing_class: The processor to use (unused here; the
                tokenizers are taken from the model)
            model: The model to prepare inputs for; ``model.blip2`` is used
                when present, otherwise ``model`` itself
            prompts_text: List of text prompts
            batch_dna_sequences: List of lists of DNA sequences (treated as protein sequences)
            return_tensors: Return format for tensors
            padding: Whether to pad inputs
            padding_side: Accepted for interface compatibility; not forwarded
                to either tokenizer
            add_special_tokens: Accepted for interface compatibility; not
                forwarded to either tokenizer

        Returns:
            Dict with ``prot_batch``, ``prompt_batch`` and, for compatibility,
            the prompt batch's ``input_ids`` and ``attention_mask``
        """
        # Get the BLIP2 model from the wrapper
        blip2_model = model.blip2 if hasattr(model, 'blip2') else model

        # Flatten all DNA sequences to treat them as individual protein sequences
        all_sequences = [sequence for sequences in batch_dna_sequences for sequence in sequences]

        if all_sequences:
            prot_batch = blip2_model.plm_tokenizer(
                all_sequences,
                padding=padding,
                truncation=True,
                max_length=512,  # Default protein sequence length
                return_tensors=return_tensors,
            )
        else:
            # Empty batch handling: zero rows so downstream code sees tensors
            prot_batch = {
                'input_ids': torch.empty(0, 1, dtype=torch.long),
                'attention_mask': torch.empty(0, 1, dtype=torch.long)
            }

        # Prepare prompt batch
        prompt_batch = blip2_model.llm_tokenizer(
            prompts_text,
            padding=padding,
            truncation=True,
            max_length=256,  # Default prompt length
            return_tensors=return_tensors,
        )

        return {
            "prot_batch": prot_batch,
            "prompt_batch": prompt_batch,
            "input_ids": prompt_batch["input_ids"],  # For compatibility
            "attention_mask": prompt_batch["attention_mask"],  # For compatibility
        }

    def is_embeds_input(self) -> bool:
        """
        Whether the model uses embeddings as input (instead of token IDs).

        Returns:
            Boolean indicating if the model takes embedding inputs
        """
        return True  # BLIP2 uses embeddings internally

    @staticmethod
    def get_question_template() -> str:
        """
        Get the template for formatting questions.

        Returns:
            String template for questions
        """
        return "{Question}"

    @staticmethod
    def format_reward_rec(completions: List[Dict[str, Any]], **kwargs) -> List[float]:
        """
        Check if the BLIP2 model output matches a specific format.

        A completion matches when its first message's content contains a
        ``{``, later four comma-separated integers in square brackets, and
        later a ``}`` (searched with ``re.DOTALL``).

        When the environment variable ``DEBUG_MODE`` equals ``"true"`` and
        ``LOG_PATH`` is set, every content/match pair is appended to the
        ``LOG_PATH`` file with ``.txt`` replaced by ``_format.txt``. If
        ``LOG_PATH`` is unset the logging step is skipped instead of crashing.

        Args:
            completions: List of model completions
            **kwargs: Additional arguments (ignored)

        Returns:
            List of reward scores (1.0 for match, 0.0 for no match)
        """
        import re
        import os
        from datetime import datetime

        # Pattern to match the expected output format
        pattern = r".*?\s*.*?\{.*\[\d+,\s*\d+,\s*\d+,\s*\d+\].*\}.*?"
        completion_contents = [completion[0]["content"] for completion in completions]
        matches = [
            re.search(pattern, content, re.DOTALL) is not None
            for content in completion_contents
        ]

        # Log format results if in debug mode. os.getenv returns None for an
        # unset variable, so guard before calling .replace on the path.
        log_path = os.getenv("LOG_PATH")
        if os.getenv("DEBUG_MODE") == "true" and log_path:
            current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
            with open(
                log_path.replace(".txt", "_format.txt"), "a", encoding="utf-8"
            ) as f:
                f.write(f"------------- {current_time} Format reward -------------\n")
                for content, match in zip(completion_contents, matches):
                    f.write(f"Content: {content}\n")
                    f.write(f"Has format: {bool(match)}\n")

        return [1.0 if match else 0.0 for match in matches]

    @staticmethod
    def select_reward_func(func: str, task_type: str) -> Callable:
        """
        Select the appropriate reward function based on function name and task type.

        Args:
            func: The type of reward function ('accuracy' or 'format')
            task_type: The type of task (only 'rec' is supported)

        Returns:
            The reward function to use

        Raises:
            ValueError: If the function or task type is not supported
        """
        # The reward name is validated first, then the task type, so the
        # error message names the argument that is actually at fault.
        if func == "accuracy":
            reward = Blip2DNAModule.iou_reward
        elif func == "format":
            reward = Blip2DNAModule.format_reward_rec
        else:
            raise ValueError(f"Unsupported reward function: {func}")

        if task_type != "rec":
            raise ValueError(
                f"Unsupported task type for reward function '{func}': {task_type}"
            )
        return reward

    @staticmethod
    def iou_reward(completions: List[Dict[str, Any]], **kwargs) -> List[float]:
        """
        Placeholder IoU reward function.

        Args:
            completions: List of model completions
            **kwargs: Additional arguments (ignored)

        Returns:
            List of reward scores; currently a constant 1.0 per completion
        """
        # Placeholder implementation: every completion gets full reward, so
        # this carries no training signal until a real IoU is implemented.
        return [1.0] * len(completions)
class Blip2Processor:
    """
    Simple processor wrapper for BLIP2 models to maintain compatibility
    with the GRPO trainer interface.

    Attributes:
        plm_tokenizer: Tokenizer passed in as ``plm_tokenizer`` (may be ``None``).
        llm_tokenizer: Tokenizer passed in as ``llm_tokenizer`` (may be ``None``).
        eos_token_id: Copied from ``llm_tokenizer``; ``None`` without one.
        pad_token_id: Copied from ``llm_tokenizer``; ``None`` without one.
    """

    def __init__(self, plm_tokenizer=None, llm_tokenizer=None):
        """Store both tokenizers and mirror the LLM tokenizer's special ids."""
        self.plm_tokenizer = plm_tokenizer
        self.llm_tokenizer = llm_tokenizer

        # Always define the compatibility attributes so that reading them on
        # a tokenizer-less processor yields None rather than AttributeError.
        self.eos_token_id = None
        self.pad_token_id = None
        # Compare against None explicitly: a tokenizer object may define
        # __len__, and truthiness must not decide whether it is "present".
        if llm_tokenizer is not None:
            self.eos_token_id = llm_tokenizer.eos_token_id
            self.pad_token_id = llm_tokenizer.pad_token_id

    def __call__(self, *args, **kwargs):
        """
        Process inputs for BLIP2 model.

        Delegates to the LLM tokenizer with the arguments unchanged. Without
        an LLM tokenizer a fixed single-token dummy batch is returned.
        """
        if self.llm_tokenizer is not None:
            return self.llm_tokenizer(*args, **kwargs)
        # Fallback behavior
        return {"input_ids": torch.tensor([[1]]), "attention_mask": torch.tensor([[1]])}

    def batch_decode(self, *args, **kwargs):
        """Decode token sequences via the LLM tokenizer; ``[""]`` without one."""
        if self.llm_tokenizer is not None:
            return self.llm_tokenizer.batch_decode(*args, **kwargs)
        return [""]
+ +import os +import time +import textwrap +import pandas as pd +from collections import defaultdict +from typing import Any, Callable, Optional, Union, Sized + +import torch +import torch.utils.data +import transformers +from datasets import Dataset, IterableDataset +from packaging import version +from transformers import ( + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoProcessor, + AutoTokenizer, + GenerationConfig, + PreTrainedModel, + PreTrainedTokenizerBase, + Trainer, + TrainerCallback, + is_wandb_available, +) +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.utils import is_peft_available + +from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template +from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation +from trl.trainer.grpo_config import GRPOConfig +from trl.trainer.utils import generate_model_card, get_comet_experiment_url + +from accelerate.utils import is_peft_model, set_seed, gather_object +import PIL.Image + +import copy +from torch.utils.data import Sampler +import warnings + +if is_peft_available(): + from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training + +if is_wandb_available(): + import wandb + +from bioreason.dna_modules.dna_module import DNABaseModule +from bioreason.trainer import DNALLMGRPOConfig + +# Import the RepeatRandomSampler from the original trainer +from bioreason.trainer.grpo_trainer import RepeatRandomSampler + +# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of +# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model. +RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]] + + +class Blip2GRPOTrainer(Trainer): + """ + Modified GRPO Trainer for BLIP2 models. 
def __init__(
    self,
    model: Union[str, PreTrainedModel],
    reward_funcs: Union[RewardFunc, list[RewardFunc]],
    args: DNALLMGRPOConfig = None,
    dna_module: DNABaseModule = None,
    train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
    eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
    processing_class: Optional[PreTrainedTokenizerBase] = None,
    reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
    callbacks: Optional[list[TrainerCallback]] = None,
    optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
    peft_config: Optional["PeftConfig"] = None,
    freeze_dna_modules: Optional[bool] = False,
    attn_implementation: str = "flash_attention_2",
    torch_dtype: str = "bfloat16",
    **kwargs,
):
    """
    Configure the model, optional LoRA adapters, reference model, processor,
    reward functions and GRPO hyper-parameters, then initialise ``Trainer``.

    Args:
        model: An already-instantiated model. String model IDs are rejected by an
            assert. Sub-components are looked up under ``model.blip2``
            (``llm_model``, ``plm``, ``Qformer``, ``plm_tokenizer``, ``llm_tokenizer``)
            when that attribute exists.
        reward_funcs: One reward function or a list of them. String entries are
            loaded with ``AutoModelForSequenceClassification.from_pretrained``.
        args: Training configuration. When ``None`` a ``GRPOConfig`` named
            ``"<model_name>-GRPO"`` is created.
        dna_module: Supplies the module keywords, the processing class and the
            ``post_model_init`` hook.
        train_dataset: Forwarded to ``Trainer.__init__``.
        eval_dataset: Forwarded to ``Trainer.__init__``.
        processing_class: Processor to use; built from ``dna_module`` when ``None``.
        reward_processing_classes: Tokenizer(s) paired with model-based reward functions.
        callbacks: Forwarded to ``Trainer.__init__``.
        optimizers: Forwarded to ``Trainer.__init__``.
        peft_config: When given, LoRA is applied to ``model.blip2.llm_model``.
        freeze_dna_modules: When true, ``plm`` and ``Qformer`` parameters are frozen.
        attn_implementation: Written into ``model_init_kwargs``.
        torch_dtype: Default dtype name written into ``model_init_kwargs`` when unset.
        **kwargs: Accepted but not read anywhere in this method.

    Raises:
        ValueError: If ``torch_dtype`` has an unsupported type, or if
            ``num_generations`` does not evenly divide the global train batch size.
    """
    # Args
    if args is None:
        model_name = model if isinstance(model, str) else "blip2-model"
        args = GRPOConfig(f"{model_name}-GRPO")

    self.dna_module = dna_module

    # Models
    # NOTE(review): when args.model_init_kwargs is a dict, the writes below mutate
    # that same dict object on `args`.
    model_init_kwargs = args.model_init_kwargs or {}
    model_init_kwargs["attn_implementation"] = attn_implementation
    if model_init_kwargs.get("torch_dtype") is None:
        model_init_kwargs["torch_dtype"] = torch_dtype

    assert not isinstance(model, str), "model must NOT be a string in the current implementation"

    # NOTE(review): the converted dtype is kept only in the local `torch_dtype`; it is
    # not written back to model_init_kwargs and is not read again in this method.
    torch_dtype = model_init_kwargs.get("torch_dtype")
    if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None:
        pass  # torch_dtype is already a torch.dtype or "auto" or None
    elif isinstance(torch_dtype, str):  # it's a str, but not "auto"
        torch_dtype = getattr(torch, torch_dtype)
    else:
        raise ValueError(
            "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
            f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
        )

    # Disable caching if gradient checkpointing is enabled (not supported)
    if hasattr(model, 'blip2') and hasattr(model.blip2, 'llm_model'):
        model.blip2.llm_model.config.use_cache = (
            False if args.gradient_checkpointing else model.blip2.llm_model.config.use_cache
        )

    # LoRA setup for BLIP2
    # NOTE(review): dna_module defaults to None but is dereferenced unconditionally here.
    self.dna_modules_keywords = self.dna_module.get_dnallm_modules_keywords()
    if peft_config is not None:
        print("Applying LoRA...")
        def find_all_linear_names(model, multimodal_keywords):
            # Collect names of torch.nn.Linear modules inside the LLM, skipping any
            # module whose name contains one of the multimodal keywords.
            cls = torch.nn.Linear
            lora_module_names = set()

            # Focus on the LLM part of BLIP2
            if hasattr(model, 'blip2') and hasattr(model.blip2, 'llm_model'):
                llm_model = model.blip2.llm_model
                for name, module in llm_model.named_modules():
                    # Skip DNA/multimodal modules
                    if any(mm_keyword in name for mm_keyword in multimodal_keywords):
                        continue
                    if isinstance(module, cls):
                        lora_module_names.add(name)

            # Remove embedding layers
            for m in list(lora_module_names):
                if "embed_tokens" in m or "embedding" in m:
                    lora_module_names.remove(m)

            return list(lora_module_names)

        # The caller-supplied peft_config is modified in place.
        target_modules = find_all_linear_names(model, self.dna_modules_keywords)
        peft_config.target_modules = target_modules

        # Apply LoRA to the LLM part
        if hasattr(model, 'blip2') and hasattr(model.blip2, 'llm_model'):
            model.blip2.llm_model = prepare_model_for_kbit_training(model.blip2.llm_model)
            model.blip2.llm_model = get_peft_model(model.blip2.llm_model, peft_config)

    # Freeze DNA/protein modules if requested
    if freeze_dna_modules:
        print("Freezing protein/DNA modules...")
        if hasattr(model, 'blip2'):
            # Freeze protein language model
            if hasattr(model.blip2, 'plm'):
                for p in model.blip2.plm.parameters():
                    p.requires_grad = False

            # Freeze the Q-former as well (same flag; there is no separate switch)
            if hasattr(model.blip2, 'Qformer'):
                for p in model.blip2.Qformer.parameters():
                    p.requires_grad = False

    # Count trainable parameters
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    total_params = sum(p.numel() for p in trainable_params)
    print(f"Total trainable parameters: {total_params}")

    # Enable gradient checkpointing if requested
    if args.gradient_checkpointing:
        model = self._enable_gradient_checkpointing(model, args)

    # Reference model
    self.beta = args.beta
    if self.beta == 0.0:
        self.ref_model = None
    elif is_deepspeed_zero3_enabled():
        # Create reference model for DeepSpeed
        # NOTE(review): assumes the model class can be rebuilt from `model.args` - confirm.
        self.ref_model = type(model)(model.args)  # Create same type of model
    elif is_peft_model(model.blip2.llm_model if hasattr(model, 'blip2') else model):
        self.ref_model = None
    else:
        self.ref_model = create_reference_model(model)

    # Processing class setup
    if processing_class is None:
        processing_cls = self.dna_module.get_processing_class()

        # Get tokenizers from BLIP2 model
        if hasattr(model, 'blip2'):
            plm_tokenizer = getattr(model.blip2, 'plm_tokenizer', None)
            llm_tokenizer = getattr(model.blip2, 'llm_tokenizer', None)
            processing_class = processing_cls(plm_tokenizer=plm_tokenizer, llm_tokenizer=llm_tokenizer)
        else:
            processing_class = processing_cls()

        # Set up tokenizer attributes
        if hasattr(processing_class, 'llm_tokenizer') and processing_class.llm_tokenizer:
            processing_class.pad_token_id = processing_class.llm_tokenizer.pad_token_id
            processing_class.eos_token_id = processing_class.llm_tokenizer.eos_token_id
        else:
            # Fallback: hard-coded ids used when no LLM tokenizer is attached
            processing_class.pad_token_id = 0
            processing_class.eos_token_id = 1

    self.dna_module.post_model_init(model, processing_class)
    # NOTE(review): self.ref_model can be None here (beta == 0 or PEFT model).
    self.dna_module.post_model_init(self.ref_model, processing_class)

    # Reward functions
    if not isinstance(reward_funcs, list):
        reward_funcs = [reward_funcs]
    for i, reward_func in enumerate(reward_funcs):
        if isinstance(reward_func, str):
            # A string is treated as a model ID and loaded as a single-logit classifier.
            reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
                reward_func, num_labels=1, **model_init_kwargs
            )
    self.reward_funcs = reward_funcs

    # Reward processing classes
    if reward_processing_classes is None:
        reward_processing_classes = [None] * len(reward_funcs)
    elif not isinstance(reward_processing_classes, list):
        reward_processing_classes = [reward_processing_classes]

    for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)):
        if isinstance(reward_func, PreTrainedModel):
            if reward_processing_class is None:
                reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
            if reward_processing_class.pad_token_id is None:
                reward_processing_class.pad_token = reward_processing_class.eos_token
            # Keep the reward model's padding id in sync with its tokenizer.
            reward_func.config.pad_token_id = reward_processing_class.pad_token_id
            reward_processing_classes[i] = reward_processing_class
    self.reward_processing_classes = reward_processing_classes

    # Data collator: identity, the batch is passed through as a list of examples
    def data_collator(features):
        return features

    # Training arguments
    # NOTE(review): the first assignment is immediately overwritten; the effective
    # value is always None and a warning is emitted when the config sets a length.
    self.max_prompt_length = args.max_prompt_length
    self.max_prompt_length = None
    if args.max_prompt_length is not None:
        warnings.warn("Setting max_prompt_length is currently not supported, it has been set to None")

    self.max_completion_length = args.max_completion_length
    self.num_generations = args.num_generations

    # Generation config for BLIP2
    self.generation_config = GenerationConfig(
        max_new_tokens=self.max_completion_length,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        pad_token_id=processing_class.pad_token_id,
        eos_token_id=processing_class.eos_token_id,
    )

    self.beta = args.beta  # same value as assigned in the reference-model section
    self.epsilon_low = args.epsilon
    self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon

    # Multi-step
    self.num_iterations = args.num_iterations
    self._step = 0
    # One buffered generation result per gradient-accumulation slot.
    self._buffered_inputs = [None] * args.gradient_accumulation_steps

    # Initialize metrics
    self._metrics = defaultdict(list)
    self.log_completions = args.log_completions

    super().__init__(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=processing_class,
        callbacks=callbacks,
        optimizers=optimizers,
    )

    # Validate batch sizes (self.accelerator is read only after Trainer.__init__ has run)
    num_processes = self.accelerator.num_processes
    global_batch_size = args.per_device_train_batch_size * num_processes
    possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
    if self.num_generations not in possible_values:
        raise ValueError(
            f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly "
            f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train "
            f"batch size, the valid values for the number of generations are: {possible_values}."
        )

    # Set unique seed per process
    set_seed(args.seed, device_specific=True)

    # Gradient accumulation settings
    self.model_accepts_loss_kwargs = False

    # Prepare reference model and reward functions
    if self.ref_model is not None:
        if is_deepspeed_zero3_enabled():
            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
        else:
            self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

    for i, reward_func in enumerate(self.reward_funcs):
        if isinstance(reward_func, PreTrainedModel):
            self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
needed + if hasattr(model.blip2, 'plm') and hasattr(model.blip2.plm, 'gradient_checkpointing_enable'): + model.blip2.plm.gradient_checkpointing_enable() + + return model + + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + self._signature_columns = ["prompt"] + + def _get_key_from_inputs(self, x, key): + ele = x.get(key, None) + assert ele is not None, f"The key {key} is not found in the input" + if isinstance(ele, list): + return [e for e in ele] + else: + return [ele] + + def _generate_and_score_completions(self, inputs: dict[str, Union[torch.Tensor, Any]], model) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device + prompts = [x["prompt"] for x in inputs] + prompts_text = self.dna_module.prepare_prompt(self.processing_class, inputs) + + # Handle DNA sequences (treat as protein sequences for BLIP2) + batch_dna_sequences = [] + print("_generate_and_score_completions (BLIP2 GRPO):") + for x in inputs: + if 'dna_sequences' in x: + dnas = self._get_key_from_inputs(x, "dna_sequences") + batch_dna_sequences.append(dnas) + else: + batch_dna_sequences.append([]) + + # Prepare model inputs for BLIP2 + prompt_inputs = self.dna_module.prepare_model_inputs( + self.processing_class, + model, + prompts_text, + batch_dna_sequences, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + ) + + prompt_inputs = super()._prepare_inputs(prompt_inputs) + + # Extract BLIP2-specific inputs + prot_batch = prompt_inputs.get("prot_batch") + prompt_batch = prompt_inputs.get("prompt_batch") + + # Generate completions using BLIP2 + start = time.time() + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model: + # Prepare samples for BLIP2 generation + samples = { + 'prot_batch': prot_batch, + 'prompt_batch': prompt_batch + } + + # Use BLIP2's generate method + if hasattr(unwrapped_model, 'blip2'): + completions_text = unwrapped_model.blip2.generate( + samples, + do_sample=True, 
+ temperature=0.6, + top_p=0.95, + num_beams=1, + max_length=self.max_completion_length, + min_length=1, + ) + else: + # Fallback if not BLIP2 structure + completions_text = ["Generated text"] * len(prompts_text) + + end = time.time() + print(f"Generation time: {end - start:.9f} seconds") + + # Convert completions to expected format + if is_conversational(inputs[0]): + completions = [[{"role": "assistant", "content": completion}] for completion in completions_text] + else: + completions = completions_text + + # Compute rewards + print("Reward calculation...") + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + for i, (reward_func, reward_processing_class) in enumerate( + zip(self.reward_funcs, self.reward_processing_classes) + ): + if isinstance(reward_func, PreTrainedModel): + if is_conversational(inputs[0]): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + reward_inputs = reward_processing_class( + texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = super()._prepare_inputs(reward_inputs) + with torch.inference_mode(): + rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] + else: + # Custom reward function + reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]} + for key in reward_kwargs: + for example in inputs: + reward_kwargs[key].extend([example[key]]) + output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs) + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # Gather rewards across processes + rewards_per_func = self.accelerator.gather(rewards_per_func) + rewards = rewards_per_func.sum(dim=1) + + # Compute grouped-wise rewards + mean_grouped_rewards = 
rewards.view(-1, self.num_generations).mean(dim=1) + std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1) + + # Normalize rewards to compute advantages + mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4) + + # Get local slice of advantages + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + advantages = advantages[process_slice] + + # Log metrics + print("Logging metrics...") + completion_length = len(completions_text[0].split()) if completions_text else 0 + self._metrics["completion_length"].append(completion_length) + + reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0) + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + reward_func_name = reward_func.config._name_or_path.split("/")[-1] + else: + reward_func_name = reward_func.__name__ + self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item()) + + self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item()) + self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item()) + + # Log completions if enabled + if ( + self.log_completions + and self.state.global_step % self.args.logging_steps == 0 + and "wandb" in self.args.report_to + ): + timestamp = time.time() + num_items = len(gather_object(prompts_text)) + + table = { + "step": [f"{self.state.global_step}_{timestamp}"] * num_items, + "prompt": gather_object(prompts_text), + "completion": gather_object(completions_text), + "reward": rewards.tolist(), + } + df = pd.DataFrame(table) + + if wandb.run is not None and self.accelerator.is_main_process: + 
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """
    Generate (or reuse) scored completions and return the model loss for them.

    On steps where ``global_step % num_iterations == 0`` fresh completions are
    generated and cached in ``self._buffered_inputs``; otherwise the cached entry
    for the current accumulation slot is reused. The model is then called on
    ``(prot_batch, prompt_batch, {"targets": completions_text})`` and the result
    is scaled by ``1 + mean(advantages)``.

    Args:
        model: The model being trained; ``model.blip2`` is called when present,
            otherwise ``model`` itself.
        inputs: Batch of raw examples as delivered by the data collator.
        return_outputs: Must be falsy; returning outputs is not supported.
        num_items_in_batch: Accepted for interface compatibility; not used here.

    Returns:
        The scaled loss value produced by the forward call.

    Raises:
        ValueError: If ``return_outputs`` is truthy.
    """
    if return_outputs:
        raise ValueError("The BLIP2 GRPO Trainer does not support returning outputs")

    print("compute_loss - index 1")
    # Regenerate only every `num_iterations` optimizer steps; in between, replay the
    # buffered generation that belongs to this gradient-accumulation slot.
    if self.state.global_step % self.num_iterations == 0:
        inputs = self._generate_and_score_completions(inputs, model)
        self._buffered_inputs[self._step % self.args.gradient_accumulation_steps] = inputs
    else:
        inputs = self._buffered_inputs[self._step % self.args.gradient_accumulation_steps]
    self._step += 1

    print("compute_loss - index 2")

    # For BLIP2, we need to compute loss differently
    # This is a simplified version - you may need to adapt based on your specific BLIP2 implementation

    # Extract the necessary components
    prot_batch = inputs.get("prot_batch")
    prompt_batch = inputs.get("prompt_batch")
    advantages = inputs.get("advantages")

    print("compute_loss - index 3")

    # Create a batch for BLIP2 forward pass
    # This assumes your BLIP2 model expects (prot_batch, prompt_batch, text_dict) format
    text_dict = {"targets": inputs.get("completions_text", [])}
    batch = (prot_batch, prompt_batch, text_dict)

    print("compute_loss - index 4")

    # Forward pass through BLIP2
    # NOTE(review): assumes the forward call returns a loss that supports scalar
    # multiplication - confirm against the BLIP2 implementation.
    if hasattr(model, 'blip2'):
        loss = model.blip2(batch)
    else:
        loss = model(batch)

    print("compute_loss - index 5")

    # For now, return the basic loss
    # You may want to incorporate the advantages into the loss calculation
    # based on your specific GRPO implementation needs

    if advantages is not None:
        # Apply advantages weighting (simplified)
        # NOTE(review): `.item()` turns the mean advantage into a plain Python float,
        # so the whole batch loss is rescaled by one constant; per-sample advantages
        # and the beta / epsilon settings stored on the trainer are not used here.
        advantage_weight = advantages.mean().item()
        loss = loss * (1.0 + advantage_weight)

    print("Computing final loss...")
    return loss
transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoProcessor, +) + +from datasets import load_dataset, DatasetDict + +from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training +from transformers import BitsAndBytesConfig + +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from pytorch_lightning.loggers import WandbLogger + +from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config + +# Import BLIP2 modules +from model.blip2_stage2 import Blip2Stage2 +from blip2_dna_module import Blip2DNAModule +from blip2_grpo_trainer import Blip2GRPOTrainer +from bioreason.trainer import DNALLMGRPOConfig + +# Custom TrainerCallback to override the saving mechanism +from transformers import TrainerCallback, TrainerState, TrainerControl +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + +from prompt_templates import prompt_templates + +class SaveWithPyTorchCallback(TrainerCallback): + """Custom callback to save models with PyTorch's native save mechanism instead of safetensors""" + def on_save(self, args, state, control, **kwargs): + # Get the checkpoint folder + checkpoint_folder = os.path.join( + args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" + ) + os.makedirs(checkpoint_folder, exist_ok=True) + + # Save with PyTorch instead of safetensors + checkpoint_path = os.path.join(checkpoint_folder, "pytorch_model.bin") + model = kwargs.get("model") + + # Get model unwrapped from accelerator etc. 
def extract_xml_answer(text: str) -> str:
    """
    Extract the final answer from a model response.

    Resolution order:
      1. the content of the first ``<answer>...</answer>`` pair;
      2. otherwise, everything after the last ``</think>`` closing tag;
      3. otherwise, the whole text.

    Args:
        text: Raw model response.

    Returns:
        The extracted answer with surrounding whitespace removed.
    """
    # The pattern must name the tags explicitly: a bare "(.*?)" matches the empty
    # string at position 0 and would make every extraction return "".
    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_match:
        return answer_match.group(1).strip()

    # No answer tag: fall back to the text that follows the last closing think tag.
    _, closing_tag, tail = text.rpartition("</think>")
    if closing_tag:
        return tail.strip()

    # Neither tag present: return the original text.
    return text.strip()
def get_kegg_questions() -> Dataset:
    """
    Load the ``wanglab/kegg`` dataset and reshape it for GRPO (kept as a fallback loader).

    Each record gains a chat-style ``prompt`` (two DNA placeholders followed by the
    question text), a ``dna_sequences`` list holding the reference and variant
    sequences, and the original ``answer``.

    Returns:
        The mapped dataset on success. If anything in loading or mapping raises,
        the error is printed and a dict with empty ``'train'`` and ``'val'``
        datasets is returned instead.
    """
    dna_slot_count = 2

    def _to_grpo_example(record):
        # One placeholder entry per DNA sequence, then the question itself.
        dna_slots = [{'type': 'dna', 'text': None} for _ in range(dna_slot_count)]
        question_part = {'type': 'text', 'text': record['question']}
        user_turn = {'role': 'user', 'content': dna_slots + [question_part]}
        return {
            'prompt': [user_turn],
            'dna_sequences': [record['reference_sequence'], record['variant_sequence']],
            'answer': record['answer'],
        }

    try:
        raw = load_dataset('wanglab/kegg', 'default')  # type: ignore
        return raw.map(_to_grpo_example)  # type: ignore
    except Exception as e:
        print(f"Failed to load KEGG dataset: {e}")
        # Return an empty dataset structure so callers still get train/val entries.
        from datasets import Dataset
        placeholder = Dataset.from_dict({
            'prompt': [],
            'dna_sequences': [],
            'answer': [],
        })
        return {'train': placeholder, 'val': placeholder}
+""" + + # 数据转换 + def process_example(row): + # 构建prompt + prompt_text = prompt_template.format( + aa_seq=row['aa_seq'], + name=row.get('name', ''), + location=row.get('location', ''), + unique_id=row.get('unique_id', ''), + ) + + return { + 'prompt': [ + { + 'role': 'user', + 'content': [ + {'type': 'protein', 'text': None}, # 蛋白质序列占位符 + {'type': 'text', 'text': prompt_text}, + ], + }, + ], + 'dna_sequences': [row['aa_seq']], # 使用aa_seq作为"dna_sequences" + 'answer': str(row['label']), # label作为答案 + 'metadata': { + 'name': row.get('name', ''), + 'location': row.get('location', ''), + 'unique_id': row.get('unique_id', ''), + 'pdb_hash': row.get('pdb_hash', ''), + } + } + + # 转换所有数据 + processed_data = [] + for _, row in df.iterrows(): + processed_data.append(process_example(row)) + + # 创建数据集 + dataset = Dataset.from_list(processed_data) + + # 划分训练集和验证集 + if len(dataset) > 100: # 如果数据足够大,进行划分 + dataset = dataset.train_test_split(test_size=0.1, seed=42) + else: + # 数据较小时,复制训练集作为验证集 + dataset = { + 'train': dataset, + 'val': dataset.select(range(min(10, len(dataset)))) # 选择前10个作为验证 + } + + return dataset + +def get_custom_protein_data_with_prompts(data_path: str = None, + prompt_templates: Dict[str, str] = None) -> Dataset: + """ + 更灵活的蛋白质数据加载函数,支持多种prompt模板 + """ + import pandas as pd + from datasets import Dataset + import random + + if data_path is None: + return get_kegg_questions() + + # 读取数据 + df = pd.read_csv(data_path) + + def process_example(row, template_name=None): + # 随机选择或指定template + if template_name is None: + template_name = random.choice(list(prompt_templates.keys())) + + template = prompt_templates[template_name] + + # 格式化prompt + prompt_text = template.format( + aa_seq=row['aa_seq'][:500] + "..." 
def get_gsm8k_questions(question_prompt: str) -> Dataset:
    """
    Load ``openai/gsm8k`` (``main`` config) and attach a fixed placeholder prompt.

    Every record is mapped to a chat-style ``prompt`` with one DNA placeholder per
    example sequence plus a fixed instruction text, the three example sequences as
    ``dna_sequences``, and the text after ``####`` in the original answer.

    Args:
        question_prompt: Accepted but not used by the current implementation.

    Returns:
        The mapped dataset.
    """
    placeholder_sequences = ["ATCTACATGCAT", "CAGCAGCTACAG", "CATCACATCGACATCGAC"]

    def _to_grpo_example(record):
        # One DNA slot per placeholder sequence, followed by the fixed text part.
        content = [{'type': 'dna', 'text': None} for _ in placeholder_sequences]
        content.append({'type': 'text', 'text': 'Give me a short introduction to large language model.'})
        return {
            'prompt': [{'role': 'user', 'content': content}],
            'dna_sequences': list(placeholder_sequences),
            'answer': extract_hash_answer(record['answer']),
        }

    gsm8k = load_dataset('openai/gsm8k', 'main')  # type: ignore
    return gsm8k.map(_to_grpo_example)  # type: ignore
def accuracy_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """
    Reward function: score each completion by answer accuracy (protein classification).

    The predicted answer is the content of the ``<answer>...</answer>`` tag, or the
    whole response when the tag is absent. Both prediction and reference are
    lower-cased and stripped of non-word characters before comparison.

    Scoring per completion:
      * 1.0 - the normalised reference is contained in (or equal to) the prediction;
      * 0.5 - at least one word of a multi-word reference occurs in the prediction;
      * 0.0 - otherwise, including when the reference is empty after normalisation.

    Note that containment is a substring test, so a reference of ``"1"`` also
    matches a prediction of ``"10"``.

    Args:
        prompts: Prompts for the batch (unused).
        completions: One list per sample whose first item is a dict with ``"content"``.
        answer: Reference answers. A list is indexed per completion (falling back to
            its first element when too short); any other value is used for all.
        **kwargs: Ignored.

    Returns:
        One reward per completion.
    """
    answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
    non_word = re.compile(r'[^\w\d]')

    rewards = []
    for i, completion in enumerate(completions):
        response = completion[0]['content']

        # Extract the content of the answer tag, or fall back to the full response.
        answer_match = answer_pattern.search(response)
        extracted_answer = (answer_match.group(1) if answer_match else response).strip()

        # Pick the reference answer for this completion.
        if isinstance(answer, list) and len(answer) > i:
            correct_answer = str(answer[i]).strip()
        elif isinstance(answer, list) and len(answer) > 0:
            correct_answer = str(answer[0]).strip()
        else:
            correct_answer = str(answer).strip()

        extracted_clean = non_word.sub('', extracted_answer.lower())
        correct_clean = non_word.sub('', correct_answer.lower())

        # An empty reference is a substring of everything; never reward it.
        if not correct_clean:
            rewards.append(0.0)
            continue

        if correct_clean in extracted_clean:
            rewards.append(1.0)  # full match
            continue

        # Split the reference into words BEFORE stripping separators; splitting the
        # already-squashed string can never yield more than one word.
        reference_words = [non_word.sub('', word) for word in correct_answer.lower().split()]
        if any(word and word in extracted_clean for word in reference_words):
            rewards.append(0.5)  # partial match
        else:
            rewards.append(0.0)  # no match

    return rewards
rewards = [] + + for i, response in enumerate(responses): + score = 0.0 + + # 提取答案 + answer_match = re.search(r"(.*?)", response, re.DOTALL) + if answer_match: + extracted_answer = answer_match.group(1).strip() + else: + extracted_answer = response.strip() + + # 获取正确答案 + if isinstance(answer, list) and len(answer) > i: + correct_answer = str(answer[i]).strip() + elif isinstance(answer, list) and len(answer) > 0: + correct_answer = str(answer[0]).strip() + else: + correct_answer = str(answer).strip() + + # 检查是否包含分类关键词 + classification_keywords = ['classification', 'class', 'category', 'type', 'function', 'family'] + if any(keyword in extracted_answer.lower() for keyword in classification_keywords): + score += 0.2 + + # 检查数字匹配(对于数字标签) + if correct_answer.isdigit(): + if correct_answer in extracted_answer: + score += 0.8 + # 检查数字临近性 + try: + extracted_numbers = re.findall(r'\d+', extracted_answer) + if extracted_numbers: + closest_num = min(extracted_numbers, key=lambda x: abs(int(x) - int(correct_answer))) + if abs(int(closest_num) - int(correct_answer)) <= 1: + score += 0.4 + except: + pass + else: + # 文本标签匹配 + if correct_answer.lower() in extracted_answer.lower(): + score += 0.8 + + # 检查是否有推理过程 + if "" in response and "" in response: + think_content = re.search(r"(.*?)", response, re.DOTALL) + if think_content and len(think_content.group(1).strip()) > 20: + score += 0.2 + + rewards.append(min(score, 1.0)) # 确保不超过1.0 + + return rewards + +def repetition_penalty_reward_func(completions, **kwargs) -> list[float]: + """ + 奖励函数:检查重复率(越低越好) + 计算文本中重复词汇的比例,重复率越低奖励越高 + """ + responses = [completion[0]["content"] for completion in completions] + rewards = [] + + for response in responses: + # 提取answer部分的文本 + answer_match = re.search(r"(.*?)", response, re.DOTALL) + if answer_match: + text_to_analyze = answer_match.group(1).strip() + else: + text_to_analyze = response.strip() + + # 分词并计算重复率 + words = text_to_analyze.lower().split() + + if len(words) == 0: + 
rewards.append(0.0) + continue + + # 计算词汇重复率 + unique_words = set(words) + repetition_rate = 1.0 - (len(unique_words) / len(words)) + + # 计算句子重复率 + sentences = [s.strip() for s in text_to_analyze.split('.') if s.strip()] + if len(sentences) > 1: + unique_sentences = set(sentences) + sentence_repetition_rate = 1.0 - (len(unique_sentences) / len(sentences)) + else: + sentence_repetition_rate = 0.0 + + # 综合重复率 + overall_repetition = (repetition_rate + sentence_repetition_rate) / 2 + + # 重复率越低,奖励越高 + reward = max(0.0, 1.0 - overall_repetition * 2) # 乘以2让惩罚更明显 + rewards.append(reward) + + return rewards + +def combined_reward_func(prompts, completions, answer, + format_weight=0.3, accuracy_weight=0.5, repetition_weight=0.2, + **kwargs) -> list[float]: + """ + 组合奖励函数:格式+准确率+重复率的加权组合 + """ + format_rewards = format_correct_reward_func(completions, **kwargs) + accuracy_rewards = accuracy_reward_func(prompts, completions, answer, **kwargs) + repetition_rewards = repetition_penalty_reward_func(completions, **kwargs) + + # 确保权重总和为1 + total_weight = format_weight + accuracy_weight + repetition_weight + if total_weight != 1.0: + format_weight /= total_weight + accuracy_weight /= total_weight + repetition_weight /= total_weight + print(f"Normalized weights - Format: {format_weight:.3f}, Accuracy: {accuracy_weight:.3f}, Repetition: {repetition_weight:.3f}") + + combined_rewards = [] + for f_reward, a_reward, r_reward in zip(format_rewards, accuracy_rewards, repetition_rewards): + combined = (format_weight * f_reward + + accuracy_weight * a_reward + + repetition_weight * r_reward) + combined_rewards.append(combined) + + return combined_rewards + +# 保留一些原有的奖励函数作为备选 +def less_than_4_reward_func(completions, **kwargs) -> list[float]: + responses = [completion[0]['content'] for completion in completions] + extracted_responses = [extract_xml_answer(r) for r in responses] + return [0.5 if len(r.split(' ')) <= 4 else 0.0 for r in extracted_responses] + +def 
strict_format_reward_func(completions, **kwargs) -> list[float]: + """Reward function that checks if the completion has a specific format.""" + pattern = r"^\n.*?\n\n.*?\n$" + responses = [completion[0]["content"] for completion in completions] + matches = [re.match(pattern, r) for r in responses] + return [0.5 if match else 0.0 for match in matches] + +def xmlcount_reward_func(completions, **kwargs) -> list[float]: + contents = [completion[0]["content"] for completion in completions] + return [count_xml(c) for c in contents] + +def count_xml(text) -> float: + count = 0.0 + if text.count("\n") == 1: + count += 0.125 + if text.count("\n\n") == 1: + count += 0.125 + return count + +@dataclass +class Blip2ModelConfig(ModelConfig): + # BLIP2 specific configuration + model_name_or_path: str = field(default="blip2-model", metadata={"help": "Model checkpoint for weights initialization."}) + + # BLIP2 Architecture parameters + bert_name: str = field(default="/path/to/bert", metadata={"help": "BERT model for Q-former"}) + num_query_token: int = field(default=8, metadata={"help": "Number of query tokens"}) + cross_attention_freq: int = field(default=2, metadata={"help": "Cross attention frequency"}) + plm_model: str = field(default="facebook/esm2_t30_150M_UR50D", metadata={"help": "Protein language model"}) + plm_tune: str = field(default="freeze", metadata={"help": "PLM tuning strategy"}) + llm_name: str = field(default="facebook/galactica-1.3b", metadata={"help": "Language model name"}) + llm_tune: str = field(default="lora", metadata={"help": "LLM tuning strategy"}) + qformer_tune: str = field(default="train", metadata={"help": "Q-former tuning strategy"}) + peft_dir: str = field(default="", metadata={"help": "PEFT directory"}) + + # LoRA parameters + lora_r: int = field(default=8, metadata={"help": "LoRA rank"}) + lora_alpha: int = field(default=16, metadata={"help": "LoRA alpha"}) + lora_dropout: float = field(default=0.1, metadata={"help": "LoRA dropout"}) + + # 
Training parameters + enbale_gradient_checkpointing: bool = field(default=False, metadata={"help": "Enable gradient checkpointing"}) + enable_flash: bool = field(default=False, metadata={"help": "Enable flash attention"}) + + # Other parameters + cache_dir: str = field(default=None, metadata={"help": "Path to model cache directory."}) + sft_checkpoint: str = field(default=None, metadata={"help": "Path to the checkpoint for SFT."}) + freeze_dna_modules: bool = field(default=False, metadata={"help": "Freeze DNA/protein modules"}) + +@dataclass +class GRPOScriptArguments(ScriptArguments): + """ + Script arguments for the GRPO training script with BLIP2. + """ + dataset_name: str = field(default="wanglab/kegg", metadata={"help": "Dataset name with default."}) + data_file_paths: str = field( + default=None, + metadata={"help": "Path to protein classification CSV file (format: name,aa_seq,label,location,unique_id,pdb_hash)"}, + ) + arrow_cache_dir: str = field( + default=None, + metadata={"help": "Path to arrow cache directory"}, + ) + val_split_ratio: float = field( + default=0.1, + metadata={"help": "Ratio of validation split, default 0.1"}, + ) + reward_funcs: list[str] = field( + # 选项1:使用组合奖励函数(推荐) + default_factory=lambda: ["combined"], + + # 选项2:使用分离的奖励函数 + # default_factory=lambda: ["format_correct", "accuracy", "repetition_penalty"], + + # 选项3:使用蛋白质分类专用奖励 + # default_factory=lambda: ["format_correct", "classification_specific", "repetition_penalty"], + + metadata={"help": "List of reward functions. 
Available: 'combined', 'format_correct', 'accuracy', 'classification_specific', 'repetition_penalty', 'xmlcount', 'strict_format', 'less_than_4'"}, + ) + + # 奖励函数权重配置 + format_weight: float = field( + default=0.3, + metadata={"help": "Weight for format correctness reward (used in combined reward)"} + ) + accuracy_weight: float = field( + default=0.5, + metadata={"help": "Weight for accuracy reward (used in combined reward)"} + ) + repetition_weight: float = field( + default=0.2, + metadata={"help": "Weight for repetition penalty reward (used in combined reward)"} + ) + + # 数据处理参数 + template_name: str = field( + default="classification", + metadata={"help": "Prompt template to use: 'classification', 'function_prediction', 'location_prediction'"} + ) + max_seq_length: int = field( + default=1000, + metadata={"help": "Maximum protein sequence length for display in prompt"} + ) + use_custom_prompts: bool = field( + default=True, + metadata={"help": "Whether to use custom protein-specific prompts"} + ) + +reward_funcs_registry = { + # 新的三合一奖励函数 + "combined": combined_reward_func, # 格式+准确率+重复率组合 + + # 分离的奖励函数 + "format_correct": format_correct_reward_func, # 格式正确性 + "accuracy": accuracy_reward_func, # 准确率 + "repetition_penalty": repetition_penalty_reward_func, # 重复率惩罚 + "classification_specific": classification_specific_reward_func, # 蛋白质分类专用 + + # 原有的奖励函数(保留作为备选) + "xmlcount": xmlcount_reward_func, + "strict_format": strict_format_reward_func, + "less_than_4": less_than_4_reward_func, +} + +def get_vlm_module(model_name_or_path): + # Always use BLIP2 module for this implementation + return Blip2DNAModule + +def create_blip2_args_from_config(model_args): + """Create BLIP2 args from model config""" + # Convert model config to the format expected by BLIP2 + blip2_args = { + 'bert_name': model_args.bert_name, + 'num_query_token': model_args.num_query_token, + 'cross_attention_freq': model_args.cross_attention_freq, + 'plm_model': model_args.plm_model, + 'plm_tune': 
model_args.plm_tune, + 'llm_name': model_args.llm_name, + 'llm_tune': model_args.llm_tune, + 'qformer_tune': model_args.qformer_tune, + 'peft_dir': model_args.peft_dir, + 'lora_r': model_args.lora_r, + 'lora_alpha': model_args.lora_alpha, + 'lora_dropout': model_args.lora_dropout, + 'enbale_gradient_checkpointing': model_args.enbale_gradient_checkpointing, + 'enable_flash': model_args.enable_flash, + } + return blip2_args + +def _prep_for_training(model, training_args): + """ + Prepare BLIP2 model for training with LoRA. + """ + # The BLIP2 model should handle its own LoRA setup + # This is mainly for any additional preparation needed + + target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"] + + lora_config = LoraConfig( + r=training_args.lora_r, + lora_alpha=training_args.lora_alpha, + lora_dropout=training_args.lora_dropout, + target_modules=target_modules, + init_lora_weights="gaussian", + bias="none", + task_type="CAUSAL_LM", + ) + + return lora_config + +def main(script_args, training_args, model_args): + print(training_args.output_dir) + torch.cuda.empty_cache() + torch.set_float32_matmul_precision("medium") + + # Create BLIP2 model + blip2_args = create_blip2_args_from_config(model_args) + model = Blip2Stage2(blip2_args) + + # Load checkpoint if specified + if model_args.sft_checkpoint is not None: + print(f"Loading SFT checkpoint from {model_args.sft_checkpoint}") + model = Blip2Stage2.load_from_checkpoint(model_args.sft_checkpoint, strict=False, args=blip2_args, map_location='cpu') + + # if os.path.isdir(model_args.sft_checkpoint): + # # Load Lightning checkpoint + # checkpoint = torch.load(os.path.join(model_args.sft_checkpoint, "last.ckpt"), map_location='cpu') + # model.load_state_dict(checkpoint['state_dict'], strict=False) + # print("Loaded Lightning checkpoint") + # else: + # # Load PyTorch state dict + # checkpoint = torch.load(model_args.sft_checkpoint, map_location='cpu') + + # if "state_dict" in checkpoint: + # state_dict = 
checkpoint["state_dict"] + # else: + # state_dict = checkpoint + + # # Remove module prefix if present + # state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + + # result = model.load_state_dict(state_dict, strict=False) + # print(f"Loaded checkpoint with {len(result.missing_keys)} missing keys and {len(result.unexpected_keys)} unexpected keys") + + # Get reward functions with weights + reward_funcs = [] + for func_name in script_args.reward_funcs: + if func_name == "combined": + # 为组合奖励函数传递权重参数 + def weighted_combined_reward(prompts, completions, answer, **kwargs): + return combined_reward_func( + prompts, completions, answer, + format_weight=script_args.format_weight, + accuracy_weight=script_args.accuracy_weight, + repetition_weight=script_args.repetition_weight, + **kwargs + ) + reward_funcs.append(weighted_combined_reward) + else: + reward_funcs.append(reward_funcs_registry[func_name]) + + print("reward_funcs:", [func.__name__ if hasattr(func, '__name__') else 'weighted_combined_reward' for func in reward_funcs]) + print(f"Reward weights - Format: {script_args.format_weight}, Accuracy: {script_args.accuracy_weight}, Repetition: {script_args.repetition_weight}") + + vlm_module_cls = get_vlm_module(model_args.model_name_or_path) + print("using vlm module:", vlm_module_cls.__name__) + question_prompt = vlm_module_cls.get_question_template() + + # Load dataset based on data source + if script_args.data_file_paths and script_args.use_custom_prompts: + print(f"Loading custom protein data from: {script_args.data_file_paths}") + + + dataset = get_custom_protein_data_with_prompts( + data_path=script_args.data_file_paths, + prompt_templates=prompt_templates, + template_name=script_args.template_name + ) + elif script_args.data_file_paths: + print(f"Loading protein data from: {script_args.data_file_paths}") + dataset = get_protein_classification_data( + data_path=script_args.data_file_paths + ) + else: + print("Using default KEGG dataset") + 
dataset = get_kegg_questions() + + print("Dataset loaded:") + print(f"Train size: {len(dataset['train'])}") + print(f"Val size: {len(dataset.get('val', []))}") + + # 打印数据样例 + if len(dataset['train']) > 0: + print("\nSample data:") + sample = dataset['train'][0] + print(f"Prompt type: {type(sample.get('prompt', 'Unknown'))}") + print(f"DNA sequences count: {len(sample.get('dna_sequences', []))}") + print(f"Answer: {sample.get('answer', 'N/A')}") + if 'metadata' in sample: + print(f"Metadata: {sample['metadata']}") + print(f"First 100 chars of sequence: {sample.get('dna_sequences', [''])[0][:100]}...") + + + # Custom callback to handle saving with PyTorch's native mechanism + custom_save_callback = SaveWithPyTorchCallback() + + # Initialize the BLIP2 GRPO trainer + trainer = Blip2GRPOTrainer( + model=model, + reward_funcs=reward_funcs, + args=training_args, + dna_module=vlm_module_cls(), + train_dataset=dataset['train'], + eval_dataset=dataset['val'] if training_args.eval_strategy != "no" else None, + peft_config=get_peft_config(model_args), + attn_implementation=getattr(model_args, 'attn_implementation', 'flash_attention_2'), + torch_dtype=getattr(model_args, 'torch_dtype', 'bfloat16'), + callbacks=[custom_save_callback], + ) + + # Set the trainer to save in PyTorch format instead of safetensors + training_args.save_safetensors = False + + # Train the model + trainer.train() + +if __name__ == "__main__": + print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}") + parser = TrlParser((GRPOScriptArguments, DNALLMGRPOConfig, Blip2ModelConfig)) + script_args, training_args, model_args = parser.parse_args_and_config() + + # Ensure we use PyTorch's save mechanism instead of safetensors + training_args.save_safetensors = False + + main(script_args, training_args, model_args) + +# 使用示例: +""" +使用你的蛋白质数据进行训练: + +1. 准备CSV文件,格式:name,aa_seq,label,location,unique_id,pdb_hash + +2. 
运行训练: +python blip2_reason.py \ + --data_file_paths /path/to/your/protein_data.csv \ + --reward_funcs combined \ + --format_weight 0.2 \ + --accuracy_weight 0.6 \ + --repetition_weight 0.2 \ + --use_custom_prompts \ + --template_name classification \ + --max_seq_length 1000 \ + --output_dir ./output \ + --per_device_train_batch_size 4 \ + --num_train_epochs 3 \ + --learning_rate 1e-5 + +3. 或者使用分离的奖励函数: +python blip2_reason.py \ + --data_file_paths /path/to/your/protein_data.csv \ + --reward_funcs format_correct classification_specific repetition_penalty \ + --use_custom_prompts \ + --template_name function_prediction + +数据格式示例: +P0DM40,MLRVVVESASINPPLSTTPKAFVTVYFRDMMKRTRVEEGHDPIWNETLIWHLWNQPLENDSFLKVILQDSVSKKKERFIGLATVPLKRLAQRPKEVMFVRDLILLNHSMKPTNCTVTLHVAQIYDQDTEMTGNEELLGSTVNEVTQKKLMVSGLPMHRALASKPQHFQVRVKVFEARQLLGNNIKPVVKVNIADQQHLTRIKMGNNPFFNEIFFQNFHEVPAKFFEENISIEVVDSAASRSKAEIGRFQTDIGFIYHSPGHTLLRKWLGLCQRNKTTSGVRGYLKVTICALGVGDQALVDQKLPYEQNTRVQIFKSKEVPVSLAYLQFFIYCAEDLHFGTHKSATPVLEVELIGDKLRTKPQNPSDNPIWNQILTFQIQLPCLSSYIKFRVMDCSKYKCQDEIGSASLCLSQISSTGEEIQGMYSGFLPCFGPSFLTLRGGKKPPFRTSEEGTCIMDAVQHGLAYRGRIFVEIVTKIKSQQDSVMKDLSQEVTQVEMQYYRQKYGLCVIFLSCTMMPKFKDLIQFEVSMGHYGNKTDPNYKPLVSTTQYSPVIYDGTTYHYVPWYNTKPVVAVTSNWEDVGFRMNCLNLLHITRDRLKTNLDILKSIRNPRDPALLQQWEKLLKELQEDCRRPLPCMTDQPRANSLDRNKWQLRSQLLQQLAQMAKEAKPVNMVGTAKEWLHRLNAVIPEPQESLPDVLIWLMSRQQRVAYARVPAHTVLFSPAGPLSSGKFCGKIQNILLQYPEGEGQDTFPASLRVCMWLGNVKYSKNLKLLQQGSMVVYAETYENQAKTRDDWGQQGLYHCPNFSDVMGRKALPKTDFKAPPGWHWKDDWVVEPQRRLLLDIDINKSQVLEEVYENQLRNATGAWVPAAIPNTDVNGQPVEALENVKCPQGWHFKKNWIVKLNHAVDSEGWEYGVGIPPSGLPQIWNSVEKTYHSCRRRRWVRVRFRNHKELGQERSQEQETLSFLQMQDLSEEGKEGWEYGTFDSRFHLDPQPTSRFRRRCWHRQLAPNKDRGVASIFLLEGSLAVEQKDQPRKEMEKTRSWQPWKDLRHTPEDPRIPTTPFIYYILNKPHYYQLFCYIYQARNLMYNQILTFQEPFIQVVFLNHSLCTQTLRSSAAPTWSQSIIFQHLLLFEDPKDTRENPPLVVLELWQHDSRGNKILWGRSMWPPVVWLGLQDWVFTPLRWHPLVRELGEEEGEILASCELILETQKLKELHPPILSIPCKDGIYLLPKNIQPTMKMMAIEIMAWGLRNMTKVRYPQLLLECGGESLKTEPISNFQENPNFPTSTFFFTVFMPLEETHAQPLVVKVVDNQEYGQQIVVGQANIDFLQPYFCDPWSLNYTTVKLPT
LSVKKPDTFLDFVYKKFWFDSSKDEEVYEEEVDWWSKLFWATGDADKSLNYNHKSYHTLKVYDCELEAVLTFKGLQDFCQTFKLYQEKPKVDSPVVGEFKGLFRIYPFPEDPEAPKPPRQFSAWPEIEDFPQMCLVRVYLIRAINLQPQDYNGLCDPYVILKLGQTKLGSRDSYYPNTLDPIFGMMYELTCNIPLEKDLEIQLFDFDLITADDEIGSTVIDLENRLLSGFGARCGLSKSYCKSGPFKWRDQMTPSYLLYRYAKQKGLPPPVFDLEGDSLYYNGETFKLQSFESAPPTYKHLGPKKERLALYILNTQGLVPEHVETRTLHSNSQPGIDQGKIQMWVDIFPKMLGPPGPQVNISPRKPKRYQLRCIIWSTAEVDLVQETFSKEKMSDIYVKGWLFGLEEDTQKTDVHYHSLTGEATFNWRFIFTMDYLTTERACVQSQKDYIWSLDPTSTKFPARLMIQIWDNDFFSPDDFLGVLELDLSDMPLPAQNIKQCSLKMMETDSKWPFTPQKRISLFKKTNVTGWWPCQVLDGDKWRLSGKVKMTLEMLSEREALIRPAGRGQSEPNQFPMLHPPERNDSFLLWYQSPIKNFCYAVCKRYRSKIICLVVTLVIGFILLNFVYSAPSYFAMNWIKPQLRLSSPIKIVNLIGTVNTSNINSSILTMEGSTYHASHVFPEAPAP,0,M,af67d99c09f74ea8af5004cc2906bbc5,d55cbc3d94bd9668d97a678b4a04176a +""" \ No newline at end of file diff --git a/BioReason-0813/model/__pycache__/blip2.cpython-310.pyc b/BioReason-0813/model/__pycache__/blip2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab459768cc63ad84d6ba426307ffbbcd30bcdefa Binary files /dev/null and b/BioReason-0813/model/__pycache__/blip2.cpython-310.pyc differ diff --git a/BioReason-0813/model/__pycache__/blip2_opt.cpython-310.pyc b/BioReason-0813/model/__pycache__/blip2_opt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a88b57a5eee4753e3b43d571b1c783d25193ec2 Binary files /dev/null and b/BioReason-0813/model/__pycache__/blip2_opt.cpython-310.pyc differ diff --git a/BioReason-0813/model/__pycache__/blip2_opt.cpython-311.pyc b/BioReason-0813/model/__pycache__/blip2_opt.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f12d93613e2d2ffcfb1e7b94772c240a28afa183 Binary files /dev/null and b/BioReason-0813/model/__pycache__/blip2_opt.cpython-311.pyc differ diff --git a/BioReason-0813/model/__pycache__/blip2_stage2.cpython-310.pyc b/BioReason-0813/model/__pycache__/blip2_stage2.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..13ed04d876ed53190a07fd9c3e7f225b12b08ada Binary files /dev/null and b/BioReason-0813/model/__pycache__/blip2_stage2.cpython-310.pyc differ diff --git a/BioReason-0813/model/__pycache__/blip2_stage2.cpython-311.pyc b/BioReason-0813/model/__pycache__/blip2_stage2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1751c379ce7a940a74fd983798ba3ae35e7d972e Binary files /dev/null and b/BioReason-0813/model/__pycache__/blip2_stage2.cpython-311.pyc differ diff --git a/BioReason-0813/model/__pycache__/help_funcs.cpython-310.pyc b/BioReason-0813/model/__pycache__/help_funcs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..795af64ff9203d70ee192252787143b73bb5603b Binary files /dev/null and b/BioReason-0813/model/__pycache__/help_funcs.cpython-310.pyc differ diff --git a/BioReason-0813/model/blip2.py b/BioReason-0813/model/blip2.py new file mode 100644 index 0000000000000000000000000000000000000000..40c386877ebfe65229e7550b3f6a92b3df05d867 --- /dev/null +++ b/BioReason-0813/model/blip2.py @@ -0,0 +1,126 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +import torch +import torch.nn as nn + +from lavis.models.base_model import BaseModel +from lavis.models.blip2_models.Qformer import BertConfig, BertLMHeadModel +from transformers import BertTokenizer, BitsAndBytesConfig +from transformers import EsmTokenizer, EsmModel +import os +from pathlib import Path # 添加到文件顶部 + + +def get_gpu_memory(device=0): + # t = torch.cuda.get_device_properties(device).total_memory + # r = torch.cuda.memory_reserved(device) + # a = torch.cuda.memory_allocated(device) + # f = r-a # free inside reserved + free, total = torch.cuda.mem_get_info(device) + free = free / (1024 ** 3) + total = total / (1024 ** 3) + return free, total-free, total + + +class Blip2Base(BaseModel): + # @classmethod + # def init_tokenizer(cls): + # tokenizer = BertTokenizer.from_pretrained('./bert_pretrained/') + # tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + # return tokenizer + + @classmethod + def init_Qformer(cls, model_name, num_query_token, plm_width, cross_attention_freq=2): + # assert model_name == 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract' + # print("bert load microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext") + + print(f"Loading Qformer from: {model_name}") + + # 修改2:添加本地路径检查逻辑 + if not model_name.startswith('microsoft/') and Path(model_name).exists(): + print("Loading from local path...") + else: + print("Loading from Hugging Face Hub...") + + encoder_config = BertConfig.from_pretrained(model_name) + encoder_config.encoder_width = plm_width + # insert cross-attention layer every other block + encoder_config.add_cross_attention = True + encoder_config.cross_attention_freq = cross_attention_freq + encoder_config.query_length = num_query_token + + Qformer = BertLMHeadModel.from_pretrained(model_name, config=encoder_config) + query_tokens = nn.Parameter( + torch.zeros(1, 
num_query_token, encoder_config.hidden_size) + ) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + + tokenizer = BertTokenizer.from_pretrained(model_name) + tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + return tokenizer, Qformer, query_tokens + + + def init_protein_encoder(self, plm_name, load_4bit=False): + # assert plm_name.startswith('facebook/esm2') + # plm_tokenizer = EsmTokenizer.from_pretrained(plm_name) + # 检查是否为本地路径(判断是否存在文件夹或文件) + if os.path.isdir(plm_name) or os.path.exists(os.path.join(plm_name, "config.json")): + print(f"Loading local PLM from {plm_name}") + plm_tokenizer = EsmTokenizer.from_pretrained(plm_name) + else: + # 保留远程加载逻辑(可选) + print(f"Loading remote PLM from {plm_name}") + plm_tokenizer = EsmTokenizer.from_pretrained(plm_name) + + if not load_4bit: + plm = EsmModel.from_pretrained(plm_name, add_pooling_layer=False, torch_dtype=torch.bfloat16) + else: + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4', + ) + ## give a device map that assign all layers to device 0 + outputs = get_gpu_memory(6) + used_memory = outputs[1] + if used_memory > 1: + device_map = {"": 7} + else: + device_map = {"": 6} + plm = EsmModel.from_pretrained( + plm_name, + add_pooling_layer=False, + quantization_config=quant_config, + load_in_4bit=True, + load_in_8bit=False, + device_map=device_map, + torch_dtype=torch.bfloat16, + ) + + plm.num_features = plm.config.hidden_size + ln_layer = nn.LayerNorm(plm.num_features) + return plm_tokenizer, plm, ln_layer + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +# class LayerNorm(nn.LayerNorm): +# """Subclass torch's LayerNorm to handle fp16.""" + +# def forward(self, x: torch.Tensor): +# 
orig_type = x.dtype +# ret = super().forward(x.type(torch.float32)) +# return ret.type(orig_type) + diff --git a/BioReason-0813/model/blip2_opt.py b/BioReason-0813/model/blip2_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..75a3f99cca617b39cd1801a99a04abb129a23f6c --- /dev/null +++ b/BioReason-0813/model/blip2_opt.py @@ -0,0 +1,550 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +import logging +import torch +import torch.nn as nn +from torch.cuda.amp import autocast as autocast +# from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, PeftModel +from lavis.models.blip2_models.blip2 import disabled_train +from model.blip2 import Blip2Base +from transformers import AutoTokenizer +from transformers import OPTForCausalLM +from transformers import AutoTokenizer, AutoModelForCausalLM +from opendelta import LoraModel +from opendelta.delta_models.lora import LoraConfig as DeltaLoraConfig +from transformers import BertTokenizer, BitsAndBytesConfig +from model.help_funcs import hf_enable_gradient_checkpointing +import json +# from accelerate import Accelerator +# import torch.distributed as dist + +# from peft.tuners.lora import LoraLayer +# from peft import ( +# prepare_model_for_kbit_training, +# LoraConfig as PeftLoraConfig, +# get_peft_model, +# PeftModel +# ) + +# from opendelta.delta_configs + +opt_model_list = [ + "facebook/galactica-125m", + "facebook/galactica-1.3b", + "facebook/galactica-6.7b", + "facebook/galactica-30b", +] + +def get_gpu_memory(device=0): + # t = torch.cuda.get_device_properties(device).total_memory + # r = torch.cuda.memory_reserved(device) + # a = torch.cuda.memory_allocated(device) + # f = r-a # free inside reserved + free, total = torch.cuda.mem_get_info(device) + free = free / (1024 ** 3) + 
total = total / (1024 ** 3) + return free, total-free, total + +def mask_by_len(input, lens, fill_value=0): + ''' + input: shape = [N, D] + lens: shape = [N] + ''' + mask = torch.arange(input.shape[1], device=input.device).reshape(1, -1) + mask = mask < lens.reshape(-1, 1) + input[mask] = fill_value + return input + + + +class Blip2OPT(Blip2Base): + """ + BLIP2 first-stage model with Q-former and ViT. + Supported model types: + - pretrained: pretrained model with vit-g + - pretrain_vitL: pretrained model with vit-large + - coco: fintuned model on coco + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip2", "pretrain") + """ + def __init__( + self, + bert_name, + num_query_token=32, + cross_attention_freq=2, + plm_model="facebook/esm2_t30_150M_UR50D", + plm_tune='freeze', + llm_name="facebook/galactica-1.3b", + llm_tune='freeze', + qformer_tune='train', + peft_dir='', + args=None, + ): + super().__init__() + self.args = args + self.enbale_gradient_checkpointing = args.enbale_gradient_checkpointing + + self.plm_tokenizer, self.plm, self.ln_layer = self.init_protein_encoder(plm_model) + self.plm_tune = plm_tune + if plm_tune == 'freeze': + for name, param in self.plm.named_parameters(): + param.requires_grad = False + self.plm = self.plm.eval() + self.plm.train = disabled_train + logging.info("freeze plm encoder") + elif plm_tune == 'lora': + lora_config = DeltaLoraConfig(args.lora_r, + args.lora_alpha, + args.lora_dropout, + modified_modules=["query", "value"]) + self.delta = LoraModel.from_config(lora_config, self.plm) + self.delta.freeze_module(set_state_dict=False) + self.delta.log() + else: + raise NotImplementedError() + + self.num_query_token = num_query_token + self.qformer_tokenizer, self.Qformer, self.query_tokens = self.init_Qformer(bert_name, num_query_token, self.plm.num_features, cross_attention_freq) + ### remove the unused parameters + self.Qformer.cls = None + self.Qformer.bert.embeddings.word_embeddings = None + 
self.Qformer.bert.embeddings.position_embeddings = None + for layer in self.Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + + # === 3. 控制 Qformer 是否冻结 === + self.qformer_tune = qformer_tune + if self.qformer_tune == 'freeze': + for name, param in self.Qformer.named_parameters(): + param.requires_grad = False + self.Qformer = self.Qformer.eval() + self.Qformer.train = disabled_train + logging.info("freeze Qformer encoder") + elif self.qformer_tune == 'train': + logging.info("train Qformer encoder") + else: + raise NotImplementedError(f"Unsupported qformer_tune mode: {self.qformer_tune}") + + ## initialize llm model + # self.init_distributed() + self.llm_model, self.llm_tokenizer = self.load_llm(llm_name) + + #self.llm_model, self.llm_tokenizer = self.load_model_on_single_gpu(llm_name) + self.eos_token_id = self.llm_tokenizer.eos_token_id + self.pad_token_id = self.llm_tokenizer.pad_token_id + + if llm_tune == 'freeze': + for name, param in self.llm_model.named_parameters(): + param.requires_grad = False + elif llm_tune == 'full': + for name, param in self.llm_model.named_parameters(): + param.requires_grad = True + elif llm_tune == 'lora': + lora_config = DeltaLoraConfig(args.lora_r, + args.lora_alpha, + args.lora_dropout,) + self.delta = LoraModel.from_config(lora_config, self.llm_model) + self.delta.freeze_module(set_state_dict=False) + self.delta.log() + elif llm_tune == 'mid_lora': + print("================") + print("加载了小lora") + print("=================") + lora_config = DeltaLoraConfig(args.lora_r, args.lora_alpha, args.lora_dropout, modified_modules=["q_proj", "v_proj", 'k_proj', "out_proj", "fc1", "fc2"]) + self.delta = LoraModel.from_config(lora_config, self.llm_model) + self.delta.freeze_module(set_state_dict=False) + self.delta.log() + elif llm_tune == 'peft_lora': + config = PeftLoraConfig( + r=args.lora_r, + lora_alpha=args.lora_alpha, + # target_modules=modules, + lora_dropout=args.lora_dropout, + bias="none", + 
task_type="CAUSAL_LM", + ) + self.llm_model = get_peft_model(self.llm_model, config) + for name, module in self.llm_model.named_modules(): + if isinstance(module, LoraLayer): + if True: + module = module.to(torch.bfloat16) + if 'norm' in name: + module = module.to(torch.float32) + if 'lm_head' in name or 'embed_tokens' in name: + if hasattr(module, 'weight'): + if True and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) + else: + raise NotImplementedError() + + ## fixme: this is different from the original BLIP2 + # self.eos_token_id = self.llm_tokenizer( + # "\n", add_special_tokens=False + # ).input_ids[0] + self.opt_proj = nn.Linear(self.Qformer.config.hidden_size, self.llm_model.config.hidden_size) + + def load_llm(self, llm_model, load_4bit=False, enable_gradient_checkpointing=True): + llm_tokenizer = AutoTokenizer.from_pretrained(llm_model, use_fast=False, padding_side='right') + llm_tokenizer.add_special_tokens({'pad_token': ''}) + + special_tokens_dict = {'additional_special_tokens': ['', '']} + llm_tokenizer.add_special_tokens(special_tokens_dict) + + llm_model = AutoModelForCausalLM.from_pretrained(llm_model, torch_dtype=torch.bfloat16) + llm_model.resize_token_embeddings(len(llm_tokenizer)) ## this will cause bug when + + return llm_model, llm_tokenizer + + + # def forward(self, batch): + # prot_batch, text_batch = batch + # prot_embeds = self.plm(**prot_batch, return_dict=True) + # prot_embeds = prot_embeds.last_hidden_state + # if self.plm_tune == 'freeze': + # prot_embeds = prot_embeds.detach() + # prot_embeds = self.ln_layer(prot_embeds) + # device = prot_embeds.device + # query_tokens = self.query_tokens.expand(prot_embeds.shape[0], -1, -1) + # query_output = self.Qformer.bert( + # query_embeds=query_tokens, + # encoder_hidden_states=prot_embeds, + # encoder_attention_mask=prot_batch.attention_mask, + # return_dict=True, + # ) + # prot_tokens = self.opt_proj(query_output.last_hidden_state) + # prot_mask = 
torch.ones(prot_tokens.shape[:2], dtype=text_batch.attention_mask.dtype, device=device) + # prot_empty_targets = torch.ones(prot_tokens.shape[:2], dtype=torch.long, device=device).fill_(-100) + + # targets = text_batch.input_ids.masked_fill(text_batch.input_ids == self.llm_tokenizer.pad_token_id, -100) + # targets = targets.masked_fill(text_batch.token_type_ids == 0, -100) + # targets = torch.cat([prot_empty_targets, targets], dim=1) + + # inputs_embeds = self.llm_model.get_input_embeddings()(text_batch.input_ids) + # inputs_embeds = torch.cat((prot_tokens, inputs_embeds), dim=1) + # attention_mask = torch.cat([prot_mask, text_batch.attention_mask], dim=1) + + # outputs = self.llm_model( + # inputs_embeds=inputs_embeds, + # attention_mask=attention_mask, + # return_dict=True, + # labels=targets, + # ) + # loss = outputs.loss + # return loss + + def forward(self, batch): + prot_batch, prompt_batch, text_dict = batch + text_seqs = text_dict['targets'] + batch_size = prompt_batch['input_ids'].size(0) + # print("{{{{{}}}}}") + # print(batch_size) + + prot_embeds = self.plm(**prot_batch, return_dict=True) + prot_embeds = prot_embeds.last_hidden_state + if self.plm_tune == 'freeze': + prot_embeds = prot_embeds.detach() + prot_embeds = self.ln_layer(prot_embeds) + device = prot_embeds.device + query_tokens = self.query_tokens.expand(prot_embeds.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=prot_embeds, + encoder_attention_mask=prot_batch.attention_mask, + return_dict=True, + ) + prot_tokens = self.opt_proj(query_output.last_hidden_state) + prot_mask = torch.ones(prot_tokens.shape[:2], dtype=torch.long, device=device) + + # === Step 3: 编码 prompt 输入 === + prompt_embeds = self.llm_model.get_input_embeddings()(prompt_batch.input_ids) # [B, L_prompt, D_llm] + prompt_mask = prompt_batch['attention_mask'] + + + text_batch = self.llm_tokenizer( + list(text_seqs), + padding='longest', + truncation=True, + 
max_length=1024, + return_tensors='pt' + ).to(device) + target_embeds = self.llm_model.get_input_embeddings()(text_batch['input_ids']) # [B, T, D] + target_mask = text_batch['attention_mask'] + targets = text_batch['input_ids'].masked_fill(text_batch['input_ids'] == self.llm_tokenizer.pad_token_id, -100) + + # === : 加入 ChatML 特殊 token embedding === + embedding_layer = self.llm_model.get_input_embeddings() + + def embed_special_str(token_str): + # 先 tokenize,得到一系列 ID + ids = self.llm_tokenizer(token_str, add_special_tokens=False).input_ids + # 把它变成 [1, N] tensor + ids_tensor = torch.tensor([ids], device=device) + # 查 embedding 层: + embs = embedding_layer(ids_tensor) # shape [1, N, D] + # Expand 到 batch 大小 + return embs.expand(batch_size, -1, -1) + + # 示例 + embed_im_start = embed_special_str("<|im_start|>user\n protein sequence is:") # 可能对应多个 sub-tokens + embed_protein = embed_special_str("") + embed_im_end = embed_special_str("<|im_end|>\n") + embed_assistant= embed_special_str("<|im_start|>assistant\n") + + + user_embeds = torch.cat([embed_im_start, prot_tokens , embed_protein, prompt_embeds,embed_im_end, embed_assistant], dim=1) + user_mask = torch.ones(user_embeds.shape[:2], dtype=torch.long, device=device) + + assistant_embeds = target_embeds + assistant_mask = target_mask + + inputs_embeds = torch.cat([user_embeds, assistant_embeds], dim=1) + attention_mask = torch.cat([user_mask, assistant_mask], dim=1) + + # === Step 6: 构造 labels,只监督 assistant 部分 === + ignore_labels = torch.full(user_embeds.shape[:2], -100, dtype=torch.long, device=device) + assistant_labels = targets + labels = torch.cat([ignore_labels, assistant_labels], dim=1) + + # print("embed_im_start:", embed_im_start.shape) + # print("prompt_embeds:", prompt_embeds.shape) + # print("prot_tokens:", prot_tokens.shape) + # print("embed_im_end:", embed_im_end.shape) + # print("embed_assistant:", embed_assistant.shape) + # print("target_embeds:", target_embeds.shape) + # print("labels:", labels.shape) + # 
print("inputs_embeds:", inputs_embeds.shape) + + #============================ + + # inputs_embeds = torch.cat([prot_tokens, prompt_embeds, target_embeds], dim=1) + # attention_mask = torch.cat([prot_mask, prompt_mask, target_mask], dim=1) + + # # === Step 7: 构造 labels,只监督 target 部分 === + # prot_label_pad = torch.full(prot_tokens.shape[:2], -100, dtype=torch.long, device=device) + # prompt_label_pad = torch.full(prompt_mask.shape, -100, dtype=torch.long, device=device) + # labels = torch.cat([prot_label_pad, prompt_label_pad, targets], dim=1) + + # === Step 8: 送入 LLM === + outputs = self.llm_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + labels=labels, + return_dict=True, + ) + loss = outputs.loss + # prot_mask = torch.ones(prot_tokens.shape[:2], dtype=text_batch.attention_mask.dtype, device=device) + # prot_empty_targets = torch.ones(prot_tokens.shape[:2], dtype=torch.long, device=device).fill_(-100) + # empty_targets = torch.ones(prompt_batch.attention_mask.shape, dtype=torch.long, device=device).fill_(-100) + # targets = text_batch.input_ids.masked_fill(text_batch.input_ids == self.llm_tokenizer.pad_token_id, -100) + # targets = torch.cat([prot_empty_targets, empty_targets, targets], dim=1) + + # prompt_embeds = self.llm_model.get_input_embeddings()(prompt_batch.input_ids) + # inputs_embeds = self.llm_model.get_input_embeddings()(text_batch.input_ids) + # inputs_embeds = torch.cat((prot_tokens, prompt_embeds, inputs_embeds), dim=1) + # attention_mask = torch.cat([prot_mask, prompt_batch.attention_mask, text_batch.attention_mask], dim=1) + + # outputs = self.llm_model( + # inputs_embeds=inputs_embeds, + # attention_mask=attention_mask, + # return_dict=True, + # labels=targets, + # ) + # loss = outputs.loss + return loss + + # def forwardv2(self, batch): + # prot_batch, prompt_batch, text_batch = batch + # prot_embeds = self.plm(**prot_batch, return_dict=True) + # prot_embeds = prot_embeds.last_hidden_state + # if self.plm_tune == 
'freeze': + # prot_embeds = prot_embeds.detach() + # prot_embeds = self.ln_layer(prot_embeds) + # device = prot_embeds.device + # query_tokens = self.query_tokens.expand(prot_embeds.shape[0], -1, -1) + # query_output = self.Qformer.bert( + # query_embeds=query_tokens, + # encoder_hidden_states=prot_embeds, + # encoder_attention_mask=prot_batch.attention_mask, + # return_dict=True, + # ) + # prot_tokens = self.opt_proj(query_output.last_hidden_state) + # prot_mask = torch.ones(prot_tokens.shape[:2], dtype=text_batch.attention_mask.dtype, device=device) + # targets = text_batch.input_ids.masked_fill(text_batch.input_ids == self.llm_tokenizer.pad_token_id, -100) + + # ### forward prefix + # prompt_embeds = self.llm_model.get_input_embeddings()(prompt_batch.input_ids) + # prefix_embeds = torch.cat([prot_tokens, prompt_embeds], dim=1) + # prefix_mask = torch.cat([prot_mask, prompt_batch.attention_mask], dim=1) + # prefix_output = self.llm_model.model( + # inputs_embeds=prefix_embeds, + # attention_mask=prefix_mask, + # use_cache=True, + # return_dict=True, + # ) + + # ## forward decoding + # if False: + # attention_mask = torch.cat([prot_mask, prompt_batch.attention_mask, text_batch.attention_mask], dim=1) + # else: + # attention_mask = text_batch.attention_mask + # print(prefix_output.past_key_values) + # outputs = self.llm_model( + # input_ids=text_batch.input_ids, + # attention_mask=attention_mask, + # past_key_values=prefix_output.past_key_values, + # return_dict=True, + # labels=targets, + # ) + # loss = outputs.loss + # return loss + + @torch.no_grad() + def generate( + self, + samples, + do_sample=False, + num_beams=5, + max_length=128, + min_length=1, + top_p=0.9, + repetition_penalty=1.0, + length_penalty=1.0, + num_captions=1, + temperature=1, + ): + """ + Args: + samples (dict): A dictionary containing the following keys: + - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W) + num_beams (int): Number of beams for beam search. 
1 means no beam search. + max_length (int): The maximum length of the sequence to be generated. + min_length (int): The minimum length of the sequence to be generated. + top_p (float): The cumulative probability for nucleus sampling. + repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. + num_captions (int): Number of captions to be generated for each image. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + # prot_batch = samples['prot_batch'] + # prompt_batch = samples['prompt_batch'] + + # # with self.maybe_autocast(): + # prot_embeds = self.plm(**prot_batch, return_dict=True) + # prot_embeds = self.ln_layer(prot_embeds.last_hidden_state) + + # query_tokens = self.query_tokens.expand(prot_embeds.shape[0], -1, -1) + # query_output = self.Qformer.bert( + # query_embeds=query_tokens, + # encoder_hidden_states=prot_embeds, + # encoder_attention_mask=prot_batch['attention_mask'], + # return_dict=True, + # ) + # prot_tokens = self.opt_proj(query_output.last_hidden_state) + + + + # # prompt_batch = samples['prompt_batch'] + # prompt_input_ids = prompt_batch['input_ids'] # shape: [B, L] + # # for i, ids in enumerate(prompt_input_ids): + # # print(f"Prompt {i} token length: {len(ids)}") + # decoded_texts = [self.llm_tokenizer.decode(ids, skip_special_tokens=True) for ids in prompt_input_ids] + # # print("=========") + # # print(decoded_texts) + # #print(decoded_texts) + # save_path = "decoded_prompts.json" + + # # 将 list 写入 JSON 文件 + # with open(save_path, 'w', encoding='utf-8') as f: + # json.dump(decoded_texts, f, ensure_ascii=False, indent=4) + + # prompt_attention_mask = prompt_batch['attention_mask'] + # prompt_embeds = self.llm_model.model.embed_tokens(prompt_input_ids) + + # inputs_embeds = torch.cat((prompt_embeds, prot_tokens), dim=1) + + # prot_attention_mask = torch.ones(prot_tokens.shape[:2], dtype=prompt_attention_mask.dtype, device=prompt_attention_mask.device) + # #attention_mask 
= torch.cat([prot_attention_mask, prompt_attention_mask], dim=1) + # attention_mask = torch.cat([ prompt_attention_mask,prot_attention_mask], dim=1) + + #========================== + prot_batch = samples['prot_batch'] + prompt_batch = samples['prompt_batch'] + + + device = prompt_batch['input_ids'].device + batch_size = prompt_batch['input_ids'].size(0) + + # === Step 1: 编码蛋白质 + QFormer === + prot_embeds = self.plm(**prot_batch, return_dict=True).last_hidden_state + prot_embeds = self.ln_layer(prot_embeds) + query_tokens = self.query_tokens.expand(prot_embeds.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=prot_embeds, + encoder_attention_mask=prot_batch['attention_mask'], + return_dict=True, + ) + prot_tokens = self.opt_proj(query_output.last_hidden_state) # [B, L_qformer, D] + + # === Step 2: 编码 prompt 输入 === + prompt_input_ids = prompt_batch['input_ids'] + prompt_attention_mask = prompt_batch['attention_mask'] + prompt_embeds = self.llm_model.get_input_embeddings()(prompt_input_ids) + + # === Step 3: 获取 ChatML 特殊 token 的 embedding === + embedding_layer = self.llm_model.get_input_embeddings() + + def embed_special_str(token_str): + # 先 tokenize,得到一系列 ID + ids = self.llm_tokenizer(token_str, add_special_tokens=False).input_ids + # 把它变成 [1, N] tensor + ids_tensor = torch.tensor([ids], device=device) + # 查 embedding 层: + embs = embedding_layer(ids_tensor) # shape [1, N, D] + # Expand 到 batch 大小 + return embs.expand(batch_size, -1, -1) + + # 示例 + embed_im_start = embed_special_str("<|im_start|>user\nprotein sequence is: ") # 可能对应多个 sub-tokens + embed_protein = embed_special_str("") + embed_im_end = embed_special_str("<|im_end|>\n") + embed_assistant= embed_special_str("<|im_start|>assistant\n") + + + # === Step 4: 拼接 Embeddings === + user_embeds = torch.cat([embed_im_start, prot_tokens, embed_protein, prompt_embeds, embed_im_end], dim=1) + assistant_prefix = embed_assistant # 模型从这里开始生成 + inputs_embeds = 
torch.cat([user_embeds, assistant_prefix], dim=1) + + # === Step 5: attention_mask === + user_mask = torch.ones(user_embeds.shape[:2], dtype=torch.long, device=device) + assistant_mask = torch.ones((batch_size, embed_assistant.size(1)), dtype=torch.long, device=device) + attention_mask = torch.cat([user_mask, assistant_mask], dim=1) + + outputs = self.llm_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + do_sample=do_sample, + top_p=top_p, + temperature=temperature, + num_beams=num_beams, + max_new_tokens=max_length, + min_length=min_length, + # pad_token_id=self.pad_token_id, + eos_token_id=self.eos_token_id, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + num_return_sequences=num_captions, + use_cache=True, + cache_implementation="hybrid" + ) + output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True) + output_text = [text.strip() for text in output_text] + # print(output_text) + return output_text diff --git a/BioReason-0813/model/blip2_stage2.py b/BioReason-0813/model/blip2_stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fafb04393d08e77b71839b1b6f0e3f1510a2fe --- /dev/null +++ b/BioReason-0813/model/blip2_stage2.py @@ -0,0 +1,365 @@ +import os +import torch +from model.blip2_opt import Blip2OPT +import pytorch_lightning as pl +from torch import optim +from lavis.common.optims import LinearWarmupCosineLRScheduler, LinearWarmupStepLRScheduler +import json +import torch.distributed as dist +# from peft import LoraConfig, TaskType +from typing import Any, Dict +from model.help_funcs import caption_evaluate, AttrDict +try: + from model.opt_flash_attention import replace_opt_attn_with_flash_attn, replace_opt_attn_with_original_attn +except ModuleNotFoundError: + pass + + +def get_module_state_dict(state_dict, module_name): + module_state_dict = {} + for key, value in state_dict.items(): + if key.startswith(module_name): + key = key[len(module_name) + 
1:] + if key == '': + return value + module_state_dict[key] = value + return module_state_dict + +class Blip2Stage2(pl.LightningModule): + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + # checkpoint.pop('optimizer_states') + to_be_removed = [] + for key, value in checkpoint['state_dict'].items(): + try: + if not self.get_parameter(key).requires_grad: + to_be_removed.append(key) + except AttributeError: + to_be_removed.append(key) + for key in to_be_removed: + checkpoint['state_dict'].pop(key) + + def __init__(self, args): + super().__init__() + if isinstance(args, dict): + args = AttrDict(**args) + + self.args = args + self.caption_eval_epoch = args.caption_eval_epoch + self.do_sample = args.do_sample + self.num_beams = args.num_beams + self.max_inference_len = args.max_inference_len + self.min_inference_len = args.min_inference_len + self.llm_tune = args.llm_tune + self.enable_flash = args.enable_flash + # if args.llm_name.find('galactica') >= 0: + self.blip2 = Blip2OPT(args.bert_name, + args.num_query_token, + args.cross_attention_freq, + args.plm_model, + args.plm_tune, + args.llm_name, + args.llm_tune, + args.qformer_tune, + args.peft_dir, + args) + # else: + # raise NotImplementedError() + self.save_hyperparameters(args) + + def load_from_stage1_checkpoint(self, path): + ckpt = torch.load(path, map_location='cpu') + state_dict = ckpt['state_dict'] + state_dict = {k.split('blip2qformer.')[1]:v for k, v in state_dict.items()} + self.blip2.load_state_dict(state_dict, strict=False) + return self + + def configure_optimizers(self): + self.trainer.fit_loop.setup_data() + warmup_steps = min(len(self.trainer.train_dataloader), self.args.warmup_steps) + optimizer = optim.AdamW(self.parameters(), lr=self.args.init_lr, weight_decay=self.args.weight_decay) + if self.args.scheduler == 'linear_warmup_cosine_lr': + self.scheduler = LinearWarmupCosineLRScheduler(optimizer, self.args.max_epochs, self.args.min_lr, self.args.init_lr, warmup_steps, 
self.args.warmup_lr) + elif self.args.scheduler == 'linear_warmup_step_lr': + self.scheduler = LinearWarmupStepLRScheduler(optimizer, self.args.max_epochs, self.args.min_lr, self.args.init_lr, self.args.lr_decay_rate, self.args.warmup_lr, warmup_steps) + elif self.args.scheduler == 'None': + self.scheduler = None + else: + raise NotImplementedError() + return optimizer + + def save_predictions(self, predictions, targets, q_types=None, log_prefix=''): + assert len(predictions) == len(targets) + if log_prefix: + name = f'{log_prefix}_predictions.txt' + else: + name = 'predictions.txt' + with open(os.path.join(self.logger.log_dir, name), 'w', encoding='utf8') as f: + if q_types is not None: + for p, t, q in zip(predictions, targets, q_types): + line = {'prediction': p, 'target': t, 'q_type': q} + f.write(json.dumps(line, ensure_ascii=True) + '\n') + else: + for p, t in zip(predictions, targets): + line = {'prediction': p, 'target': t} + f.write(json.dumps(line, ensure_ascii=True) + '\n') + + def on_validation_epoch_start(self) -> None: + if self.enable_flash: + replace_opt_attn_with_original_attn() + self.saved_dict_list = [] + self.prediction_list0 = [] + self.target_list0 = [] + self.prediction_list1 = [] + self.target_list1 = [] + + @torch.no_grad() + def validation_step(self, batch, batch_idx, dataloader_idx=0): + prot_batch, prompt_batch, target_dict = batch + if (dataloader_idx % 2) == 0: + # text_batch = batch[-1] + # batch_size = text_batch.input_ids.shape[0] + batch_size = len(target_dict['targets']) # ✅ 正确获取batch大小 + loss = self.blip2(batch) + ###============== Overall Loss ===================### + self.log(f"dataloader{dataloader_idx}/val loss", float(loss), batch_size=batch_size, sync_dist=True) + elif (dataloader_idx % 2) == 1: + if (self.current_epoch+1) % self.caption_eval_epoch != 0: + return + # prot_batch, prompt_batch, target_dict = batch + ###============== Captioning Results ===================### + samples = {'prot_batch': prot_batch, 
'prompt_batch': prompt_batch} + predictions = self.blip2.generate( + samples, + do_sample=self.do_sample, + num_beams=self.num_beams, + max_length=self.max_inference_len, + min_length=self.min_inference_len + ) + target_dict['predictions'] = predictions + self.saved_dict_list.append(target_dict) + + def gather_dict_results(self, dict_list): + list_of_dict_list = [None for _ in range(self.trainer.world_size)] + dist.all_gather_object(list_of_dict_list, dict_list) + dict_list = [i for ii in list_of_dict_list for i in ii] ## dict list, each dict has values that are lists of predictions, etc. + keys = dict_list[0].keys() + gathered_dict = {} # each value is a list of predictions, etc. + for key in keys: + gathered_dict[key] = [i for d in dict_list for i in d[key]] + dict_list = [] + for i in range(len(gathered_dict['predictions'])): + d = {k:gathered_dict[k][i] for k in keys} + dict_list.append(d) + return dict_list + + def save_results(self, dict_list, log_prefix=""): + ## save the results + if log_prefix: + name = f'results/{log_prefix}_predictions.txt' + else: + name = 'predictions.txt' + with open(name, 'w', encoding='utf8') as f: + for d in dict_list: + f.write(json.dumps(d, ensure_ascii=True) + '\n') + + def on_validation_epoch_end(self): + if self.enable_flash: + replace_opt_attn_with_flash_attn() + if (self.current_epoch+1) % self.caption_eval_epoch != 0: + return + result_list = self.gather_dict_results(self.saved_dict_list) + ## empty cache + self.saved_dict_list = [] + + if self.global_rank == 0: + # 假设 args.filename = 'stage2_continue_deeplocmulti_07241522' + filename_parts = self.args.filename.split('_') + # 获取最后两部分并组合 + new_filename = '_'.join(filename_parts[-2:]) # 得到 'deeplocmulti_07241522' + self.save_results(result_list, new_filename) + all_predictions = [i['predictions'] for i in result_list] + all_targets = [i['targets'] for i in result_list] + + log_prefix = 'dataset0' ## fixme: this is just a placeholder + if 'q_types' in result_list[0]: + ## 
evaluate protein qa + pass + else: + ## evaluate captioning + bleu2, bleu4, rouge_1, rouge_2, rouge_l, meteor_score = \ + caption_evaluate(all_predictions, all_targets, self.blip2.llm_tokenizer, self.max_inference_len) + acc = evaluate_exact_match(all_predictions, all_targets) + self.log(f"{log_prefix}/acc", acc, sync_dist=False) + self.log(f"{log_prefix}/bleu2", bleu2, sync_dist=False) + self.log(f"{log_prefix}/bleu4", bleu4, sync_dist=False) + self.log(f"{log_prefix}/rouge_1", rouge_1, sync_dist=False) + self.log(f"{log_prefix}/rouge_2", rouge_2, sync_dist=False) + self.log(f"{log_prefix}/rouge_l", rouge_l, sync_dist=False) + self.log(f"{log_prefix}/meteor_score", meteor_score, sync_dist=False) + + @torch.no_grad() + def validation_step_old(self, batch, batch_idx, dataloader_idx=0): + if (dataloader_idx % 2) == 0: + text_batch = batch[-1] + batch_size = text_batch.input_ids.shape[0] + loss = self.blip2(batch) + ###============== Overall Loss ===================### + self.log(f"dataloader{dataloader_idx}/val loss", float(loss), batch_size=batch_size, sync_dist=True) + elif (dataloader_idx % 2) == 1: + if (self.current_epoch+1) % self.caption_eval_epoch != 0: + return + prot_batch, prompt_batch, target_dict = batch + ###============== Captioning Results ===================### + samples = {'prot_batch': prot_batch, 'prompt_batch': prompt_batch} + predictions = self.blip2.generate( + samples, + do_sample=self.do_sample, + num_beams=self.num_beams, + max_length=self.max_inference_len, + min_length=self.min_inference_len + ) + if dataloader_idx // 2 == 0: + self.prediction_list0.append(predictions) + self.target_list0.append(target_dict) + elif dataloader_idx // 2 == 1: + self.prediction_list1.append(predictions) + self.target_list1.append(target_dict) + else: + raise NotImplementedError + else: + raise NotImplementedError + + def on_validation_epoch_end_old(self): + if self.enable_flash: + replace_opt_attn_with_flash_attn() + if (self.current_epoch+1) % 
self.caption_eval_epoch != 0: + return + predictions0 = [i for ii in self.prediction_list0 for i in ii] + targets0 = [i for ii in self.target_list0 for i in ii['answers']] + if 'q_types' in self.target_list0[0]: + q_types0 = [i for ii in self.target_list0 for i in ii['q_types']] + self.reduce_and_evaluate_qa(predictions0, targets0, q_types0, 'dataset0') + else: + self.reduce_and_evaluate_captioning(predictions0, targets0, 'dataset0') + + if len(self.prediction_list1) > 0: + predictions1 = [i for ii in self.prediction_list1 for i in ii] + targets1 = [i for ii in self.target_list1 for i in ii] + self.reduce_and_evaluate_captioning(predictions1, targets1, 'dataset1') + + def reduce_and_evaluate_qa(self, predictions, targets, q_types, log_prefix=""): + all_predictions = [None for _ in range(self.trainer.world_size)] + all_targets = [None for _ in range(self.trainer.world_size)] + all_q_types = [None for _ in range(self.trainer.world_size)] + dist.all_gather_object(all_predictions, predictions) + dist.all_gather_object(all_targets, targets) + dist.all_gather_object(all_q_types, q_types) + if self.global_rank == 0: + all_predictions = [i for ii in all_predictions for i in ii] + all_targets = [i for ii in all_targets for i in ii] + all_q_types = [i for ii in all_q_types for i in ii] + self.save_predictions(all_predictions, all_targets, all_q_types, log_prefix=log_prefix) + + def reduce_and_evaluate_captioning(self, predictions, targets, log_prefix=""): + all_predictions = [None for _ in range(self.trainer.world_size)] + all_targets = [None for _ in range(self.trainer.world_size)] + dist.all_gather_object(all_predictions, predictions) + dist.all_gather_object(all_targets, targets) + if self.global_rank == 0: + all_predictions = [i for ii in all_predictions for i in ii] + all_targets = [i for ii in all_targets for i in ii] + self.save_predictions(all_predictions, all_targets, log_prefix) + ## fixme: I am not sure if the max length is the same as previous experiments + 
bleu2, bleu4, rouge_1, rouge_2, rouge_l, meteor_score = \ + caption_evaluate(all_predictions, all_targets, self.blip2.llm_tokenizer, self.max_inference_len) + acc = evaluate_exact_match(all_predictions, all_targets) + self.log(f"{log_prefix}/acc", acc, sync_dist=False) + self.log(f"{log_prefix}/bleu2", bleu2, sync_dist=False) + self.log(f"{log_prefix}/bleu4", bleu4, sync_dist=False) + self.log(f"{log_prefix}/rouge_1", rouge_1, sync_dist=False) + self.log(f"{log_prefix}/rouge_2", rouge_2, sync_dist=False) + self.log(f"{log_prefix}/rouge_l", rouge_l, sync_dist=False) + self.log(f"{log_prefix}/meteor_score", meteor_score, sync_dist=False) + + def training_step(self, batch, batch_idx): + if self.scheduler: + self.scheduler.step(self.trainer.current_epoch, self.trainer.global_step) + + #batch_size = batch[-1].input_ids.size(0) + batch_size = len(batch[-1]['targets']) + ###============== Overall Loss ===================### + loss = self.blip2(batch) + self.log("loss", float(loss), batch_size=batch_size, sync_dist=True) + self.log("lr", self.trainer.optimizers[0].param_groups[0]['lr'], batch_size=batch_size, sync_dist=True) + return loss + + @staticmethod + def add_model_specific_args(parent_parser): + parser = parent_parser.add_argument_group("ProtBlip2") + # train mode + parser.add_argument('--save_every_n_epochs', type=int, default=0) + + # Bert + parser.add_argument('--bert_name', type=str, default='/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft') + parser.add_argument('--cross_attention_freq', type=int, default=2) + parser.add_argument('--num_query_token', type=int, default=8) + parser.add_argument('--qformer_tune',type=str,default='train') + # OPT + parser.add_argument('--llm_name', type=str, default="/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged") + parser.add_argument('--num_beams', type=int, default=5) + parser.add_argument('--do_sample', action='store_true', default=False) + 
parser.add_argument('--max_inference_len', type=int, default=512) + parser.add_argument('--min_inference_len', type=int, default=1) + parser.add_argument('--llm_tune', type=str, default='freeze') + parser.add_argument('--peft_config', type=str, default='') + parser.add_argument('--peft_dir', type=str, default='') + + ## plm model + parser.add_argument('--plm_model', type=str, default='/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m') + parser.add_argument('--plm_tune', type=str, default='freeze') + + ## lora config + parser.add_argument('--lora_r', type=int, default=8) + parser.add_argument('--lora_alpha', type=int, default=16) + parser.add_argument('--lora_dropout', type=int, default=0.1) + parser.add_argument('--enbale_gradient_checkpointing', action='store_true', default=False) + + # optimization + parser.add_argument('--weight_decay', type=float, default=0.05, help='optimizer weight decay') + parser.add_argument('--init_lr', type=float, default=1e-4, help='optimizer init learning rate') + parser.add_argument('--min_lr', type=float, default=1e-5, help='optimizer min learning rate') + parser.add_argument('--warmup_lr', type=float, default=1e-6, help='optimizer warmup learning rate') + parser.add_argument('--warmup_steps', type=int, default=1000, help='optimizer warmup steps') + parser.add_argument('--lr_decay_rate', type=float, default=0.9, help='optimizer lr decay rate') + parser.add_argument('--scheduler', type=str, default='linear_warmup_cosine_lr', help='type of scheduler') # or linear_warmup_step_lr + parser.add_argument('--stage1_path', type=str, default='') + parser.add_argument('--stage2_path', type=str, default='') + parser.add_argument('--init_checkpoint', type=str, default='/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/converted.ckpt') + parser.add_argument('--caption_eval_epoch', type=int, default=5) + return parent_parser + + + +# def evaluate_exact_match(predictions, targets): +# acc = 0 
+# for prediction, target in zip(predictions, targets): +# if prediction.strip() == target.strip(): +# acc += 1 +# acc = round(acc / len(predictions) * 100, 2) +# return acc + +import re + +def evaluate_exact_match(predictions, targets): + acc = 0 + for prediction, target in zip(predictions, targets): + # 使用正则提取 ... 中的内容 + match = re.search(r"(.*?)", target.strip(), re.DOTALL) + if match: + answer = match.group(1).strip() + if prediction.strip() == answer: + acc += 1 + else: + print(f"Warning: No tag found in target: {target}") + acc = round(acc / len(predictions) * 100, 2) + return acc diff --git a/BioReason-0813/model/help_funcs.py b/BioReason-0813/model/help_funcs.py new file mode 100644 index 0000000000000000000000000000000000000000..b57a64f8045e2824c90da07a601b826224d2394b --- /dev/null +++ b/BioReason-0813/model/help_funcs.py @@ -0,0 +1,112 @@ +import torch +from nltk.translate.bleu_score import corpus_bleu +from nltk.translate.meteor_score import meteor_score +from rouge_score import rouge_scorer +from tqdm import tqdm +import numpy as np + + +def caption_evaluate(predictions, targets, tokenizer, text_trunc_length): + targets = [t.strip() for t in targets] + meteor_scores = [] + references = [] + hypotheses = [] + for gt, out in tqdm(zip(targets, predictions)): + gt_tokens = tokenizer.tokenize(gt, truncation=True, max_length=text_trunc_length, + padding='max_length') + ## added for galactica + gt_tokens = list(filter(('').__ne__, gt_tokens)) + gt_tokens = list(filter(('[PAD]').__ne__, gt_tokens)) + gt_tokens = list(filter(('[CLS]').__ne__, gt_tokens)) + gt_tokens = list(filter(('[SEP]').__ne__, gt_tokens)) + + out_tokens = tokenizer.tokenize(out, truncation=True, max_length=text_trunc_length, + padding='max_length') + out_tokens = list(filter(('').__ne__, out_tokens)) + gt_tokens = list(filter(('[PAD]').__ne__, gt_tokens)) + out_tokens = list(filter(('[CLS]').__ne__, out_tokens)) + out_tokens = list(filter(('[SEP]').__ne__, out_tokens)) + + 
references.append([gt_tokens]) + hypotheses.append(out_tokens) + + mscore = meteor_score([gt_tokens], out_tokens) + meteor_scores.append(mscore) + + bleu2 = corpus_bleu(references, hypotheses, weights=(.5,.5)) + bleu4 = corpus_bleu(references, hypotheses, weights=(.25,.25,.25,.25)) + bleu2 *= 100 + bleu4 *= 100 + + print('BLEU-2 score:', bleu2) + print('BLEU-4 score:', bleu4) + _meteor_score = np.mean(meteor_scores) + _meteor_score *= 100 + print('Average Meteor score:', _meteor_score) + + scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL']) + + rouge_scores = [] + + references = [] + hypotheses = [] + + for gt, out in tqdm(zip(targets, predictions)): + rs = scorer.score(out, gt) + rouge_scores.append(rs) + + print('ROUGE score:') + rouge_1 = np.mean([rs['rouge1'].fmeasure for rs in rouge_scores]) * 100 + rouge_2 = np.mean([rs['rouge2'].fmeasure for rs in rouge_scores]) * 100 + rouge_l = np.mean([rs['rougeL'].fmeasure for rs in rouge_scores]) * 100 + print('rouge1:', rouge_1) + print('rouge2:', rouge_2) + print('rougeL:', rouge_l) + return bleu2, bleu4, rouge_1, rouge_2, rouge_l, _meteor_score + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def pad_and_concat(tensor_list, fill_value=0): + ''' + concat the first dimension and pad the second dimension + tensor_list: [[B (diff), N_num, *], ...] 
+ ''' + device = tensor_list[0].device + dtype=tensor_list[0].dtype + max_dim1 = max(t.shape[1] for t in tensor_list) + sum_dim0 = sum(t.shape[0] for t in tensor_list) + if len(tensor_list[0].shape) == 3: + out = torch.full((sum_dim0, max_dim1, tensor_list[0].shape[-1]), fill_value=fill_value, device=device, dtype=dtype) + i = 0 + for t in tensor_list: + out[i:i+t.shape[0], :t.shape[1]] = t + i += t.shape[0] + return out + elif len(tensor_list[0].shape) == 2: + out = torch.full((sum_dim0, max_dim1), fill_value=fill_value, device=device, dtype=dtype) + i = 0 + for t in tensor_list: + out[i:i+t.shape[0], :t.shape[1]] = t + i += t.shape[0] + return out + raise NotImplementedError() + + +def hf_enable_gradient_checkpointing(hf_model): + if hasattr(hf_model, "enable_input_require_grads"): + hf_model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + hf_model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # enable gradient checkpointing for memory efficiency + hf_model.gradient_checkpointing_enable() + return hf_model \ No newline at end of file diff --git a/BioReason-0813/prompt_templates.py b/BioReason-0813/prompt_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..ef341723846ace96d9dc54ff4fb79499e180327f --- /dev/null +++ b/BioReason-0813/prompt_templates.py @@ -0,0 +1,57 @@ +prompt_templates = { + "classification": """ +Analyze the following protein sequence and predict its classification. + +Protein sequence: {aa_seq} + +Please provide your reasoning and classification. + + +Let me analyze this protein sequence step by step: +1. Sequence length: {seq_length} +2. Composition analysis... +3. Structural predictions... +4. Functional domains... 
+ + + +Classification: {label} + +""", + "function_prediction": """ +Given the protein sequence below, predict its function and classification: + +Sequence: {aa_seq} + +Analyze the sequence and provide your prediction. + + +Sequence analysis: +- Length: {seq_length} amino acids +- Notable features... +- Homology considerations... + + + +Function prediction: {label} + +""", + "location_prediction": """ +Predict the cellular location and classification of this protein: + +Protein sequence: {aa_seq} + +What is the most likely classification for this protein? + + +Location and function analysis: +- Sequence characteristics... +- Signal peptides... +- Transmembrane regions... + + + +Classification: {label} + +""" +} diff --git a/BioReason-0813/run.sh b/BioReason-0813/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..a9271f3f32e25ef093ebb9d6554ffb34b4911b10 --- /dev/null +++ b/BioReason-0813/run.sh @@ -0,0 +1,103 @@ +echo "Starting GRPO training..." + +#!/bin/bash +# run_blip2.sh +# 用于启动 BLIP2 + GRPO 训练的脚本 + +# ===== 基本路径配置 ===== +DATA_FILE=/oss/wangyujia/ProtT3/ProtT3/data/sft/dataset/DeepLocBinary/test.csv +DATASET_NAME=deeplocbinary +OUTPUT_DIR=./output +CACHE_DIR=./cache + +# ===== 模型配置 ===== +BERT_PATH=/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +PLM_MODEL=/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +LLM_MODEL=/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged +SFT_CHECKPOINT=/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07301646_2datasets_construct/epoch=09.ckpt/converted.ckpt +# ===== 训练参数 ===== +BATCH_SIZE=4 +EPOCHS=3 +LR=1e-5 + +# ===== 奖励函数权重 ===== +FORMAT_WEIGHT=0.2 +ACCURACY_WEIGHT=0.6 +REPETITION_WEIGHT=0.2 + +# ===== 运行训练 ===== +python blips_reason.py \ + --data_file_paths ${DATA_FILE} \ + --dataset_name ${DATASET_NAME} \ + --reward_funcs combined \ + --format_weight ${FORMAT_WEIGHT} \ + --accuracy_weight ${ACCURACY_WEIGHT} \ + --repetition_weight 
${REPETITION_WEIGHT} \ + --use_custom_prompts \ + --template_name classification \ + --max_seq_length 1000 \ + --output_dir ${OUTPUT_DIR} \ + --per_device_train_batch_size ${BATCH_SIZE} \ + --num_train_epochs ${EPOCHS} \ + --learning_rate ${LR} \ + --bert_name ${BERT_PATH} \ + --plm_model ${PLM_MODEL} \ + --llm_name ${LLM_MODEL} \ + --sft_checkpoint ${SFT_CHECKPOINT} \ + --plm_tune freeze \ + --llm_tune lora \ + --qformer_tune train \ + --lora_r 8 \ + --lora_alpha 16 \ + --lora_dropout 0.1 \ + --enable_flash \ + --cache_dir ${CACHE_DIR} + + + +# python protein_reason.py \ +# --output_dir "./grpo_outputs" \ +# --model_name_or_path "Qwen/Qwen3-0.6B" \ +# --protein_model_name_or_path "facebook/esm2_t6_8M_UR50D" \ +# --qformer_model_name_or_path "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext" \ +# --dataset_name "wanglab/protein_function" \ +# --sft_checkpoint "./checkpoints/best_model" \ +# --per_device_train_batch_size 4 \ +# --gradient_accumulation_steps 4 \ +# --num_train_epochs 3 \ +# --learning_rate 1e-6 \ +# --beta 0.04 \ +# --temperature 0.6 \ +# --top_p 0.95 \ +# --top_k 20 \ +# --max_completion_length 800 \ +# --num_generations 8 \ +# --reward_funcs "xmlcount" "soft_format" "strict_format" "correctness" \ +# --lora_r 32 \ +# --lora_alpha 64 \ +# --lora_dropout 0.05 \ +# --freeze_protein_modules \ +# --logging_steps 2 \ +# --eval_strategy "steps" \ +# --eval_steps 100 \ +# --save_steps 200 \ +# --report_to "wandb" \ +# --log_completions + +# python blip2_reason.py \ +# --data_file_paths /oss/wangyujia/ProtT3/ProtT3/data/sft/dataset/DeepLocBinary/test.csv \ +# --reward_funcs combined \ +# --format_weight 0.2 \ +# --accuracy_weight 0.6 \ +# --repetition_weight 0.2 \ +# --use_custom_prompts \ +# --template_name classification \ +# --max_seq_length 1000 \ +# --output_dir ./output \ +# --per_device_train_batch_size 4 \ +# --num_train_epochs 3 \ +# --learning_rate 1e-5 + +echo "GRPO training completed!" 
+ +echo "All training stages completed successfully!" \ No newline at end of file diff --git a/BioReason-main/.gitignore b/BioReason-main/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2f71456cd6de4080581e4b307624b99b329a3f0d --- /dev/null +++ b/BioReason-main/.gitignore @@ -0,0 +1,180 @@ +# Byte-compiled / optimized / DLL files +.idea/ +__pycache__/ +*.py[cod] +*$py.class +wandb/ +.DS_Store +.vscode/ +.venv/ +.env +.pytest_cache/ + +# C extensions +*.so + +outputs/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# PyPI configuration file +.pypirc diff --git a/BioReason-main/LICENSE b/BioReason-main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/BioReason-main/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/BioReason-main/README.md b/BioReason-main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3d14cc899fab79ca2ef897d273da0d4c84701ffd --- /dev/null +++ b/BioReason-main/README.md @@ -0,0 +1,148 @@ +

+🧬 BioReason
Incentivizing Multimodal Biological Reasoning
within a DNA-LLM Model +

+ +

+ arXiv + GitHub + Website + HuggingFace Dataset +

+ +
+ +## Updates [Jun 10, 2025] +- We are integrating vLLM to improve the speed and efficiency of the GRPO pipeline. We expect this to be pushed by the end of the week. +- Checkpoints, along with the custom DNA-LLM model class, will be released on HuggingFace by the end of the week. +- More training results with GRPO will be shared soon. + +
+ +## Abstract + +Unlocking deep, interpretable biological reasoning from complex genomic data is a major AI challenge hindering scientific discovery. Current DNA foundation models, despite strong sequence representation, struggle with multi-step reasoning and lack inherent transparent, biologically intuitive explanations. We introduce BioReason, a pioneering architecture that, for the first time, deeply integrates a DNA foundation model with a large language model (LLM). This novel connection enables the LLM to directly process and reason with genomic information as a fundamental input, fostering a new form of multimodal biological understanding. BioReason's sophisticated multi-step reasoning is developed through supervised fine-tuning and targeted reinforcement learning, guiding the system to generate logical, biologically coherent deductions. On biological reasoning benchmarks including KEGG-based disease pathway prediction—where accuracy improves from 88% to 97%—and variant effect prediction, BioReason demonstrates an average 15% performance gain over strong single-modality baselines. + +
+ +## Key Contributions + +• **Novel multimodal architecture**: The first successful integration of a DNA foundation model with an LLM, establishing a new methodology for AI-driven biological studies. + +• **Advanced reasoning methodology**: A systematic training approach combining supervised fine-tuning and reinforcement learning that incentivizes multi-step biological reasoning. + +• **New biological reasoning benchmarks**: Development and curation of novel benchmarks for evaluating biological reasoning capabilities, including an annotated reasoning dataset for gene pathway and disease prediction from KEGG. + +• **Empirical performance improvements**: Demonstration that BioReason outperforms both DNA foundation models and LLMs used independently or in simple combination, with average performance gains of 15%+ over baseline. + +• **Interpretable reasoning traces**: A mechanism for generating step-by-step biological reasoning traces that provide interpretable predictions, enhancing scientific insight and hypothesis generation. + +
+ +## Datasets + +The datasets used to train and evaluate BioReason can be found on our [HuggingFace collection](https://huggingface.co/collections/wanglab/bioreason-683cd17172a037a31d208f70) with detailed download and usage instructions. + +
+ +## Checkpoints + +We will release the checkpoints soon! + +
+ +## Installation + +### Prerequisites +- Python 3.11+ +- CUDA/GPU for best performance + +### Installation Steps +```bash +# Clone the repository +git clone https://github.com/bowang-lab/BioReason.git +cd BioReason + +# Install package +pip install -e . +``` + +
+ +## Results + +### KEGG-Derived Biological Reasoning Task +Performance comparison on 290 test datapoints for multi-step mechanistic reasoning: + +| Model | Accuracy | F1-Score | Precision | Recall | +|-------|----------|----------|-----------|---------| +| [DNA] NT - 500M | 86.55 | 69.76 | 73.23 | 66.61 | +| [DNA] Evo2 - 1B | 88.28 | 72.43 | 75.23 | 69.83 | +| [LLM] Qwen3 - 1B | 85.17 | 65.71 | 71.39 | 64.19 | +| [LLM] Qwen3 - 4B | 93.48 | 85.44 | 88.31 | 86.72 | +| [DNA-LLM] NT + Qwen3 - 1B | 88.42 | 72.13 | 75.42 | 71.91 | +| [DNA-LLM] NT + Qwen3 - 1B (+RL) | 89.66 | 74.11 | 78.82 | 72.96 | +| [DNA-LLM] NT + Qwen3 - 4B | 96.90 | **89.03** | **90.99** | **89.38** | +| [DNA-LLM] Evo2 + Qwen3 - 1B | 90.42 | 75.62 | 77.42 | 73.91 | +| [DNA-LLM] Evo2 + Qwen3 - 4B | **97.24** | 86.30 | 86.75 | 87.25 | + +### Variant Effect Prediction Benchmarks +Performance on pathogenic/benign classification: + +| Model | Variant Effect - Coding | | Variant Effect - Non-SNV | | +|-------|------------|----------|------------|----------| +| | Accuracy | F1-Score | Accuracy | F1-Score | +| [DNA] NT - 500M | 60.91 | 45.20 | 67.93 | 65.97 | +| [DNA] Evo2 - 1B | 70.07 | 49.19 | 76.17 | 66.51 | +| [LLM] Qwen3 - 1B | 46.55 | 34.82 | 70.67 | 76.21 | +| [LLM] Qwen3 - 4B | 48.99 | 39.58 | 61.86 | 67.60 | +| [DNA-LLM] NT + Qwen3 - 1B | 55.58 | 54.50 | 72.82 | 76.93 | +| [DNA-LLM] NT + Qwen3 - 4B | 60.94 | 55.66 | 65.59 | 73.00 | +| [DNA-LLM] Evo2 + Qwen3 - 1B | 72.83 | 68.90 | **88.20** | **89.91** | +| [DNA-LLM] Evo2 + Qwen3 - 4B | **80.21** | **80.00** | 83.85 | 85.02 | + +
+ +## Citation + +If you find this work useful, please cite our paper: + +```bibtex +@misc{fallahpour2025bioreasonincentivizingmultimodalbiological, + title={BioReason: Incentivizing Multimodal Biological Reasoning within a DNA-LLM Model}, + author={Adibvafa Fallahpour and Andrew Magnuson and Purav Gupta and Shihao Ma and Jack Naimer and Arnav Shah and Haonan Duan and Omar Ibrahim and Hani Goodarzi and Chris J. Maddison and Bo Wang}, + year={2025}, + eprint={2505.23579}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.23579}, +} +``` + +
+ +## Authors + +- **Adibvafa Fallahpour**¹²³⁵ * (adibvafa.fallahpour@mail.utoronto.ca) +- **Andrew Magnuson**¹² * +- **Purav Gupta**¹² * +- **Shihao Ma**¹²³ +- **Jack Naimer**¹²³ +- **Arnav Shah**¹²³ +- **Haonan Duan**¹² +- **Omar Ibrahim**³ +- **Hani Goodarzi**†⁴⁶ +- **Chris J. Maddison**†¹²⁷ +- **Bo Wang**†¹²³ + +¹ University of Toronto ² Vector Institute ³ University Health Network (UHN)
+⁴ Arc Institute ⁵ Cohere ⁶ University of California, San Francisco ⁷ Google DeepMind + +
+* Equal contribution
+† Equal advising + +--- + +

+Made with ❤️ at University of Toronto, Vector Institute, and University Health Network +

diff --git a/BioReason-main/bioreason.egg-info/PKG-INFO b/BioReason-main/bioreason.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..1b0f156459433672bd6d5f2976f0fe642559e568 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/PKG-INFO @@ -0,0 +1,181 @@ +Metadata-Version: 2.4 +Name: bioreason +Version: 0.1.0 +Summary: Bio-related Reasoning with Language Models +License: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.11 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: torch +Requires-Dist: torchvision +Requires-Dist: transformers +Requires-Dist: accelerate +Requires-Dist: qwen-vl-utils +Requires-Dist: jupyter +Requires-Dist: datasets +Requires-Dist: peft +Requires-Dist: pytorch_lightning +Requires-Dist: wandb +Requires-Dist: trl[vllm] +Requires-Dist: bitsandbytes +Requires-Dist: deepspeed +Provides-Extra: dev +Requires-Dist: pytest; extra == "dev" +Requires-Dist: black; extra == "dev" +Requires-Dist: isort; extra == "dev" +Requires-Dist: mypy; extra == "dev" +Dynamic: license-file + +

+🧬 BioReason
Incentivizing Multimodal Biological Reasoning
within a DNA-LLM Model +

+ +

+ arXiv + GitHub + Website + HuggingFace Dataset +

+ +
+ +## Updates [Jun 10, 2025] +- We are integrating vLLM to improve the speed and efficiency of the GRPO pipeline. We expect this to be pushed by end of week. +- Checkpoints along with the custom DNA-LLM model class will be released on HuggingFace by end of week. +- More training results with GRPO will be shared soon. + +
+ +## Abstract + +Unlocking deep, interpretable biological reasoning from complex genomic data is a major AI challenge hindering scientific discovery. Current DNA foundation models, despite strong sequence representation, struggle with multi-step reasoning and lack inherent transparent, biologically intuitive explanations. We introduce BioReason, a pioneering architecture that, for the first time, deeply integrates a DNA foundation model with a large language model (LLM). This novel connection enables the LLM to directly process and reason with genomic information as a fundamental input, fostering a new form of multimodal biological understanding. BioReason's sophisticated multi-step reasoning is developed through supervised fine-tuning and targeted reinforcement learning, guiding the system to generate logical, biologically coherent deductions. On biological reasoning benchmarks including KEGG-based disease pathway prediction—where accuracy improves from 88% to 97%—and variant effect prediction, BioReason demonstrates an average 15% performance gain over strong single-modality baselines. + +
+ +## Key Contributions + +• **Novel multimodal architecture**: The first successful integration of a DNA foundation model with an LLM, establishing a new methodology for AI-driven biological studies. + +• **Advanced reasoning methodology**: A systematic training approach combining supervised fine-tuning and reinforcement learning that incentivizes multi-step biological reasoning. + +• **New biological reasoning benchmarks**: Development and curation of novel benchmarks for evaluating biological reasoning capabilities, including an annotated reasoning dataset for gene pathway and disease prediction from KEGG. + +• **Empirical performance improvements**: Demonstration that BioReason outperforms both DNA foundation models and LLMs used independently or in simple combination, with average performance gains of 15%+ over baseline. + +• **Interpretable reasoning traces**: A mechanism for generating step-by-step biological reasoning traces that provide interpretable predictions, enhancing scientific insight and hypothesis generation. + +
+ +## Datasets + +The datasets used to train and evaluate BioReason can be found on our [HuggingFace collection](https://huggingface.co/collections/wanglab/bioreason-683cd17172a037a31d208f70) with detailed download and usage instructions. + +
+ +## Checkpoints + +We will release the checkpoints soon! + +
+ +## Installation + +### Prerequisites +- Python 3.11+ +- CUDA/GPU for best performance + +### Installation Steps +```bash +# Clone the repository +git clone https://github.com/bowang-lab/BioReason.git +cd BioReason + +# Install package +pip install -e . +``` + +
+ +## Results + +### KEGG-Derived Biological Reasoning Task +Performance comparison on 290 test datapoints for multi-step mechanistic reasoning: + +| Model | Accuracy | F1-Score | Precision | Recall | +|-------|----------|----------|-----------|---------| +| [DNA] NT - 500M | 86.55 | 69.76 | 73.23 | 66.61 | +| [DNA] Evo2 - 1B | 88.28 | 72.43 | 75.23 | 69.83 | +| [LLM] Qwen3 - 1B | 85.17 | 65.71 | 71.39 | 64.19 | +| [LLM] Qwen3 - 4B | 93.48 | 85.44 | 88.31 | 86.72 | +| [DNA-LLM] NT + Qwen3 - 1B | 88.42 | 72.13 | 75.42 | 71.91 | +| [DNA-LLM] NT + Qwen3 - 1B (+RL) | 89.66 | 74.11 | 78.82 | 72.96 | +| [DNA-LLM] NT + Qwen3 - 4B | 96.90 | **89.03** | **90.99** | **89.38** | +| [DNA-LLM] Evo2 + Qwen3 - 1B | 90.42 | 75.62 | 77.42 | 73.91 | +| [DNA-LLM] Evo2 + Qwen3 - 4B | **97.24** | 86.30 | 86.75 | 87.25 | + +### Variant Effect Prediction Benchmarks +Performance on pathogenic/benign classification: + +| Model | Variant Effect - Coding | | Variant Effect - Non-SNV | | +|-------|------------|----------|------------|----------| +| | Accuracy | F1-Score | Accuracy | F1-Score | +| [DNA] NT - 500M | 60.91 | 45.20 | 67.93 | 65.97 | +| [DNA] Evo2 - 1B | 70.07 | 49.19 | 76.17 | 66.51 | +| [LLM] Qwen3 - 1B | 46.55 | 34.82 | 70.67 | 76.21 | +| [LLM] Qwen3 - 4B | 48.99 | 39.58 | 61.86 | 67.60 | +| [DNA-LLM] NT + Qwen3 - 1B | 55.58 | 54.50 | 72.82 | 76.93 | +| [DNA-LLM] NT + Qwen3 - 4B | 60.94 | 55.66 | 65.59 | 73.00 | +| [DNA-LLM] Evo2 + Qwen3 - 1B | 72.83 | 68.90 | **88.20** | **89.91** | +| [DNA-LLM] Evo2 + Qwen3 - 4B | **80.21** | **80.00** | 83.85 | 85.02 | + +
+ +## Citation + +If you find this work useful, please cite our paper: + +```bibtex +@misc{fallahpour2025bioreasonincentivizingmultimodalbiological, + title={BioReason: Incentivizing Multimodal Biological Reasoning within a DNA-LLM Model}, + author={Adibvafa Fallahpour and Andrew Magnuson and Purav Gupta and Shihao Ma and Jack Naimer and Arnav Shah and Haonan Duan and Omar Ibrahim and Hani Goodarzi and Chris J. Maddison and Bo Wang}, + year={2025}, + eprint={2505.23579}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.23579}, +} +``` + +
+ +## Authors + +- **Adibvafa Fallahpour**¹²³⁵ * (adibvafa.fallahpour@mail.utoronto.ca) +- **Andrew Magnuson**¹² * +- **Purav Gupta**¹² * +- **Shihao Ma**¹²³ +- **Jack Naimer**¹²³ +- **Arnav Shah**¹²³ +- **Haonan Duan**¹² +- **Omar Ibrahim**³ +- **Hani Goodarzi**†⁴⁶ +- **Chris J. Maddison**†¹²⁷ +- **Bo Wang**†¹²³ + +¹ University of Toronto ² Vector Institute ³ University Health Network (UHN)
+⁴ Arc Institute ⁵ Cohere ⁶ University of California, San Francisco ⁷ Google DeepMind + +
+* Equal contribution
+† Equal advising + +--- + +

+Made with ❤️ at University of Toronto, Vector Institute, and University Health Network +

diff --git a/BioReason-main/bioreason.egg-info/SOURCES.txt b/BioReason-main/bioreason.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..391fc97522cd46e17ef8ed932a77e26cd6373052 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/SOURCES.txt @@ -0,0 +1,9 @@ +LICENSE +README.md +pyproject.toml +bioreason/__init__.py +bioreason.egg-info/PKG-INFO +bioreason.egg-info/SOURCES.txt +bioreason.egg-info/dependency_links.txt +bioreason.egg-info/requires.txt +bioreason.egg-info/top_level.txt \ No newline at end of file diff --git a/BioReason-main/bioreason.egg-info/dependency_links.txt b/BioReason-main/bioreason.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/BioReason-main/bioreason.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/BioReason-main/bioreason.egg-info/requires.txt b/BioReason-main/bioreason.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dee50c1b476cc5e7f85f7b8e996c57311f348d7 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/requires.txt @@ -0,0 +1,19 @@ +torch +torchvision +transformers +accelerate +qwen-vl-utils +jupyter +datasets +peft +pytorch_lightning +wandb +trl[vllm] +bitsandbytes +deepspeed + +[dev] +pytest +black +isort +mypy diff --git a/BioReason-main/bioreason.egg-info/top_level.txt b/BioReason-main/bioreason.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b9995054ed2dec4763dced82d34176d2c514f30 --- /dev/null +++ b/BioReason-main/bioreason.egg-info/top_level.txt @@ -0,0 +1 @@ +bioreason diff --git a/BioReason-main/bioreason/__init__.py b/BioReason-main/bioreason/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/BioReason-main/bioreason/dataset/__init__.py b/BioReason-main/bioreason/dataset/__init__.py new file 
mode 100644 index 0000000000000000000000000000000000000000..a318a901b205faa943fd38d702d1001effb1cdd0 --- /dev/null +++ b/BioReason-main/bioreason/dataset/__init__.py @@ -0,0 +1,11 @@ +from .kegg import KEGGDataset, split_kegg_dataset +from .utils import torch_to_hf_dataset, truncate_dna +from .variant_effect import get_format_variant_effect_function + +__all__ = [ + "KEGGDataset", + "split_kegg_dataset", + "torch_to_hf_dataset", + "truncate_dna", + "get_format_variant_effect_function", +] diff --git a/BioReason-main/bioreason/dataset/kegg.py b/BioReason-main/bioreason/dataset/kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..d721e79c9d06bfa3b713d0c0223ebc482ec43fee --- /dev/null +++ b/BioReason-main/bioreason/dataset/kegg.py @@ -0,0 +1,382 @@ +import json +import os +import random +import sys +import torch +from torch.utils.data import Dataset, DataLoader +from typing import Any, Dict, List, Tuple + +from bioreason.dataset.utils import torch_to_hf_dataset +from bioreason.models.dl.processing_dl import DLProcessor +from trl.data_utils import maybe_apply_chat_template + + +class KEGGDataset(Dataset): + """Dataset for KEGG data.""" + + def __init__(self, data_dir: str): + """ + Initialize the dataset by loading all JSON files from the given directory. + + Args: + data_dir: Path to the directory containing JSON files + """ + self.data_dir = data_dir + self.data = [] + + # Load all JSON files + json_files = sorted([f for f in os.listdir(data_dir) if f.endswith(".json")]) + + # Process each file + for filename in json_files: + file_path = os.path.join(data_dir, filename) + kegg_id = filename.split("_")[1] + + with open(file_path, "r", encoding="utf-8") as f: + item = json.load(f) + item["kegg_id"] = kegg_id + processed_item = self._process_item(item) + self.data.append(processed_item) + + def _process_item(self, item: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single data item to format fields as required. 
+ + Args: + item: Original data item from JSON + + Returns: + Processed data item + """ + # Extract question as is + question = item.get("question", "") + + # Convert answer to lowercase and strip whitespace + answer = item.get("answer", "").lower().strip() + + # Combine reasoning steps into a single paragraph with newlines + reasoning_steps = item.get("reasoning", {}).get("reasoning_steps", []) + reasoning = "\n".join(reasoning_steps) + + # Convert sequences to uppercase and strip whitespace + reference_sequence = item.get("reference_sequence", "").upper().strip() + variant_sequence = item.get("variant_sequence", "").upper().strip() + + return { + "question": question, + "answer": answer, + "reasoning": reasoning, + "reference_sequence": reference_sequence, + "variant_sequence": variant_sequence, + } + + def __len__(self) -> int: + """Return the number of items in the dataset.""" + return len(self.data) + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Return a specific item from the dataset.""" + return self.data[idx] + + +def split_kegg_dataset( + dataset: KEGGDataset, + train_ratio: float = 0.8, + val_ratio: float = 0.1, + test_ratio: float = 0.1, + seed: int = 42, +) -> Tuple[KEGGDataset, KEGGDataset, KEGGDataset]: + """ + Split a KEGG dataset into train, validation, and test sets. 
+ + Args: + dataset: The dataset to split + train_ratio: Proportion of data for training + val_ratio: Proportion of data for validation + test_ratio: Proportion of data for testing + batch_size: Batch size for the dataloaders + seed: Random seed for reproducibility + + Returns: + Tuple of (train_dataset, val_dataset, test_dataset) + """ + # Calculate the size of each split + dataset_size = len(dataset) + train_size = int(train_ratio * dataset_size) + val_size = int(val_ratio * dataset_size) + test_size = dataset_size - train_size - val_size + assert train_ratio + val_ratio + test_ratio == 1.0, "Ratios must sum to 1" + + # Set the random seed + torch.manual_seed(seed) + random.seed(seed) + + # Split the dataset + train_dataset, val_dataset, test_dataset = torch.utils.data.random_split( + dataset, [train_size, val_size, test_size] + ) + + return train_dataset, val_dataset, test_dataset + + +def create_kegg_dataloader( + data_dir: str, + batch_size: int = 2, + shuffle: bool = True, + num_workers: int = 2, + pin_memory: bool = True, +) -> DataLoader: + """ + Create a DataLoader for the KEGG dataset. + + Args: + data_dir: Path to the directory containing JSON files + batch_size: Batch size for the dataloader + shuffle: Whether to shuffle the data + num_workers: Number of worker processes for loading data + pin_memory: Whether to pin memory for faster data transfer + + Returns: + DataLoader for the KEGG dataset + """ + dataset = KEGGDataset(data_dir) + return DataLoader( + dataset, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + pin_memory=pin_memory, + ) + + +def get_format_kegg_function(model_name: str) -> Any: + """ + Get the appropriate format function for a given model name. 
+ """ + if model_name.lower() == "llm": + return format_kegg_for_llm + elif model_name.lower() == "dna-llm": + return format_kegg_for_dna_llm + else: + raise ValueError(f"Unsupported model name: {model_name}") + + +def format_kegg_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a KEGG example into the required chat format for DNA-LLM. + """ + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": example["question"].strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": example["reasoning"].strip(), + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + example["reference_sequence"], + example["variant_sequence"], + ], + "answer": example["answer"], + } + + +def format_kegg_for_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a KEGG example into the required chat format for LLM. + """ + question = f"Reference sequence: {example['reference_sequence']}\nVariant sequence: {example['variant_sequence']}\nQuestion: {example['question']}" + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": question.strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": example["reasoning"].strip(), + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + "", + "", + ], + "answer": example["answer"], + } + + +def qwen_dna_collate_fn( + examples: List[Dict], + processor: DLProcessor, + max_length_text: int, + max_length_dna: int, + return_answer_in_batch: bool = False, +) -> Dict: + """ + Custom collate function for Qwen DNA models. + + Creates a batch with proper labels for supervised fine-tuning where only + the assistant responses contribute to the loss calculation. 
+ """ + prompts_text = [ + maybe_apply_chat_template(example, processor)["prompt"] for example in examples + ] + batch_dna_sequences = [example["dna_sequences"] for example in examples] + + batch = processor( + text=prompts_text, + batch_dna_sequences=batch_dna_sequences, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + max_length_text=max_length_text, + max_length_dna=max_length_dna, + ) + + # Create labels tensor filled with -100 (ignored in loss calculation) + labels = torch.full_like(batch["input_ids"], -100) + + # Get token IDs for special markers + assistant_start_marker = "<|im_start|>assistant\n" + im_end_marker = "<|im_end|>" + + assistant_start_token_ids = processor.tokenizer.encode( + assistant_start_marker, add_special_tokens=False + ) + im_end_token_ids = processor.tokenizer.encode( + im_end_marker, add_special_tokens=False + ) + + # Convert token arrays to tensors for faster comparison + assistant_marker_tensor = torch.tensor( + assistant_start_token_ids, device=batch["input_ids"].device + ) + im_end_marker_tensor = torch.tensor( + im_end_token_ids, device=batch["input_ids"].device + ) + + # Get dimensions for easier reference + assistant_marker_len = len(assistant_start_token_ids) + im_end_marker_len = len(im_end_token_ids) + + # For each sequence in the batch + for i in range(batch["input_ids"].shape[0]): + input_ids = batch["input_ids"][i] + seq_len = input_ids.size(0) + + # Track assistant sections + assistant_sections = [] + + # Find all assistant start markers + start_positions = [] + for pos in range(seq_len - assistant_marker_len + 1): + if torch.all( + input_ids[pos : pos + assistant_marker_len] == assistant_marker_tensor + ): + start_positions.append( + pos + assistant_marker_len + ) # Store position after marker + + # Find all end markers + end_positions = [] + for pos in range(seq_len - im_end_marker_len + 1): + if torch.all( + input_ids[pos : pos + im_end_marker_len] == im_end_marker_tensor + ): 
+ end_positions.append(pos) # Store position at start of end marker + + # Match start and end markers to create sections + for start_pos in start_positions: + # Find the next end marker after this start position + valid_ends = [pos for pos in end_positions if pos > start_pos] + if valid_ends: + end_pos = min(valid_ends) # Take the first end marker after start + # Only include content between markers (not the markers themselves) + if start_pos < end_pos: + assistant_sections.append((start_pos, end_pos)) + else: + # If no end marker, assume the section runs to the end of the sequence + assistant_sections.append((start_pos, seq_len)) + + # Set labels for all identified assistant sections + for start_pos, end_pos in assistant_sections: + if start_pos < end_pos and start_pos < seq_len: + end_pos = min(end_pos, seq_len) # Safety check + labels[i, start_pos:end_pos] = input_ids[start_pos:end_pos] + + # Also mask padding tokens + labels[batch["input_ids"] == processor.tokenizer.pad_token_id] = -100 + + # Add labels to batch + batch["labels"] = labels + + # Add answer to batch + if return_answer_in_batch: + batch["answer"] = [example["answer"].strip() for example in examples] + + return batch + + +def dna_collate_fn( + batch: List[Dict[str, Any]], + dna_tokenizer: Any, + label2id: Dict[str, int], + max_length: int = 2048, +) -> Dict[str, Any]: + """ + Custom collate function for DNA models. 
+ """ + ref_sequences = [item["reference_sequence"] for item in batch] + alt_sequences = [item["variant_sequence"] for item in batch] + + # Tokenize DNA sequences separately + tokenized_ref = dna_tokenizer( + ref_sequences, + padding=True, + truncation=True, + max_length=max_length, + return_tensors="pt", + ) + + tokenized_alt = dna_tokenizer( + alt_sequences, + padding=True, + truncation=True, + max_length=max_length, + return_tensors="pt", + ) + + # Get labels + labels = [] + for item in batch: + label = label2id[item["answer"]] + labels.append(label) + + # Create labels tensor + labels_tensor = torch.tensor(labels, dtype=torch.long) + + tokenized_batch = { + "ref_ids": tokenized_ref.input_ids, + "ref_attention_mask": tokenized_ref.attention_mask, + "alt_ids": tokenized_alt.input_ids, + "alt_attention_mask": tokenized_alt.attention_mask, + "labels": labels_tensor, + } + + return tokenized_batch diff --git a/BioReason-main/bioreason/dataset/utils.py b/BioReason-main/bioreason/dataset/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..25488fe729ff5b34be0488ddc5fdf2dbd6cb9993 --- /dev/null +++ b/BioReason-main/bioreason/dataset/utils.py @@ -0,0 +1,59 @@ +from datasets import Dataset as HFDataset +from torch.utils.data import Dataset as TorchDataset +from typing import Dict, Any, Union, List + + +def truncate_dna( + example: Dict[str, Any], truncate_dna_per_side: int = 1024 +) -> Dict[str, Any]: + """ + Truncate DNA sequences by removing a specified number of base pairs from both ends. + If the sequence is too short, it will return the middle portion. + """ + for key in ["reference_sequence", "variant_sequence"]: + sequence = example[key] + seq_len = len(sequence) + + if seq_len > 2 * truncate_dna_per_side + 8: + example[key] = sequence[truncate_dna_per_side:-truncate_dna_per_side] + + return example + + +def torch_to_hf_dataset(torch_dataset: TorchDataset) -> HFDataset: + """ + Convert a PyTorch Dataset to a Hugging Face Dataset. 
+ + This function takes a PyTorch Dataset and converts it to a Hugging Face Dataset + by extracting all items and organizing them into a dictionary structure that + can be used to create a Hugging Face Dataset. + + Args: + torch_dataset: A PyTorch Dataset object to be converted + + Returns: + A Hugging Face Dataset containing the same data as the input PyTorch Dataset + """ + # Get first item to determine structure + if len(torch_dataset) == 0: + return HFDataset.from_dict({}) + + first_item = torch_dataset[0] + + # Initialize dictionary based on first item's keys + data_dict = ( + {k: [] for k in first_item.keys()} + if isinstance(first_item, dict) + else {"data": []} + ) + + # Populate dictionary + for i in range(len(torch_dataset)): + item = torch_dataset[i] + if isinstance(item, dict): + for k in data_dict: + data_dict[k].append(item[k]) + else: + data_dict["data"].append(item) + + return HFDataset.from_dict(data_dict) diff --git a/BioReason-main/bioreason/dataset/variant_effect.py b/BioReason-main/bioreason/dataset/variant_effect.py new file mode 100644 index 0000000000000000000000000000000000000000..f36b4a29943b9026cadfd2916fa4dc0e70f1722c --- /dev/null +++ b/BioReason-main/bioreason/dataset/variant_effect.py @@ -0,0 +1,98 @@ +import json +import os +import random +import sys +import torch +from torch.utils.data import Dataset, DataLoader +from typing import Any, Dict, List, Tuple + +from bioreason.dataset.utils import torch_to_hf_dataset +from bioreason.models.dl.processing_dl import DLProcessor +from trl.data_utils import maybe_apply_chat_template + + +def get_format_variant_effect_function(model_name: str) -> Any: + """ + Get the appropriate format function for a given model name. 
+ """ + if model_name.lower() == "llm": + return format_variant_effect_for_llm + elif model_name.lower() == "dna-llm": + return format_variant_effect_for_dna_llm + else: + raise ValueError(f"Unsupported model name: {model_name}") + + +def clean_variant_effect_example(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Clean a variant effect example. + """ + example['answer'] = example['answer'].split(";")[0].strip().lower() + return example + + +def clean_variant_effect_non_snv_example(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Clean a variant effect non-SNV example. + """ + example['answer'] = example['answer'].replace("[", "").replace("]", "").replace("'", "").replace("_", " ").strip() + return example + + +def format_variant_effect_for_dna_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a VEP example into the required chat format for DNA-LLM. + """ + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": example["question"].strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": f"Answer: {example['answer'].strip()}", + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + example["reference_sequence"], + example["variant_sequence"], + ], + "answer": example["answer"].strip(), + } + + +def format_variant_effect_for_llm(example: Dict[str, Any]) -> Dict[str, Any]: + """ + Format a VEP example into the required chat format for LLM. 
+ """ + question = f"Reference sequence: {example['reference_sequence']}\nVariant sequence: {example['variant_sequence']}\nQuestion: {example['question']}" + return { + "prompt": [ + { + "role": "user", + "content": [ + *({"type": "dna", "text": None} for _ in range(2)), + {"type": "text", "text": question.strip()}, + ], + }, + { + "role": "assistant", + "reasoning_content": f"Answer: {example['answer'].strip()}", + "content": [ + {"type": "text", "text": f"Answer: {example['answer'].strip()}"}, + ], + }, + ], + "dna_sequences": [ + "", + "", + ], + "answer": example["answer"].strip(), + } \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/__init__.py b/BioReason-main/bioreason/dna_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2b166dfc42880b3737646adb004e43873633d9 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/__init__.py @@ -0,0 +1,4 @@ +from .dna_module import DNABaseModule +from .nucleotide_module import NucleotideDNAModule + +__all__ = ["DNABaseModule", "NucleotideDNAModule"] \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/dna_module.py b/BioReason-main/bioreason/dna_modules/dna_module.py new file mode 100644 index 0000000000000000000000000000000000000000..679d92745fec46687e73d99e5ade6f50a54c4811 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/dna_module.py @@ -0,0 +1,49 @@ +from abc import ABC, abstractmethod +from typing import Dict, Any, Union +import torch + +class DNABaseModule(ABC): + def __init__(self): + super().__init__() + + @abstractmethod + def get_dnallm_key(self): + pass + + @abstractmethod + def get_model_class(self, model_id: str, model_init_kwargs: dict): + pass + + def post_model_init(self, model, processing_class): + pass + + def is_embeds_input(self): + return False + + @abstractmethod + def get_processing_class(self): + pass + + @abstractmethod + def get_dnallm_modules_keywords(self): + pass + + @abstractmethod + def 
get_custom_multimodal_keywords(self): + pass + + @abstractmethod + def get_non_generate_params(self): + pass + + @abstractmethod + def get_custom_processing_keywords(self): + pass + + @abstractmethod + def prepare_prompt(self, processing_class, inputs: dict[str, Union[torch.Tensor, Any]]): + pass + + @abstractmethod + def prepare_model_inputs(self, processing_class, prompts_text, images, return_tensors, padding, padding_side, add_special_tokens): + pass \ No newline at end of file diff --git a/BioReason-main/bioreason/dna_modules/nucleotide_module.py b/BioReason-main/bioreason/dna_modules/nucleotide_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ef40652ddbbfd817435460ae2d9033e823cd5096 --- /dev/null +++ b/BioReason-main/bioreason/dna_modules/nucleotide_module.py @@ -0,0 +1,263 @@ +from transformers import ( + Qwen2_5_VLForConditionalGeneration, + Qwen2VLForConditionalGeneration, + AutoProcessor, +) +from typing import Dict, Any, Union, List, Optional, Callable, Type +from trl.data_utils import maybe_apply_chat_template +from trl import SFTTrainer +import torch + +from bioreason.dna_modules.dna_module import DNABaseModule +from bioreason.models.dna_llm import DNALLMModel +from bioreason.models.dl.processing_dl import DLProcessor + + +class NucleotideDNAModule(DNABaseModule): + """ + DNA module implementation for NucleotideTransformer-based models. + + This module provides the interface between DNA-LLM models and the training + infrastructure, handling model loading, processing setup, and reward functions. + """ + + def __init__(self): + """Initialize the NucleotideDNAModule.""" + super().__init__() + + def get_dnallm_key(self) -> str: + """ + Get the key identifier for this DNA-LLM implementation. + + Returns: + String identifier for this module type + """ + return "qwen" + + def get_model_class(self, model_id: str, model_init_kwargs: Dict[str, Any]) -> Type: + """ + Return the appropriate model class based on model ID. 
+ + Args: + model_id: Identifier for the model + model_init_kwargs: Initialization arguments for the model + + Returns: + The model class to instantiate + + Raises: + ValueError: If the model is not supported + """ + if "DNALLM" in model_id: + model_cls = DNALLMModel + else: + raise ValueError(f"Unsupported model: {model_id}") + return model_cls + + def post_model_init(self, model: Any, processing_class: Any) -> None: + """ + Perform any post-initialization setup on the model. + + Args: + model: The initialized model + processing_class: The processor for the model + """ + # No post-init needed for this implementation + pass + + def get_processing_class(self) -> Type: + """ + Get the processing class to use with this DNA-LLM model. + + Returns: + The processing class + """ + return DLProcessor + + def get_dnallm_modules_keywords(self) -> List[str]: + """ + Get keywords to identify DNA-specific modules in the model. + + Used to exclude DNA modules from LoRA adaptation during training. + + Returns: + List of keywords that identify DNA modules + """ + return ["dna"] + + def get_custom_multimodal_keywords(self) -> List[str]: + """ + Get keywords for multimodal inputs that should be passed to the model. + + Returns: + List of input keywords for multimodal processing + """ + return ["dna_tokenized", "batch_idx_map"] + + def get_non_generate_params(self) -> List[str]: + """ + Get parameter names that should be excluded from generation. + + Returns: + List of parameter names to exclude from generation calls + """ + return [] + + def get_custom_processing_keywords(self) -> List[tuple]: + """ + Get custom processing keywords for the processor. + + Returns: + List of (component, parameter) tuples for custom processing + """ + return [("dna_tokenizer", "max_length")] + + def prepare_prompt( + self, processing_class: Any, inputs: List[Dict[str, Union[torch.Tensor, Any]]] + ) -> List[str]: + """ + Prepare prompts from input examples. 
+ + Args: + processing_class: The processor to use + inputs: List of input examples + + Returns: + List of prepared prompts + """ + prompts_text = [ + maybe_apply_chat_template(example, processing_class)["prompt"] + for example in inputs + ] + return prompts_text + + def prepare_model_inputs( + self, + processing_class: Any, + model: Any, + prompts_text: List[str], + batch_dna_sequences: List[List[str]], + return_tensors: str = "pt", + padding: bool = True, + padding_side: str = "left", + add_special_tokens: bool = False, + ) -> Dict[str, Any]: + """ + Prepare inputs for the model. + + Args: + processing_class: The processor to use + model: The model to prepare inputs for + prompts_text: List of text prompts + batch_dna_sequences: List of lists of DNA sequences + return_tensors: Return format for tensors + padding: Whether to pad inputs + padding_side: Side to pad on + add_special_tokens: Whether to add special tokens + + Returns: + Processed inputs for the model + """ + # Handle DataParallel wrapped models by accessing the module attribute if needed + max_length_text = model.max_length_text if not hasattr(model, 'module') else model.module.max_length_text + max_length_dna = model.max_length_dna if not hasattr(model, 'module') else model.module.max_length_dna + + prompt_inputs = processing_class( + text=prompts_text, + batch_dna_sequences=batch_dna_sequences, + return_tensors=return_tensors, + padding=padding, + padding_side=padding_side, + add_special_tokens=add_special_tokens, + max_length_text=max_length_text, + max_length_dna=max_length_dna, + ) + + return prompt_inputs + + def is_embeds_input(self) -> bool: + """ + Whether the model uses embeddings as input (instead of token IDs). + + Returns: + Boolean indicating if the model takes embedding inputs + """ + return True + + @staticmethod + def get_question_template() -> str: + """ + Get the template for formatting questions. 
+ + Returns: + String template for questions + """ + return "{Question}" + + @staticmethod + def format_reward_rec(completions: List[Dict[str, Any]], **kwargs) -> List[float]: + """ + Check if the Qwen model output matches a specific format. + + Args: + completions: List of model completions + **kwargs: Additional arguments + + Returns: + List of reward scores (1.0 for match, 0.0 for no match) + """ + import re + import os + from datetime import datetime + + # Pattern to match the expected output format + pattern = r".*?\s*.*?\{.*\[\d+,\s*\d+,\s*\d+,\s*\d+\].*\}.*?" + completion_contents = [completion[0]["content"] for completion in completions] + matches = [ + re.search(pattern, content, re.DOTALL) is not None + for content in completion_contents + ] + + # Log format results if in debug mode + current_time = datetime.now().strftime("%d-%H-%M-%S-%f") + if os.getenv("DEBUG_MODE") == "true": + log_path = os.getenv("LOG_PATH") + with open( + log_path.replace(".txt", "_format.txt"), "a", encoding="utf-8" + ) as f: + f.write(f"------------- {current_time} Format reward -------------\n") + for content, match in zip(completion_contents, matches): + f.write(f"Content: {content}\n") + f.write(f"Has format: {bool(match)}\n") + + return [1.0 if match else 0.0 for match in matches] + + @staticmethod + def select_reward_func(func: str, task_type: str) -> Callable: + """ + Select the appropriate reward function based on function name and task type. + + Args: + func: The type of reward function ('accuracy', 'format', etc.) + task_type: The type of task ('rec', etc.) 
+ + Returns: + The reward function to use + + Raises: + ValueError: If the function or task type is not supported + """ + if func == "accuracy": + match task_type: + case "rec": + return NucleotideDNAModule.iou_reward + case _: + raise ValueError(f"Unsupported reward function: {func}") + elif func == "format": + match task_type: + case "rec": + return NucleotideDNAModule.format_reward_rec + case _: + raise ValueError(f"Unsupported reward function: {func}") + else: + raise ValueError(f"Unsupported reward function: {func}") \ No newline at end of file diff --git a/BioReason-main/bioreason/models/__init__.py b/BioReason-main/bioreason/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2774b1766a799c0c8469bbefafb3b35bb7e8726a --- /dev/null +++ b/BioReason-main/bioreason/models/__init__.py @@ -0,0 +1,9 @@ +from .dna_only import DNAClassifierModel +from .dna_llm import DNALLMModel +from .evo2_tokenizer import Evo2Tokenizer + +__all__ = [ + "DNAClassifierModel", + "DNALLMModel", + "Evo2Tokenizer", +] diff --git a/BioReason-main/bioreason/models/dl/__init__.py b/BioReason-main/bioreason/models/dl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/BioReason-main/bioreason/models/dl/__init__.py @@ -0,0 +1 @@ + diff --git a/BioReason-main/bioreason/models/dl/chat_template_dl.py b/BioReason-main/bioreason/models/dl/chat_template_dl.py new file mode 100644 index 0000000000000000000000000000000000000000..fdcec5da2058e2a42ce0c4b5db73ab350d81ae59 --- /dev/null +++ b/BioReason-main/bioreason/models/dl/chat_template_dl.py @@ -0,0 +1 @@ +CHAT_TEMPLATE = "{%- set dna_count = namespace(value=0) %}{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function 
signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content is string and message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' }} {%- if message.content is string %}{{- message.content + '<|im_end|>' + '\\n' }}{%- else %}{%- for content in message.content %}{%- if content.type == 'dna' or 'dna' in content %}{%- set dna_count.value = dna_count.value + 1 %}{%- if add_dna_id %}DNA Sequence {{- dna_count.value }}: {%- endif %}<|dna_start|><|dna_pad|><|dna_end|>{%- elif 'text' in content %}{{- content.text }}{%- endif %}{%- endfor %}{{- '<|im_end|>' + '\\n' }}{%- endif %}{%- elif message.role == \"assistant\" %}\n {%- set content = message.content[0].text %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content[0].text.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content[0].text.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" \ No newline at end of file diff --git a/BioReason-main/bioreason/models/dl/configuration_dl.py b/BioReason-main/bioreason/models/dl/configuration_dl.py new file mode 100644 index 0000000000000000000000000000000000000000..c5e0eac5e2bce002700459edd90a907125a81d28 --- /dev/null +++ b/BioReason-main/bioreason/models/dl/configuration_dl.py @@ -0,0 +1,232 @@ +from transformers import 
class DLDNAConfig(PretrainedConfig):
    """Configuration of the DNA-encoder sub-model of a DL checkpoint.

    Every constructor argument is stored unchanged as an attribute of the same
    name; the class performs no validation of its own.

    NOTE(review): the parameter set (``in_channels``, ``patch_size``,
    ``spatial_merge_size`` ...) mirrors a vision-tower configuration; confirm
    which of these fields the DNA encoder actually reads.

    Args:
        depth: Number of encoder blocks.
        hidden_size: Width of the encoder hidden states.
        hidden_act: Name of the activation function.
        intermediate_size: Width of the encoder MLP.
        num_heads: Number of attention heads.
        in_channels: Number of input channels.
        patch_size: Patch size.
        spatial_merge_size: Spatial merge factor.
        temporal_patch_size: Temporal patch size.
        tokens_per_second: Tokens-per-second setting.
        window_size: Attention window size.
        out_hidden_size: Width of the encoder output.
        fullatt_block_indexes: Indexes of the blocks that use full attention.
            ``None`` (the default) selects ``[7, 15, 23, 31]``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "dl"
    base_config_key = "dna_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # A list literal as a default argument would be shared by every
        # instance; build a fresh list per config instead.
        self.fullatt_block_indexes = (
            [7, 15, 23, 31]
            if fullatt_block_indexes is None
            else list(fullatt_block_indexes)
        )
        self.out_hidden_size = out_hidden_size
class DLConfig(PretrainedConfig):
    r"""
    Configuration of the DL (DNA + language) model.

    The text-decoder hyper-parameters follow the Qwen2.5-VL configuration this
    class was adapted from; the encoder sub-configuration is a [`DLDNAConfig`]
    stored on `self.dna_config`.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the text model.
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key/value heads used for Grouped Query Attention. If equal to `num_attention_heads` the
            model uses Multi Head Attention, if `1` Multi Query Attention, otherwise GQA. `None` falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA. The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict`, *optional*):
            Legacy alias of `dna_config`; only consulted when `dna_config` is not given.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Expected contents:
                `rope_type` (`str`):
                    One of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'].
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor applied on the attention computation.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Boundary for extrapolation in the linear ramp function (default 32).
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Boundary for interpolation in the linear ramp function (default 1).
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. Scaling factors for short contexts; length must be
                    hidden size / attention heads / 2.
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. Scaling factors for long contexts; length must be
                    hidden size / attention heads / 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
        image_token_id (`int`, *optional*):
            Legacy name of `dna_token_id`; only consulted when `dna_token_id` is not given.
        dna_config (`Dict` or [`DLDNAConfig`], *optional*):
            Configuration of the DNA encoder. A dict is expanded into a [`DLDNAConfig`]; `None` builds the
            default [`DLDNAConfig`].
        dna_token_id (`int`, *optional*):
            Id of the DNA placeholder token in the text vocabulary.

    ```python
    >>> from bioreason.models.dl.configuration_dl import DLConfig

    >>> # Initializing a DL configuration with default values
    >>> configuration = DLConfig()

    >>> # The DNA encoder sub-configuration
    >>> dna_configuration = configuration.dna_config
    ```"""

    model_type = "dl"
    sub_configs = {"dna_config": DLDNAConfig}
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for the base text model
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        image_token_id=None,
        dna_config=None,
        dna_token_id=None,
        **kwargs,
    ):
        # The module header only imports PretrainedConfig, so the validator
        # is imported here to keep this block self-contained.
        from transformers.modeling_rope_utils import rope_config_validation

        # "dna_config" is the only key registered in `sub_configs`; the old
        # lookup of "vision_config" raised KeyError on every construction.
        if dna_config is None:
            dna_config = vision_config
        dna_config_cls = self.sub_configs["dna_config"]
        if isinstance(dna_config, dict):
            self.dna_config = dna_config_cls(**dna_config)
        elif dna_config is None:
            self.dna_config = dna_config_cls()
        else:
            self.dna_config = dna_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Serialized configs carry "dna_token_id"; "image_token_id" is the
        # legacy spelling kept for existing callers.
        self.dna_token_id = image_token_id if dna_token_id is None else dna_token_id

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


__all__ = ["DLConfig"]
class DLProcessor(ProcessorMixin):
    r"""
    Constructs a DL processor which wraps a DNA tokenizer and a text tokenizer into a single processor.

    The processor tokenizes DNA sequences, expands every DNA placeholder token
    in the prompts to one placeholder per DNA token, and tokenizes the prompts.

    Args:
        tokenizer (PreTrainedTokenizerBase, *optional*):
            The text tokenizer used for processing text inputs.
        dna_tokenizer (PreTrainedTokenizerBase, *optional*):
            The DNA tokenizer used for processing DNA sequences.
        chat_template (`str`, *optional*):
            A Jinja template for chat formatting. If None, will use the tokenizer's template.
    """

    attributes = ["tokenizer", "dna_tokenizer"]
    valid_kwargs = ["model", "chat_template"]
    tokenizer_class = (
        "Qwen2Tokenizer", "Qwen2TokenizerFast",
        "GPT2TokenizerFast",
    )
    dna_tokenizer_class = ("EsmTokenizer", "Evo2Tokenizer")

    def __init__(
        self, tokenizer=None, dna_tokenizer=None, chat_template=None, **kwargs
    ):
        """
        Initialize the processor with text and DNA tokenizers.

        Args:
            tokenizer: Text tokenizer (usually from a language model)
            dna_tokenizer: DNA tokenizer (usually from a DNA model)
            chat_template: Template for formatting chat conversations
            **kwargs: Accepted for compatibility; not used.
        """
        self.tokenizer = tokenizer
        self.dna_tokenizer = dna_tokenizer

        # Placeholder token that marks "DNA goes here" in a prompt.
        self.dna_token = getattr(self.tokenizer, "dna_token", "<|dna_pad|>")

        # Get chat template from tokenizer if not provided
        if chat_template is None and hasattr(self.tokenizer, "chat_template"):
            chat_template = self.tokenizer.chat_template
        super().__init__(tokenizer, dna_tokenizer, chat_template=chat_template)

        # The GRPO trainer might expect this to be set
        if getattr(self.tokenizer, "pad_token", None) is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def tokenize_dna_sequences(
        self,
        batch_dna_sequences: List[List[str]],
        max_length: int = 2048,
        return_tensors: str = "pt",
        device: str = "cuda",
    ) -> Dict[str, Any]:
        """
        Tokenize a batch of DNA sequences.

        Args:
            batch_dna_sequences: List of lists of DNA sequences per batch item
            max_length: Maximum allowed length for DNA sequences
            return_tensors: Return format for tensors ("pt" for PyTorch)
            device: Accepted for compatibility; the tokenized tensors are not
                moved by this method.

        Returns:
            Dict containing:
                - dna_tokenized: The tokenized DNA sequences, or None when the
                  batch holds no sequence at all
                - batch_idx_map: For each flattened sequence, the index of the
                  batch item it belongs to
        """
        # Flatten the nested batch while remembering each sequence's owner.
        all_sequences = [seq for item in batch_dna_sequences for seq in item]
        batch_idx_map = [
            batch_idx
            for batch_idx, item in enumerate(batch_dna_sequences)
            for _ in item
        ]

        # If no sequences in the entire batch, there is nothing to tokenize
        if not all_sequences:
            return {"dna_tokenized": None, "batch_idx_map": []}

        # Tokenize all sequences at once so they share one padded length
        dna_tokenized = self.dna_tokenizer(
            all_sequences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors=return_tensors,
            return_attention_mask=True,
        )

        return {"dna_tokenized": dna_tokenized, "batch_idx_map": batch_idx_map}

    def _dna_token_counts(self, dna_tokenized) -> List[int]:
        """Return the number of non-padding tokens of every tokenized DNA sequence."""
        if dna_tokenized is None:
            return []
        # Prefer the attention mask: the model trims DNA features with the
        # same mask, so placeholder and feature counts stay in agreement.
        if "attention_mask" in dna_tokenized:
            return [int(row.sum()) for row in dna_tokenized["attention_mask"]]
        # Fallback for tokenizers without a mask: count non-pad ids
        # (1 was the pad id previously hard-coded here).
        pad_id = getattr(self.dna_tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 1
        return [int((row != pad_id).sum()) for row in dna_tokenized["input_ids"]]

    def _expand_dna_placeholders(self, text: List[str], token_counts: List[int]) -> List[str]:
        """Return copies of the prompts with the n-th DNA token repeated once per token of the n-th sequence."""
        marker = "<|placeholder|>"
        expanded = []
        seq_index = 0
        for prompt in text:
            while self.dna_token in prompt:
                if seq_index >= len(token_counts):
                    raise ValueError(
                        f"The prompts contain more '{self.dna_token}' placeholders "
                        f"than the {len(token_counts)} DNA sequence(s) provided"
                    )
                # A temporary marker keeps already expanded placeholders from
                # being matched again by the `while` condition.
                prompt = prompt.replace(
                    self.dna_token, marker * token_counts[seq_index], 1
                )
                seq_index += 1
            expanded.append(prompt.replace(marker, self.dna_token))
        return expanded

    def __call__(
        self,
        batch_dna_sequences: Optional[List[List[str]]] = None,
        text: Optional[
            Union[
                TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
            ]
        ] = None,
        max_length_text: int = 512,
        max_length_dna: int = 2048,
        return_tensors: str = "pt",
        device: str = "cuda",
        **kwargs: Unpack[DLProcessorKwargs],
    ) -> BatchFeature:
        """
        Process text and DNA sequences for model input.

        Args:
            batch_dna_sequences: List of lists of DNA sequences per batch item
            text: Input text or list of texts; the caller's list is not modified
            max_length_text: Maximum length for text sequences
            max_length_dna: Maximum length for DNA sequences
            return_tensors: Return format for tensors
            device: Accepted for compatibility; tensors are not moved here
            **kwargs: Additional processor keyword arguments

        Returns:
            BatchFeature with the text tokenizer outputs plus, when
            `batch_dna_sequences` is given, `dna_tokenized` and `batch_idx_map`

        Raises:
            ValueError: If `text` is None, or if the prompts contain more DNA
                placeholders than there are DNA sequences.
        """
        if text is None:
            raise ValueError("'text' must be provided")

        output_kwargs = self._merge_kwargs(
            DLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Work on a private list so the caller's prompts are left untouched
        text = list(text) if isinstance(text, list) else [text]

        dna_inputs = {}
        if batch_dna_sequences is not None:
            # Tokenize DNA sequences
            dna_processing_result = self.tokenize_dna_sequences(
                batch_dna_sequences,
                max_length=max_length_dna,
                return_tensors=return_tensors,
                device=device,
            )

            # One placeholder per DNA token, so the model can scatter the DNA
            # features into the text embedding sequence
            text = self._expand_dna_placeholders(
                text, self._dna_token_counts(dna_processing_result["dna_tokenized"])
            )

            # Add batch info to the output
            dna_inputs = {
                "dna_tokenized": dna_processing_result["dna_tokenized"],
                "batch_idx_map": dna_processing_result["batch_idx_map"],
            }

        # Explicit arguments win over merged defaults (padding is always on),
        # which also avoids duplicate-keyword errors.
        tokenizer_kwargs = {
            **output_kwargs.get("text_kwargs", {}),
            "max_length": max_length_text + 2 * max_length_dna,
            "return_tensors": return_tensors,
            "padding": True,
            "truncation": True,
        }
        text_inputs = self.tokenizer(text, **tokenizer_kwargs)

        # The BatchFeature should have all required fields for the model's forward pass
        return BatchFeature(data={**text_inputs, **dna_inputs})

    def batch_decode(self, *args, **kwargs) -> List[str]:
        """
        This method forwards all its arguments to the tokenizer's batch_decode.

        Returns:
            List of decoded strings
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs) -> str:
        """
        This method forwards all its arguments to the tokenizer's decode.

        Returns:
            Decoded string
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_dna_to_text(
        self,
        generated_outputs: torch.Tensor,
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> List[str]:
        """
        Post-process the model output to decode the text.

        Args:
            generated_outputs: The token IDs generated by the model
            skip_special_tokens: Whether to skip special tokens in the output
            **kwargs: Additional arguments for the decoder

        Returns:
            List of decoded strings
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            **kwargs,
        )

    @property
    def model_input_names(self) -> List[str]:
        """
        Get the input names expected by the model.

        Returns:
            Tokenizer input names followed by the DNA input names, without duplicates
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        dna_input_names = ["dna_tokenized", "batch_idx_map"]

        return list(dict.fromkeys(tokenizer_input_names + dna_input_names))
class DNALLMModel(nn.Module):
    """
    A combined model that processes both DNA sequences and text inputs.

    A DNA encoder extracts per-token features from DNA sequences, a linear
    projection maps them to the text model's embedding width, and the projected
    features overwrite the embeddings of the ``<|dna_pad|>`` placeholder tokens
    before the text model runs.
    """

    def __init__(
        self,
        text_model_name: str,
        dna_model_name: str,
        cache_dir: Optional[str] = None,
        max_length_dna: int = 2048,
        max_length_text: int = 512,
        text_model_finetune: bool = True,
        dna_model_finetune: bool = True,
        dna_is_evo2: bool = False,
        dna_embedding_layer: Optional[str] = None
    ):
        """
        Initialize the DNALLMModel.

        Args:
            text_model_name: Name of the text model to be used.
            dna_model_name: Name of the DNA model to be used.
            cache_dir: Directory to cache the models.
            max_length_dna: Maximum length of DNA sequences. Defaults to 2048.
            max_length_text: Maximum length of text sequences. Defaults to 512.
            text_model_finetune: Whether to finetune the text model. Stored
                only; this class does not freeze parameters itself.
            dna_model_finetune: Whether to finetune the DNA model. Stored only;
                the DNA encoder is always run without gradients here.
            dna_is_evo2: Whether the DNA model is Evo2. Defaults to False.
            dna_embedding_layer: Name of the layer to use for the Evo2 model. Defaults to None.
        """
        super().__init__()

        self.text_model_finetune = text_model_finetune
        self.dna_model_finetune = dna_model_finetune
        self.max_length_dna = max_length_dna
        self.max_length_text = max_length_text
        self.dna_is_evo2 = dna_is_evo2
        self.dna_embedding_layer = dna_embedding_layer

        # Load the text model and tokenizer
        self.text_model = AutoModelForCausalLM.from_pretrained(
            text_model_name, cache_dir=cache_dir, trust_remote_code=True
        )
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name, trust_remote_code=True)
        self.text_config = self.text_model.config
        self.text_tokenizer.chat_template = CHAT_TEMPLATE
        self.text_tokenizer.pad_token = self.text_tokenizer.eos_token

        new_tokens = ["<|dna_start|>", "<|dna_pad|>", "<|dna_end|>"]
        self.text_tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
        self.dna_token_id = self.text_tokenizer.convert_tokens_to_ids("<|dna_pad|>")

        # The new token ids must be valid rows of the embedding matrix. Only
        # grow it when needed, so checkpoints with a padded vocabulary keep
        # their original shape.
        embedding_rows = self.text_model.get_input_embeddings().weight.shape[0]
        if len(self.text_tokenizer) > embedding_rows:
            self.text_model.resize_token_embeddings(len(self.text_tokenizer))

        # Load the DNA model and tokenizer
        if not self.dna_is_evo2:
            self.dna_model = AutoModelForMaskedLM.from_pretrained(
                dna_model_name, cache_dir=cache_dir, trust_remote_code=True
            )
            self.dna_tokenizer = AutoTokenizer.from_pretrained(dna_model_name, trust_remote_code=True)
            self.dna_config = self.dna_model.config
        else:
            # Imported lazily: evo2 is only required when it is actually used
            from evo2 import Evo2
            self.dna_model = Evo2(dna_model_name)
            self.dna_tokenizer = Evo2Tokenizer(self.dna_model.tokenizer)
            self.dna_config = self.dna_model.model.config

        # Get model dimensions
        self.text_hidden_size = self.text_config.hidden_size
        self.dna_hidden_size = self.dna_config.hidden_size

        # Create projection layer to map DNA embeddings to text model's embedding space
        self.dna_projection = nn.Linear(self.dna_hidden_size, self.text_hidden_size)

        # Create processor for handling inputs
        self.processor = DLProcessor(tokenizer=self.text_tokenizer, dna_tokenizer=self.dna_tokenizer)

    def _empty_dna_embedding(self) -> torch.Tensor:
        """Return a (0, text_hidden_size) tensor on the projection's device and dtype."""
        weight = self.dna_projection.weight
        return torch.zeros(
            (0, self.text_hidden_size), device=weight.device, dtype=weight.dtype
        )

    def process_dna_embeddings(
        self,
        dna_tokenized: Dict[str, torch.Tensor],
        batch_idx_map: List[int],
        batch_size: int,
    ) -> List[torch.Tensor]:
        """
        Process DNA sequences to obtain embeddings.

        Args:
            dna_tokenized: Tokenized DNA sequences ("input_ids" and "attention_mask")
            batch_idx_map: Mapping of each sequence to its batch item
            batch_size: Number of items in the batch

        Returns:
            One tensor per batch item holding the projected embeddings of all
            non-padding DNA tokens of that item, concatenated in sequence
            order; items without DNA get a (0, text_hidden_size) tensor.
        """
        # The DNA encoder itself is run without gradients
        with torch.no_grad():
            if self.dna_is_evo2 and self.dna_embedding_layer is not None:  # Evo2 model
                # Evo2 is queried one sequence at a time for the chosen layer
                hidden_states_list = []
                for seq_idx in range(len(dna_tokenized["input_ids"])):
                    input_ids = dna_tokenized["input_ids"][seq_idx:seq_idx + 1]
                    _, embeddings = self.dna_model(
                        input_ids,
                        return_embeddings=True,
                        layer_names=[self.dna_embedding_layer]
                    )
                    hidden_states_list.append(
                        embeddings[self.dna_embedding_layer].squeeze(0)
                    )

                if not hidden_states_list:
                    return [self._empty_dna_embedding() for _ in range(batch_size)]
                # Stack to get same format as non-Evo2 output
                hidden_states = torch.stack(hidden_states_list)
            else:  # Standard HuggingFace model
                outputs = self.dna_model(
                    input_ids=dna_tokenized["input_ids"],
                    attention_mask=dna_tokenized["attention_mask"],
                    output_hidden_states=True,
                )
                # Last hidden state, indexed below as [sequence, token, feature]
                hidden_states = outputs.hidden_states[-1]

        # Project all embeddings at once. NOTE(review): kept outside no_grad so
        # the projection can receive gradients; confirm against the original
        # indentation, which was lost in this dump.
        hidden_states = hidden_states.to(
            device=self.dna_projection.weight.device,
            dtype=self.dna_projection.weight.dtype,
        )
        projected_states = self.dna_projection(hidden_states)

        # Group the valid (non-padding) token embeddings by batch item
        grouped = [[] for _ in range(batch_size)]
        for seq_idx, batch_idx in enumerate(batch_idx_map):
            valid_length = dna_tokenized["attention_mask"][seq_idx].sum().item()
            grouped[batch_idx].append(projected_states[seq_idx, :valid_length])

        # Empty items must live on the same device/dtype as the others,
        # otherwise concatenating the per-item tensors later fails.
        return [
            torch.cat(parts, dim=0) if parts else self._empty_dna_embedding()
            for parts in grouped
        ]

    def _embed_inputs(
        self,
        input_ids: torch.Tensor,
        dna_tokenized: Optional[Dict[str, torch.Tensor]],
        batch_idx_map: Optional[List[int]],
    ) -> torch.Tensor:
        """Embed the prompt tokens and overwrite DNA placeholder positions with DNA features."""
        text_inputs_embeds = self.text_model.get_input_embeddings()(input_ids)
        if dna_tokenized is None or not batch_idx_map:
            return text_inputs_embeds

        batch_dna_embeds = self.process_dna_embeddings(
            dna_tokenized, batch_idx_map, input_ids.shape[0]
        )

        mask = input_ids == self.dna_token_id
        n_dna_tokens = mask.sum().item()
        dna_embeds_flat = torch.cat(batch_dna_embeds, dim=0)
        n_dna_features = dna_embeds_flat.shape[0]

        if n_dna_features != n_dna_tokens:
            raise ValueError(
                f"DNA features and DNA tokens do not match: features {n_dna_features}, tokens: {n_dna_tokens}"
            )

        # Ensure DNA embeddings match the text embeddings before the scatter
        dna_embeds_flat = dna_embeds_flat.to(
            device=text_inputs_embeds.device, dtype=text_inputs_embeds.dtype
        )
        text_inputs_embeds[mask] = dna_embeds_flat
        return text_inputs_embeds

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        dna_tokenized: Optional[Dict[str, torch.Tensor]] = None,
        batch_idx_map: Optional[List[int]] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Run the text model on prompts whose DNA placeholders carry DNA features.

        Args:
            input_ids: Prompt token ids
            attention_mask: Attention mask for `input_ids`
            dna_tokenized: Tokenized DNA sequences
            batch_idx_map: Batch mapping for DNA sequences
            labels: Labels forwarded unchanged to the text model
            **kwargs: Additional arguments forwarded to the text model

        Returns:
            Outputs from the text model

        Raises:
            ValueError: If `input_ids` or `attention_mask` is missing, or if
                the number of DNA features differs from the number of DNA
                placeholder tokens.
        """
        if input_ids is None or attention_mask is None:
            raise ValueError("Either 'inputs' or 'input_ids'/'attention_mask' must be provided")

        text_inputs_embeds = self._embed_inputs(input_ids, dna_tokenized, batch_idx_map)

        # TODO: label-specific handling is not implemented; labels are passed
        # through as given (the text model computes the loss when provided).
        return self.text_model(
            inputs_embeds=text_inputs_embeds,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs,
        )

    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        dna_tokenized: Optional[Dict[str, torch.Tensor]] = None,
        batch_idx_map: Optional[List[int]] = None,
        **generation_kwargs,
    ) -> Union[torch.Tensor, List[str]]:
        """
        Generate text based on DNA and text inputs.

        Args:
            input_ids: Prompt token ids
            attention_mask: Attention mask for `input_ids`
            dna_tokenized: Tokenized DNA sequences
            batch_idx_map: Batch mapping for DNA sequences
            **generation_kwargs: Additional arguments for generation;
                `use_cache` defaults to True

        Returns:
            Generated token IDs which can be decoded using the processor

        Raises:
            ValueError: If `input_ids` or `attention_mask` is missing, or if
                the number of DNA features differs from the number of DNA
                placeholder tokens.
        """
        if input_ids is None or attention_mask is None:
            raise ValueError("Either 'inputs' or 'input_ids'/'attention_mask' must be provided")

        # A caller-supplied use_cache must not clash with the default
        generation_kwargs.setdefault("use_cache", True)

        # Nothing in generation needs gradients, including the embedding step
        with torch.no_grad():
            text_inputs_embeds = self._embed_inputs(input_ids, dna_tokenized, batch_idx_map)
            return self.text_model.generate(
                inputs_embeds=text_inputs_embeds,
                attention_mask=attention_mask,
                **generation_kwargs,
            )
class SelfAttentionPooling(nn.Module):
    """Pool a token sequence into one vector with a learned attention query.

    A single trainable query vector attends over the sequence through
    multi-head attention; the attended context is the pooled representation.
    """

    def __init__(self, hidden_size, num_heads=8):
        super().__init__()
        # PyTorch's built-in multi-head attention, batch-first layout
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size, num_heads=num_heads, batch_first=True
        )
        # Learnable query, shared by every example in a batch
        self.query = nn.Parameter(torch.randn(1, 1, hidden_size))

    def forward(self, embeddings, attention_mask=None):
        """Return one pooled vector per example.

        Args:
            embeddings: Tensor indexed as [batch, token, feature].
            attention_mask: Optional mask; positions equal to 0 are ignored.

        Returns:
            Tensor indexed as [batch, feature].
        """
        # One copy of the learned query per example
        pooled_query = self.query.expand(embeddings.size(0), -1, -1)

        # MultiheadAttention expects True where a key must be ignored
        ignore_mask = None if attention_mask is None else attention_mask == 0

        pooled, _weights = self.attention(
            pooled_query, embeddings, embeddings, key_padding_mask=ignore_mask
        )

        # Drop the singleton query dimension
        return pooled.squeeze(1)
class DNAClassifierModel(nn.Module):
    """
    A simple classifier that uses a DNA model with a classification head.

    Each example is a (reference, alternate) pair of tokenized DNA sequences. Both
    sequences are encoded by the same DNA model, pooled to one vector each with
    `SelfAttentionPooling`, concatenated, and passed through a two-layer MLP that
    produces `num_classes` logits.
    """

    def __init__(
        self,
        dna_model_name: str,
        cache_dir: str = None,
        max_length_dna: int = 4096,
        num_classes: int = 2,  # Binary classification by default
        dna_is_evo2: bool = False,
        dna_embedding_layer: str = None,
        train_just_classifier: bool = True
    ):
        """
        Initialize the DNAClassifierModel.

        Args:
            dna_model_name (str): Name of the DNA model to use
            cache_dir (str): Directory to cache the model and tokenizer
            max_length_dna (int): Maximum sequence length
            num_classes (int): Number of output classes
            dna_is_evo2: Whether the DNA model is Evo2. Defaults to False
            dna_embedding_layer: Name of the layer to use for the Evo2 model. Defaults to None
            train_just_classifier: Whether to train just the classifier. Defaults to True
        """
        super().__init__()

        self.dna_model_name = dna_model_name
        self.cache_dir = cache_dir
        self.max_length_dna = max_length_dna
        self.num_classes = num_classes
        self.dna_is_evo2 = dna_is_evo2
        self.dna_embedding_layer = dna_embedding_layer
        self.train_just_classifier = train_just_classifier

        # Load the DNA model and tokenizer
        if not self.dna_is_evo2:
            self.dna_model = AutoModelForMaskedLM.from_pretrained(
                dna_model_name, cache_dir=cache_dir, trust_remote_code=True
            )
            # Use the same cache directory as the model so both artifacts live together
            self.dna_tokenizer = AutoTokenizer.from_pretrained(
                dna_model_name, cache_dir=cache_dir, trust_remote_code=True
            )
            self.dna_config = self.dna_model.config

        else:
            # Imported lazily so that the Evo2 dependency is only needed when it is used
            from evo2 import Evo2
            from bioreason.models.evo2_tokenizer import Evo2Tokenizer
            self.dna_model = Evo2(dna_model_name)
            self.dna_tokenizer = Evo2Tokenizer(self.dna_model.tokenizer)
            self.dna_config = self.dna_model.model.config

        # Get hidden size from model config
        self.hidden_size = self.dna_config.hidden_size

        # Add the self-attention pooling module
        self.pooler = SelfAttentionPooling(self.hidden_size)

        # Create classification head that takes concatenated embeddings from both sequences
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size * 2, self.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_size, num_classes),
        )

    def get_dna_embedding(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
        """
        Get DNA embedding for a single DNA sequence using self-attention pooling.

        Args:
            input_ids: DNA tokenized sequence, either [seq_len] or [batch, seq_len]
            attention_mask: Matching attention mask; when None, every token is attended to

        Returns:
            torch.Tensor: Tensor containing the self-attention pooled DNA embedding. A leading
            batch dimension of size 1 is squeezed away.
        """
        # Add batch dimension if not present
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)  # [1, seq_len]

        # Handle attention mask - create if not provided or add batch dimension
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        elif attention_mask.dim() == 1:
            attention_mask = attention_mask.unsqueeze(0)  # [1, seq_len]

        # Gradients flow through the DNA model only when it is being fine-tuned as well
        with torch.set_grad_enabled(not self.train_just_classifier):

            if self.dna_is_evo2 and self.dna_embedding_layer is not None:  # Evo2 model
                # Get embeddings from the specific layer in Evo2.
                # Note: the attention mask is not forwarded on this path; it is only used for pooling.
                _, embeddings = self.dna_model(
                    input_ids,
                    return_embeddings=True,
                    layer_names=[self.dna_embedding_layer]
                )

                # Get embeddings for the specified layer
                hidden_states = embeddings[self.dna_embedding_layer]

            else:
                # Get embeddings from the last hidden state
                outputs = self.dna_model(
                    input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True,
                )

                # Get the last hidden state
                hidden_states = outputs.hidden_states[-1]

        # Apply self-attention pooling to get a weighted representation
        sequence_embedding = self.pooler(hidden_states, attention_mask)
        return sequence_embedding.squeeze(0)

    def forward(
        self, ref_ids=None, alt_ids=None, ref_attention_mask=None, alt_attention_mask=None
    ):
        """
        Forward pass of the model.

        Args:
            ref_ids: Reference sequence token IDs, [batch_size, seq_len]
            alt_ids: Alternate sequence token IDs, [batch_size, seq_len]
            ref_attention_mask: Reference sequence attention mask (optional)
            alt_attention_mask: Alternate sequence attention mask (optional)

        Returns:
            torch.Tensor: Classification logits, [batch_size, num_classes]

        Raises:
            ValueError: If `ref_ids` or `alt_ids` is missing, or their batch sizes differ.
        """
        # Both sequences are indexed below, so validate up front instead of failing
        # later with an AttributeError/TypeError on None.
        if ref_ids is None or alt_ids is None:
            raise ValueError("Both `ref_ids` and `alt_ids` must be provided")

        batch_size = ref_ids.shape[0]
        if alt_ids.shape[0] != batch_size:
            raise ValueError(
                f"`ref_ids` and `alt_ids` must have the same batch size, got {batch_size} and {alt_ids.shape[0]}"
            )

        ref_embeddings = []
        alt_embeddings = []

        # Process each example in the batch one at a time
        for i in range(batch_size):
            # Masks are optional: `get_dna_embedding` builds an all-ones mask when given None
            ref_mask = None if ref_attention_mask is None else ref_attention_mask[i]
            alt_mask = None if alt_attention_mask is None else alt_attention_mask[i]

            # Get sequence embeddings
            ref_embeddings.append(self.get_dna_embedding(ref_ids[i], ref_mask))
            alt_embeddings.append(self.get_dna_embedding(alt_ids[i], alt_mask))

        # Stack embeddings -> [batch_size, hidden_size]
        ref_embeddings = torch.stack(ref_embeddings)
        alt_embeddings = torch.stack(alt_embeddings)

        # Concatenate ref and alt embeddings -> [batch_size, 2 * hidden_size]
        combined_embeddings = torch.cat([ref_embeddings, alt_embeddings], dim=1)

        # Pass through classifier
        logits = self.classifier(combined_embeddings)

        return logits
class Evo2Tokenizer(PreTrainedTokenizer):
    """
    Tokenizer for Evo2 models - wraps the CharLevelTokenizer to be compatible with HuggingFace.

    Tokens are single characters and token ids are their code points (`ord`/`chr`).
    All actual tokenization and detokenization is delegated to the wrapped Evo2 tokenizer.
    """
    vocab_files_names = {}  # No vocab files needed
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        evo2_tokenizer,
        bos_token="",
        eos_token="",
        pad_token="",
        unk_token="",
        **kwargs
    ):
        """
        Initialize the Evo2Tokenizer.

        Args:
            evo2_tokenizer: The Evo2 CharLevelTokenizer to wrap
            bos_token: Beginning of sequence token
            eos_token: End of sequence token
            pad_token: Padding token
            unk_token: Unknown token
        """
        # Must be set before `super().__init__` because the base class may query the vocab
        self.evo2_tokenizer = evo2_tokenizer

        # Map special tokens to Evo2 tokenizer's special token IDs
        self._pad_token = pad_token
        self._eos_token = eos_token
        self._bos_token = bos_token
        self._unk_token = unk_token

        # Initialize with special tokens
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            **kwargs
        )

        # Set token IDs from Evo2 tokenizer
        self.pad_token_id = self.evo2_tokenizer.pad_id
        self.eos_token_id = self.evo2_tokenizer.eos_id

    @property
    def vocab_size(self) -> int:
        """Return the vocab size of the tokenizer."""
        return self.evo2_tokenizer.vocab_size

    def get_vocab(self) -> Dict:
        """Return vocab as a dictionary."""
        # Evo2 CharLevelTokenizer doesn't have a traditional vocab dict
        # Create a simple mapping of ASCII codes to tokens
        return {chr(i): i for i in range(self.vocab_size)}

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string using the Evo2 tokenizer and return one character per token id."""
        return [chr(int(token)) for token in self.evo2_tokenizer.tokenize(text)]

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to an id using the Evo2 tokenizer."""
        # Since tokens are just characters, convert to their ASCII value
        return ord(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an id to a token using the Evo2 tokenizer."""
        # Convert ASCII value back to character
        return chr(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens to a single string."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """No vocabulary to save for Evo2Tokenizer, so just return an empty tuple."""
        return ()

    def __call__(
        self,
        text: Union[str, List[str]],
        text_pair: Optional[Union[str, List[str]]] = None,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main tokenization method that handles batching and converts to tensors.

        Args:
            text: One sequence or a list of sequences to tokenize.
            text_pair: Accepted for signature compatibility; not used.
            padding: `True`/`"longest"` pads to the longest sequence in the batch.
                `"max_length"` pads to `max_length` (or to the longest sequence if that is
                longer, so the batch stays rectangular). `False`/`"do_not_pad"` disables padding.
                Padding is applied on the left.
            truncation: When enabled (anything truthy other than `"do_not_truncate"`) and
                `max_length` is set, sequences are cut to their first `max_length` tokens.
            max_length: Length used for truncation and for `padding="max_length"`.
            return_tensors: `"pt"` to get PyTorch tensors.
            return_token_type_ids: Accepted for signature compatibility; not used.
            return_attention_mask: Whether to include `attention_mask` in the result.

        Returns:
            BatchEncoding with `input_ids` and, optionally, `attention_mask`.
        """
        # Handle single string vs list of strings
        if isinstance(text, str):
            text = [text]

        # The HuggingFace strategy strings "do_not_truncate" / "do_not_pad" are truthy,
        # so they have to be excluded explicitly.
        should_truncate = bool(truncation) and truncation != "do_not_truncate"
        should_pad = bool(padding) and padding != "do_not_pad"

        # Tokenize all sequences - note: tokenizer only accepts strings, not lists
        input_ids_list = []
        for seq in text:
            # Tokenize and convert numpy.uint8 to Python integers
            tokens = [int(token) for token in self.evo2_tokenizer.tokenize(seq)]

            # Truncate if needed
            if should_truncate and max_length and len(tokens) > max_length:
                tokens = tokens[:max_length]

            input_ids_list.append(tokens)

        # Apply padding if needed
        if should_pad:
            # `default=0` keeps an empty batch from raising in `max`
            max_len = max((len(ids) for ids in input_ids_list), default=0)
            if padding == "max_length" and max_length:
                max_len = max(max_len, max_length)

            # Create padded sequences and attention masks
            padded_input_ids = []
            attention_mask = []

            for ids in input_ids_list:
                # Apply left padding (pad on the left)
                padding_length = max_len - len(ids)
                padded_input_ids.append([self.pad_token_id] * padding_length + ids)
                attention_mask.append([0] * padding_length + [1] * len(ids))

            input_ids_list = padded_input_ids
        else:
            # Create attention mask without padding
            attention_mask = [[1] * len(ids) for ids in input_ids_list]

        # Create result dictionary
        result = {"input_ids": input_ids_list}
        if return_attention_mask:
            result["attention_mask"] = attention_mask

        # Convert to tensors if requested
        if return_tensors == "pt":
            result = {k: torch.tensor(v) for k, v in result.items()}

        # Return a BatchEncoding object rather than a plain dictionary
        return BatchEncoding(
            data=result,
            tensor_type=return_tensors,
            prepend_batch_axis=False,  # Already handled in our tensor creation
            encoding=None  # No encoding info from Evo2's tokenizer
        )

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], torch.Tensor],
        skip_special_tokens: bool = False,
        **kwargs
    ) -> List[str]:
        """
        Decode a batch of token ids to strings.

        `skip_special_tokens` and extra keyword arguments are accepted for API
        compatibility but are not used.
        """
        if isinstance(sequences, torch.Tensor):
            sequences = sequences.tolist()

        return self.evo2_tokenizer.detokenize_batch(sequences)

    def decode(
        self,
        token_ids: Union[int, List[int], torch.Tensor],
        skip_special_tokens: bool = False,
        **kwargs
    ) -> str:
        """
        Decode a single sequence of token ids to a string.

        Accepts a single id, a flat sequence of ids, or a nested batch (in which case
        the first sequence is decoded).
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        # A lone id (including a 0-d tensor converted above) is wrapped so that
        # `detokenize` receives a sequence, as it does from every other call in this class.
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        # Single sequence
        if not isinstance(token_ids, list) or not token_ids or not isinstance(token_ids[0], (list, torch.Tensor)):
            return self.evo2_tokenizer.detokenize(token_ids)

        # Batch with one item
        return self.batch_decode(token_ids, skip_special_tokens, **kwargs)[0]
0000000000000000000000000000000000000000..9b772f871463c526e37b5d8952e8e5337a18130e --- /dev/null +++ b/BioReason-main/bioreason/trainer/demo_grpo.py @@ -0,0 +1,811 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import textwrap +import warnings +from collections import defaultdict +from typing import Any, Callable, Optional, Sized, Union +from unittest.mock import patch + +import torch +import torch.utils.data +import transformers +from accelerate.utils import broadcast_object_list, gather, gather_object, is_peft_model, set_seed +from accelerate.utils.other import is_compiled_module +from datasets import Dataset, IterableDataset +from packaging import version +from torch import nn +from torch.utils.data import Sampler +from transformers import ( + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoTokenizer, + GenerationConfig, + PreTrainedModel, + PreTrainedTokenizerBase, + Trainer, + TrainerCallback, + is_wandb_available, +) +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.utils import is_peft_available + +from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template +from trl.import_utils import is_vllm_available +from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation +from trl import SyncRefModelCallback +from trl import GRPOConfig +from trl.trainer.utils 
class RepeatRandomSampler(Sampler):
    """
    Sampler that yields a random permutation of dataset indices, each repeated
    `repeat_count` times in a row.

    Args:
        data_source (`Sized`):
            Dataset to sample from.
        repeat_count (`int`):
            Number of times to repeat each index.
        seed (`Optional[int]`):
            Random seed for reproducibility (only affects this sampler).

    Example (the exact order depends on the random permutation):
    ```python
    >>> sampler = RepeatRandomSampler(["a", "b", "c", "d"], repeat_count=2)
    >>> list(sampler)
    [2, 2, 0, 0, 3, 3, 1, 1]
    ```
    """

    def __init__(self, data_source: Sized, repeat_count: int, seed: Optional[int] = None):
        self.data_source = data_source
        self.repeat_count = repeat_count
        self.seed = seed
        self.num_samples = len(data_source)

        # A private generator keeps this sampler independent of the global RNG state
        rng = torch.Generator()
        if seed is not None:
            rng.manual_seed(seed)
        self.generator = rng

    def __iter__(self):
        shuffled = torch.randperm(self.num_samples, generator=self.generator).tolist()

        # Emit every shuffled index `repeat_count` times back to back
        repeated = []
        for dataset_index in shuffled:
            repeated.extend([dataset_index] * self.repeat_count)
        return iter(repeated)

    def __len__(self):
        return self.repeat_count * self.num_samples
This algorithm was initially proposed in the + paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + + Example: + + ```python + from datasets import load_dataset + from trl import GRPOTrainer + + dataset = load_dataset("trl-lib/tldr", split="train") + + def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] + + trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, + ) + + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or + a path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is + loaded using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keywork arguments + in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. Can be either: + + - A single reward function, such as: + - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the + keyword arguments in `args.model_init_kwargs`. 
+ - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported. + - A custom reward function: The function is provided with the prompts and the generated completions, + plus any additional columns in the dataset. It should return a list of rewards. For more details, see + [Using a custom reward function](#using-a-custom-reward-function). + - A list of reward functions, where each item can independently be any of the above types. Mixing different + types within the list (e.g., a string model ID and a custom reward function) is allowed. + args ([`GRPOConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is + ignored. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. + - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. + processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Processing class used to process the data. The padding side must be set to "left". If `None`, the + processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. + reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`): + Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either: + + - A single processing class: Used when `reward_funcs` contains only one reward function. 
+ - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`. + If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is + `None`, the tokenizer for the model is automatically loaded using [`~transformers.AutoTokenizer.from_pretrained`]. + For elements in `reward_funcs` that are custom reward functions (not [`~transformers.PreTrainedModel`]), + the corresponding entries in `reward_processing_classes` are ignored. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks + detailed in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
+ """ + + _tag_names = ["trl", "grpo"] + + def __init__( + self, + model: Union[str, PreTrainedModel], + reward_funcs: Union[RewardFunc, list[RewardFunc]], + args: GRPOConfig = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[PreTrainedTokenizerBase] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + peft_config: Optional["PeftConfig"] = None, + ): + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = GRPOConfig(f"{model_name}-GRPO") + + # Models + # Trained model + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + model_init_kwargs["torch_dtype"] = torch_dtype + else: + raise ValueError( + "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." 
+ ) + # Disable caching if gradient checkpointing is enabled (not supported) + model_init_kwargs["use_cache"] = ( + False if args.gradient_checkpointing else model_init_kwargs.get("use_cache") + ) + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + raise ValueError( + "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. " + "This argument can only be used when the `model` argument is a string." + ) + + if peft_config is not None: + model = get_peft_model(model, peft_config) + + # Reference model + if is_deepspeed_zero3_enabled(): + self.ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs) + elif not is_peft_model(model): + # If PEFT configuration is not provided, create a reference model based on the initial model. + self.ref_model = create_reference_model(model) + else: + # If PEFT is used, the reference model is not needed since the adapter can be disabled + # to revert to the initial model. 
+ self.ref_model = None + + # Processing class + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left") + + # Reward functions + if not isinstance(reward_funcs, list): + reward_funcs = [reward_funcs] + for i, reward_func in enumerate(reward_funcs): + if isinstance(reward_func, str): + reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( + reward_func, num_labels=1, **model_init_kwargs + ) + self.reward_funcs = reward_funcs + + # Reward weights + if args.reward_weights is not None: + if len(args.reward_weights) != len(reward_funcs): + raise ValueError( + f"Number of reward weights ({len(args.reward_weights)}) must match number of reward " + f"functions ({len(reward_funcs)})" + ) + self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32) + else: + self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) + + # Reward processing class + if reward_processing_classes is None: + reward_processing_classes = [None] * len(reward_funcs) + elif not isinstance(reward_processing_classes, list): + reward_processing_classes = [reward_processing_classes] + else: + if len(reward_processing_classes) != len(reward_funcs): + raise ValueError("The number of reward processing classes must match the number of reward functions.") + + for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): + if isinstance(reward_func, PreTrainedModel): + if reward_processing_class is None: + reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) + if reward_processing_class.pad_token_id is None: + reward_processing_class.pad_token = reward_processing_class.eos_token + # The reward model computes the reward for the latest non-padded token in the input sequence. + # So it's important to set the pad token ID to the padding token ID of the processing class. 
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id + reward_processing_classes[i] = reward_processing_class + self.reward_processing_classes = reward_processing_classes + + # Data collator + def data_collator(features): # No data collation is needed in GRPO + return features + + # Training arguments + self.max_prompt_length = args.max_prompt_length + self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper + self.num_generations = args.num_generations # = G in the GRPO paper + self.use_vllm = args.use_vllm + + self.beta = args.beta + + # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the + # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning: + # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To + # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True. + # This acts as a flag to indicate that the warning has already been issued. 
+ model.warnings_issued["estimate_tokens"] = True + + # Initialize the metrics + self._metrics = defaultdict(list) + self.log_completions = args.log_completions + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + callbacks=callbacks, + optimizers=optimizers, + ) + + # Check if the per_device_train/eval_batch_size * num processes can be divided by the number of generations + num_processes = self.accelerator.num_processes + global_batch_size = args.per_device_train_batch_size * num_processes + possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0] + if self.num_generations not in possible_values: + raise ValueError( + f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly " + f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train " + f"batch size, the valid values for the number of generations are: {possible_values}." + ) + if self.args.eval_strategy != "no": + global_batch_size = args.per_device_eval_batch_size * num_processes + possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0] + if self.num_generations not in possible_values: + raise ValueError( + f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly " + f"divisible by the number of generations per prompt ({self.num_generations}). Given the current " + f"eval batch size, the valid values for the number of generations are: {possible_values}." + ) + + # Ensure each process receives a unique seed to prevent duplicate completions when generating with + # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but + # it's safer to set it in all cases. 
+ set_seed(args.seed, device_specific=True) + + if self.use_vllm: + if not is_vllm_available(): + raise ImportError( + "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " + "`pip install vllm` to use it." + ) + + if self.accelerator.is_main_process: + vllm_device = self.args.vllm_device + if vllm_device == "auto": + if torch.cuda.device_count() == 1: + vllm_device = "cuda:0" # particular case when training with onyl 1 GPU: share it + else: + vllm_device = f"cuda:{self.accelerator.num_processes}" # take the next GPU idx + # Check that the requested device is available + if vllm_device.split(":")[0] == "cuda" and int(vllm_device.split(":")[1]) >= torch.cuda.device_count(): + raise ValueError( + f"The requested device for vllm ({vllm_device}) is not available. You are likely using vLLM " + "without restricting the number of GPUs for training. Set the `--num_processes` argument to a " + "value lower than the number of GPUs available on your machine—typically, reducing it by one " + f"is sufficient. In your case: `--num_processes {torch.cuda.device_count() - 1}`." + ) + # Check that the requested device is not also used for training + if vllm_device in {f"cuda:{idx}" for idx in range(self.accelerator.num_processes)}: + warnings.warn( + f"The requested device {vllm_device} is also being used for training. For higher throughput " + "and to avoid out-of-memory errors, it is recommended to use a dedicated device for vLLM. " + "If this is intentional, you may ignore this warning but should adjust " + "`vllm_gpu_memory_utilization` accordingly." + ) + # vLLM is not compatible with accelerate. So we need to patch it to make sure we can (1) place the vLLM + # model on the desired device (world_size_patch) and (2) avoid a test that is not designed for our + # setting (profiling_patch). 
+ world_size_patch = patch("torch.distributed.get_world_size", return_value=1) + profiling_patch = patch( + "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling", return_value=None + ) + with world_size_patch, profiling_patch: + self.llm = LLM( + model=model.name_or_path, + device=vllm_device, + gpu_memory_utilization=self.args.vllm_gpu_memory_utilization, + dtype=self.args.vllm_dtype, + # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can + # directly reuse the KV cache if it shares the same prefix with one of the existing queries. + # This is particularly useful here because we generate completions from the same prompts. + enable_prefix_caching=True, + max_model_len=self.args.vllm_max_model_len, + ) + self.sampling_params = SamplingParams( + temperature=args.temperature, + max_tokens=self.max_completion_length, + ) + + self._last_loaded_step = 0 # tag to avoid useless loading during grad accumulation + + # When using vLLM, the main process is responsible for loading the model weights. This can cause process + # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we + # synchronize all processes after vLLM has been fully initialized. + self.accelerator.wait_for_everyone() + else: + self.generation_config = GenerationConfig( + max_new_tokens=self.max_completion_length, + do_sample=True, + temperature=args.temperature, + pad_token_id=processing_class.pad_token_id, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
+ self.model_accepts_loss_kwargs = False + + # Add tags to the model + self.model.add_model_tags(self._tag_names) + + if self.ref_model is not None: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True) + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by the `training_step` method, hence the override. + if self._signature_columns is None: + self._signature_columns = ["prompt"] + + def _get_train_sampler(self) -> Sampler: + # Returns a sampler that ensures each prompt is repeated across multiple processes. This guarantees that + # identical prompts are distributed to different GPUs, allowing rewards to be computed and normalized correctly + # within each prompt group. Using the same seed across processes ensures consistent prompt assignment, + # preventing discrepancies in group formation. + return RepeatRandomSampler(self.train_dataset, self.num_generations, seed=self.args.seed) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + # Returns a sampler that ensures each prompt is repeated across multiple processes. This guarantees that + # identical prompts are distributed to different GPUs, allowing rewards to be computed and normalized correctly + # within each prompt group. 
Using the same seed across processes ensures consistent prompt assignment, + # preventing discrepancies in group formation. + return RepeatRandomSampler(eval_dataset, self.num_generations, seed=self.args.seed) + + # Get the per-token log probabilities for the completions for the model and the reference model + def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep): + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits + logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + + input_ids = input_ids[:, -logits_to_keep:] + # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves. + # See https://github.com/huggingface/trl/issues/2770 + logits = logits[:, -logits_to_keep:] + return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens + + def _move_model_to_vllm(self): + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + if is_compiled_module(unwrapped_model): + unwrapped_model = unwrapped_model._orig_mod + if is_peft_model(unwrapped_model): + unwrapped_model.merge_adapter() + state_dict = unwrapped_model.state_dict() + # Remove base_model and base_layer prefixes + state_dict = { + k.removeprefix("base_model.model.").replace(".base_layer", ""): v for k, v in state_dict.items() + } + # Remove values with adapter prefix (example: "_lora") + state_dict = {k: v for k, v in state_dict.items() if unwrapped_model.prefix not in k} + # When module to save, remove its prefix and discard the original module + state_dict = { + k.replace("modules_to_save.default.", ""): v + for k, v in state_dict.items() + if "original_module" not in k + } + else: + state_dict = unwrapped_model.state_dict() + 
if self.accelerator.is_main_process: + llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model + llm_model.load_weights(state_dict.items()) + # Unmerge the adapter to restore the model to its original state. + # This must be done after loading weights to ensure they correspond to the merged state. + if is_peft_model(unwrapped_model): + unwrapped_model.unmerge_adapter() + + def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device + prompts = [x["prompt"] for x in inputs] + prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs] + prompt_inputs = self.processing_class( + prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False + ) + prompt_inputs = super()._prepare_inputs(prompt_inputs) + prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"] + + if self.max_prompt_length is not None: + prompt_ids = prompt_ids[:, -self.max_prompt_length :] + prompt_mask = prompt_mask[:, -self.max_prompt_length :] + + # Generate completions using either vLLM or regular generation + if self.args.use_vllm: + # First, have main process load weights if needed + if self.state.global_step != self._last_loaded_step: + self._move_model_to_vllm() + self._last_loaded_step = self.state.global_step + + # Generate completions using vLLM: gather all prompts and use them in a single call in the main process + all_prompts_text = gather_object(prompts_text) + if self.accelerator.is_main_process: + outputs = self.llm.generate(all_prompts_text, sampling_params=self.sampling_params, use_tqdm=False) + completion_ids = [out.token_ids for completions in outputs for out in completions.outputs] + else: + completion_ids = [None] * len(all_prompts_text) + # Broadcast the completions from the main process to all processes, ensuring each process receives its + # corresponding slice. 
+ completion_ids = broadcast_object_list(completion_ids, from_process=0) + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + completion_ids = completion_ids[process_slice] + + # Pad the completions, and concatenate them with the prompts + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.processing_class.pad_token_id) + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + else: + print("about to generate!!") + # Regular generation path + with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model: + prompt_completion_ids = unwrapped_model.generate( + prompt_ids, attention_mask=prompt_mask, generation_config=self.generation_config + ) + + print('prompts_ids', prompt_ids, 'attention_mask', prompt_mask) + print('prompt_completion_ids', prompt_completion_ids) + print('prompt len', prompt_ids.size(1)) + + # Compute prompt length and extract completion ids + prompt_length = prompt_ids.size(1) + prompt_ids = prompt_completion_ids[:, :prompt_length] + completion_ids = prompt_completion_ids[:, prompt_length:] + + # Mask everything after the first EOS token + is_eos = completion_ids == self.processing_class.eos_token_id + eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device) + eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] + sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1) + completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() + + # Concatenate prompt_mask with completion_mask for logit computation + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B*G, P+C) + + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + with torch.inference_mode(): + if self.ref_model is not None: + 
ref_per_token_logps = self._get_per_token_logps( + self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep + ) + else: + with self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps = self._get_per_token_logps( + self.model, prompt_completion_ids, attention_mask, logits_to_keep + ) + + # Decode the generated completions + completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational(inputs[0]): + completions = [] + for prompt, completion in zip(prompts, completions_text): + bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" + completions.append([{"role": "assistant", "content": bootstrap + completion}]) + else: + completions = completions_text + + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + for i, (reward_func, reward_processing_class) in enumerate( + zip(self.reward_funcs, self.reward_processing_classes) + ): + if isinstance(reward_func, nn.Module): # Module instead of PretrainedModel for compat with compiled models + if is_conversational(inputs[0]): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + reward_inputs = reward_processing_class( + texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = super()._prepare_inputs(reward_inputs) + with torch.inference_mode(): + rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,) + else: + # Repeat all input columns (but "prompt" and "completion") to match the number of generations + keys = [key for key in inputs[0] if key not in ["prompt", "completion"]] + reward_kwargs = {key: [example[key] for example in inputs] for key in keys} + output_reward_func = reward_func(prompts=prompts, 
completions=completions, **reward_kwargs) + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the + # completions may be distributed across processes + rewards_per_func = gather(rewards_per_func) + + # Apply weights to each reward function's output and sum + rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).sum(dim=1) + + # Compute grouped-wise rewards + mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1) + std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1) + + # Normalize the rewards to compute the advantages + mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4) + + # Slice to keep only the local part of the data + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + advantages = advantages[process_slice] + + # Log the metrics + reward_per_func = rewards_per_func.mean(0) + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, nn.Module): # Module instead of PretrainedModel for compat with compiled models + reward_func_name = reward_func.config._name_or_path.split("/")[-1] + else: + reward_func_name = reward_func.__name__ + self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item()) + + self._metrics["reward"].append(rewards.mean().item()) + self._metrics["reward_std"].append(std_grouped_rewards.mean().item()) + + if ( + self.log_completions + and self.state.global_step % self.args.logging_steps == 0 + and "wandb" in self.args.report_to + ): + import pandas as pd + + # For logging + table = { + "step": 
[str(self.state.global_step)] * len(rewards), + "prompt": gather_object(prompts_text), + "completion": gather_object(completions_text), + "reward": rewards.tolist(), + } + df = pd.DataFrame(table) + + if wandb.run is not None and self.accelerator.is_main_process: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + return { + "prompt_ids": prompt_ids, + "prompt_mask": prompt_mask, + "completion_ids": completion_ids, + "completion_mask": completion_mask, + "ref_per_token_logps": ref_per_token_logps, + "advantages": advantages, + } + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + if return_outputs: + raise ValueError("The GRPOTrainer does not support returning outputs") + # Compute the per-token log probabilities for the model + + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) + + # Compute the KL divergence between the model and the reference model + ref_per_token_logps = inputs["ref_per_token_logps"] + per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + + # x - x.detach() allows for preserving gradients from x + advantages = inputs["advantages"] + per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) + per_token_loss = -(per_token_loss - self.beta * per_token_kl) + loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + + # Log the metrics + completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() + 
self._metrics["completion_length"].append(completion_length) + + mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + + return loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None): + inputs = self._prepare_inputs(inputs) + print("about to loss") + with torch.no_grad(): + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + loss = loss.mean().detach() + print("loss computed") + return loss, None, None + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + if next(iter(logs.keys())).startswith("eval_"): + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"): + super().log(logs, start_time) + else: # transformers<=4.46 + super().log(logs) + self._metrics.clear() + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + tags = tags or [] + if isinstance(tags, str): + tags = [tags] + + if hasattr(self.model.config, "unsloth_version"): + tags.append("unsloth") + + citation = textwrap.dedent( + """\ + @article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, + } + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GRPO", + trainer_citation=citation, + paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + paper_id="2402.03300", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) diff --git a/BioReason-main/bioreason/trainer/grpo_config.py b/BioReason-main/bioreason/trainer/grpo_config.py new file mode 100644 index 0000000000000000000000000000000000000000..eaf704018cc74354682ec4433d07e7d53cdfaf13 --- /dev/null +++ b/BioReason-main/bioreason/trainer/grpo_config.py @@ -0,0 +1,365 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional, Union + +from transformers import TrainingArguments + + +@dataclass +class DNALLMGRPOConfig(TrainingArguments): + r""" + Configuration class for the [`GRPOTrainer`]. + + Only the parameters specific to GRPO training are listed here. For details on other parameters, refer to the + [`~transformers.TrainingArguments`] documentation. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`GRPOTrainer`] is provided as a string. + + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `8`): + Number of generations per prompt to sample. 
The global batch size (num_processes * per_device_batch_size) + must be divisible by this value. + max_completion_length (`int` or `None`, *optional*, defaults to `800`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + + > Parameters that control generation + + temperature (`float`, defaults to `0.6`): + Temperature for sampling. The higher the temperature, the more random the completions. + top_p (`float`, *optional*, defaults to `0.95`): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to + `1.0` to consider all tokens. + top_k (`int` or `None`, *optional*, defaults to `20`): + Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is + disabled. + min_p (`float` or `None`, *optional*, defaults to `None`): + Minimum token probability, which will be scaled by the probability of the most likely token. It must be a + value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range. + repetition_penalty (`float`, *optional*, defaults to `1.0`): + Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. + Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat + tokens. + cache_implementation (`str` or `None`, *optional*, defaults to `None`): + Implementation of the cache method for faster generation when use_vllm is set to False.
+ + > Parameters that control generation acceleration powered by vLLM + + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept unused for + training, as vLLM will require one for generation. vLLM must be installed (`pip install vllm`). + vllm_device (`str`, *optional*, defaults to `"auto"`): + Device where vLLM generation will run, e.g. `"cuda:1"`. If set to `"auto"` (default), the system will + automatically select the next available GPU after the last one used for training. This assumes that + training has not already occupied all available GPUs. If only one device is available, the device will be + shared between both training and vLLM. + vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`): + Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the + device dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus + improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors + during initialization. + vllm_dtype (`str`, *optional*, defaults to `"auto"`): + Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined + based on the model configuration. Find the supported values in the vLLM documentation. + vllm_max_model_len (`int` or `None`, *optional*, defaults to `None`): + If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced + `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model + context size, which might be much larger than the KV cache, leading to inefficiencies. + vllm_enable_prefix_caching (`bool`, *optional*, defaults to `True`): + Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and the hardware + support this feature. 
+ vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`): + Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled. + + > Parameters that control the training + + learning_rate (`float`, *optional*, defaults to `1e-6`): + Initial learning rate for [`AdamW`] optimizer. The default value replaces that of + [`~transformers.TrainingArguments`]. + beta (`float`, *optional*, defaults to `0.04`): + KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training + speed, but may be numerically unstable for long training runs. + num_iterations (`int`, *optional*, defaults to `1`): + Number of iterations per batch (denoted as μ in the algorithm). + epsilon (`float`, *optional*, defaults to `0.2`): + Epsilon value for clipping. + epsilon_high (`float` or `None`, *optional*, defaults to `None`): + Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound + specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. + reward_weights (`list[float]` or `None`, *optional*, defaults to `None`): + Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are + weighted equally with weight `1.0`. + sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`.
To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + + > Parameters that control the logging + + log_completions (`bool`, *optional*, defaults to `False`): + Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is + installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. + """ + + # Parameters that control the model and reference model + model_init_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` " + "argument of the `GRPOTrainer` is provided as a string." + }, + ) + + # Parameters that control the data preprocessing + # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on + # additional columns to compute the reward + remove_unused_columns: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to only keep the column 'prompt' in the dataset. If you use a custom reward function " + "that requires any column other than 'prompts' and 'completions', you should keep this to `False`." + }, + ) + max_prompt_length: Optional[int] = field( + default=512, + metadata={ + "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left." + }, + ) + num_generations: Optional[int] = field( + default=8, + metadata={ + "help": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) " + "must be divisible by this value." 
+ }, + ) + max_completion_length: Optional[int] = field( + default=800, + metadata={"help": "Maximum length of the generated completion."}, + ) + ds3_gather_for_generation: bool = field( + default=True, + metadata={ + "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for " + "generation, improving generation speed. However, disabling this option allows training models that " + "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option " + "is not compatible with vLLM generation." + }, + ) + + # Parameters that control generation + temperature: float = field( + default=0.6, + metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."}, + ) + top_p: float = field( + default=0.95, + metadata={ + "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. " + "Set to 1.0 to consider all tokens." + }, + ) + top_k: Optional[int] = field( + default=20, + metadata={ + "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, " + "top-k-filtering is disabled." + }, + ) + min_p: Optional[float] = field( + default=None, + metadata={ + "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It " + "must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range." + }, + ) + repetition_penalty: float = field( + default=1.0, + metadata={ + "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated " + "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model " + "to repeat tokens." 
+ }, + ) + cache_implementation: Optional[str] = field( + default=None, + metadata={"help": "Implementation of the cache method for faster generation when use_vllm is set to False."}, + ) + + # Parameters that control generation acceleration powered by vLLM + use_vllm: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept " + "unused for training, as vLLM will require one for generation. vLLM must be installed " + "(`pip install vllm`)." + }, + ) + vllm_device: Optional[str] = field( + default="auto", + metadata={ + "help": "Device where vLLM generation will run, e.g. 'cuda:1'. If set to 'auto' (default), the system " + "will automatically select the next available GPU after the last one used for training. This assumes " + "that training has not already occupied all available GPUs." + }, + ) + vllm_gpu_memory_utilization: float = field( + default=0.9, + metadata={ + "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV " + "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache " + "size and thus improve the model's throughput. However, if the value is too high, it may cause " + "out-of-memory (OOM) errors during initialization." + }, + ) + vllm_dtype: Optional[str] = field( + default="auto", + metadata={ + "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically " + "determined based on the model configuration. Find the supported values in the vLLM documentation." + }, + ) + vllm_max_model_len: Optional[int] = field( + default=None, + metadata={ + "help": "If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced " + "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. 
If not set, vLLM will use the model " + "context size, which might be much larger than the KV cache, leading to inefficiencies." + }, + ) + vllm_enable_prefix_caching: Optional[bool] = field( + default=True, + metadata={ + "help": "Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and " + "the hardware support this feature." + }, + ) + vllm_guided_decoding_regex: Optional[str] = field( + default=None, + metadata={"help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."}, + ) + + # Parameters that control the training + learning_rate: float = field( + default=1e-6, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`transformers.TrainingArguments`." + }, + ) + beta: float = field( + default=0.04, + metadata={ + "help": "KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving " + "training speed, but may be numerically unstable for long training runs." + }, + ) + num_iterations: int = field( + default=1, + metadata={"help": "Number of iterations per batch (denoted as μ in the algorithm)."}, + ) + epsilon: float = field( + default=0.2, + metadata={"help": "Epsilon value for clipping."}, + ) + epsilon_high: Optional[float] = field( + default=None, + metadata={ + "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the " + "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`." + }, + ) + reward_weights: Optional[list[float]] = field( + default=None, + metadata={ + "help": "Weights for each reward function. Must match the number of reward functions. If `None`, all " + "rewards are weighted equally with weight `1.0`." 
+ }, + ) + sync_ref_model: bool = field( + default=False, + metadata={ + "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` " + "steps, using the `ref_model_mixup_alpha` parameter." + }, + ) + ref_model_mixup_alpha: float = field( + default=0.6, + metadata={ + "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the " + "previous reference policy during updates. The reference policy is updated according to the equation: " + "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`." + }, + ) + ref_model_sync_steps: int = field( + default=512, + metadata={ + "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is " + "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`." + }, + ) + + # Parameters that control the logging + log_completions: bool = field( + default=True, + metadata={ + "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is " + "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`." + }, + ) + + report_to: Union[None, str, list[str]] = field( + default="wandb", metadata={"help": "The list of integrations to report the results and logs to."} + ) + + logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) + logging_steps: float = field( + default=2, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`. " + "If smaller than 1, will be interpreted as ratio of total training steps." 
+ ) + }, + ) \ No newline at end of file diff --git a/BioReason-main/bioreason/trainer/grpo_trainer.py b/BioReason-main/bioreason/trainer/grpo_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b62b71f766fb26bce63b1a331056144c14b1de13 --- /dev/null +++ b/BioReason-main/bioreason/trainer/grpo_trainer.py @@ -0,0 +1,905 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import textwrap +import pandas as pd +from collections import defaultdict +from typing import Any, Callable, Optional, Union, Sized + +import torch +import torch.utils.data +import transformers +from datasets import Dataset, IterableDataset +from packaging import version +from transformers import ( + AriaForConditionalGeneration, + AriaProcessor, + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoProcessor, + AutoTokenizer, + GenerationConfig, + PreTrainedModel, + PreTrainedTokenizerBase, + Qwen2VLForConditionalGeneration, + Qwen2_5_VLForConditionalGeneration, + Trainer, + TrainerCallback, + is_wandb_available, +) +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.utils import is_peft_available + +from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template +from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation +from trl.trainer.grpo_config import GRPOConfig 
+from trl.trainer.utils import generate_model_card, get_comet_experiment_url +# from trl import GRPOTrainer + +from accelerate.utils import is_peft_model, set_seed, gather_object +import PIL.Image + +import copy +from torch.utils.data import Sampler +import warnings + +if is_peft_available(): + from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training + +if is_wandb_available(): + import wandb + +from bioreason.dna_modules.dna_module import DNABaseModule +from bioreason.trainer import DNALLMGRPOConfig +# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of +# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model. +RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]] + + +class RepeatRandomSampler(Sampler): + """ + Sampler that repeats the indices of a dataset in a structured manner. + + Args: + data_source (`Sized`): + Dataset to sample from. + mini_repeat_count (`int`): + Number of times to repeat each index per batch. + batch_size (`int`, *optional*, defaults to `1`): + Number of unique indices per batch. + repeat_count (`int`, *optional*, defaults to `1`): + Number of times to repeat the full sampling process. + seed (`int` or `None`, *optional*, defaults to `None`): + Random seed for reproducibility. 
+ """ + + def __init__( + self, + data_source: Sized, + mini_repeat_count: int, + batch_size: int = 1, + repeat_count: int = 1, + seed: Optional[int] = None, + ): + self.data_source = data_source + self.mini_repeat_count = mini_repeat_count + self.batch_size = batch_size + self.repeat_count = repeat_count + self.num_samples = len(data_source) + self.seed = seed + self.generator = torch.Generator() + if seed is not None: + self.generator.manual_seed(seed) + + def __iter__(self): + indexes = torch.randperm(self.num_samples, generator=self.generator).tolist() + indexes = [indexes[i : i + self.batch_size] for i in range(0, len(indexes), self.batch_size)] + indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size] + + for chunk in indexes: + for _ in range(self.repeat_count): + for index in chunk: + for _ in range(self.mini_repeat_count): + yield index + + def __len__(self) -> int: + return self.num_samples * self.mini_repeat_count * self.repeat_count + + +class DNALLMGRPOTrainer(Trainer): + """ + Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the + paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + + Example: + + ```python + from datasets import load_dataset + from trl import GRPOTrainer + + dataset = load_dataset("trl-lib/tldr", split="train") + + trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs="weqweasdas/RM-Gemma-2B", + train_dataset=dataset, + ) + + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or + a path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. 
The model is + loaded using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keywork arguments + in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. Can be either: + + - A single reward function, such as: + - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the + keyword arguments in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported. + - A custom reward function: The function is provided with the prompts and the generated completions, + plus any additional columns in the dataset. It should return a list of rewards. For more details, see + [Using a custom reward function](#using-a-custom-reward-function). + - A list of reward functions, where each item can independently be any of the above types. Mixing different + types within the list (e.g., a string model ID and a custom reward function) is allowed. + args ([`GRPOConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is + ignored. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. 
+ - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. + processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Processing class used to process the data. The padding side must be set to "left". If `None`, the + processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. + reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`): + Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either: + + - A single processing class: Used when `reward_funcs` contains only one reward function. + - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`. + If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is + `None`, the tokenizer for the model is automatically loaded using [`~transformers.AutoTokenizer.from_pretrained`]. + For elements in `reward_funcs` that are custom reward functions (not [`~transformers.PreTrainedModel`]), + the corresponding entries in `reward_processing_classes` are ignored. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks + detailed in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. 
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + """ + + def __init__( + self, + model: Union[str, PreTrainedModel], + reward_funcs: Union[RewardFunc, list[RewardFunc]], + args: DNALLMGRPOConfig = None, + dna_module: DNABaseModule = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[PreTrainedTokenizerBase] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + peft_config: Optional["PeftConfig"] = None, + freeze_dna_modules: Optional[bool] = False, + attn_implementation: str = "flash_attention_2", + torch_dtype: str = "bfloat16", + **kwargs, + ): + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = GRPOConfig(f"{model_name}-GRPO") + + self.dna_module = dna_module + + # Models + # Trained model + model_init_kwargs = args.model_init_kwargs or {} + # FIXME + # Remember to modify it in the invernvl + model_init_kwargs["attn_implementation"] = attn_implementation + if model_init_kwargs.get("torch_dtype") is None: + model_init_kwargs["torch_dtype"] = torch_dtype + + assert not isinstance(model, str), "model must NOT be a string in the current implementation" + + 
torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + else: + raise ValueError( + "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." + ) + # Disable caching if gradient checkpointing is enabled (not supported) + model_init_kwargs["use_cache"] = ( + False if args.gradient_checkpointing else model_init_kwargs.get("use_cache") + ) + + # LoRA + self.dna_modules_keywords = self.dna_module.get_dnallm_modules_keywords() + if peft_config is not None: + print("Applying LoRA...") + def find_all_linear_names(model, multimodal_keywords): + cls = torch.nn.Linear + lora_module_names = set() + for name, module in model.named_modules(): + print('name:', name, 'module:', module) + # LoRA is not applied to the DNA modules + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + lora_module_names.add(name) + for m in lora_module_names: # needed for 16-bit + if "embed_tokens" in m: + lora_module_names.remove(m) + return list(lora_module_names) + target_modules = find_all_linear_names(model, self.dna_modules_keywords) + peft_config.target_modules = target_modules + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, peft_config) + + # Freeze DNA modules + if freeze_dna_modules: + print("Freezing DNA modules...") + for p in model.dna_model.parameters(): + p.requires_grad = False + + # Make projection layer trainable + for p in model.dna_projection.parameters(): + p.required_grad = True + + # Compute the number of trainable parameters and print the parameter that is trainable + trainable_params = [p for p in model.parameters() if 
p.requires_grad] + total_params = sum(p.numel() for p in trainable_params) + # for n, p in model.named_parameters(): + # if p.requires_grad: + # print(n, p.shape) + print(f"Total trainable parameters: {total_params}") + + # Enable gradient checkpointing if requested + if args.gradient_checkpointing: + model = self._enable_gradient_checkpointing(model, args) + + # Reference model + self.beta = args.beta + if self.beta == 0.0: + # If beta is 0.0, the reference model is not needed + self.ref_model = None + elif is_deepspeed_zero3_enabled(): + self.ref_model = model_cls.from_pretrained(model_id, **model_init_kwargs) + elif is_peft_model(model): + # If PEFT is used, the reference model is not needed since the adapter can be disabled + # to revert to the initial model. + self.ref_model = None + else: + # If PEFT configuration is not provided, create a reference model based on the initial model. + self.ref_model = create_reference_model(model) + + # Processing class + if processing_class is None: + processing_cls = self.dna_module.get_processing_class() + + #if isinstance(model.text_model) + processing_class = processing_cls(tokenizer=model.text_tokenizer, dna_tokenizer=model.dna_tokenizer) + # print(model.tokenizer.chat_template) + for component, processing_keyword in self.dna_module.get_custom_processing_keywords(): + if processing_keyword in kwargs: + # If we cannot find component in processing_class, return the processing_class itself + processing_component = getattr(processing_class, component, processing_class) + setattr(processing_component, processing_keyword, kwargs[processing_keyword]) + if getattr(processing_class, "tokenizer", None) is not None: + pad_token_id = processing_class.tokenizer.pad_token_id + processing_class.pad_token_id = pad_token_id + processing_class.eos_token_id = processing_class.tokenizer.eos_token_id + else: + assert isinstance(processing_class, PreTrainedTokenizerBase), "processing_class must be an instance of PreTrainedTokenizerBase if it 
has no tokenizer attribute" + pad_token_id = processing_class.pad_token_id + + self.dna_module.post_model_init(model, processing_class) + self.dna_module.post_model_init(self.ref_model, processing_class) + + # Reward functions + if not isinstance(reward_funcs, list): + reward_funcs = [reward_funcs] + for i, reward_func in enumerate(reward_funcs): + if isinstance(reward_func, str): + reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( + reward_func, num_labels=1, **model_init_kwargs + ) + self.reward_funcs = reward_funcs + + # Reward processing class + if reward_processing_classes is None: + reward_processing_classes = [None] * len(reward_funcs) + elif not isinstance(reward_processing_classes, list): + reward_processing_classes = [reward_processing_classes] + else: + if len(reward_processing_classes) != len(reward_funcs): + raise ValueError("The number of reward processing classes must match the number of reward functions.") + + for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): + if isinstance(reward_func, PreTrainedModel): + if reward_processing_class is None: + reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) + if reward_processing_class.pad_token_id is None: + reward_processing_class.pad_token = reward_processing_class.eos_token + # The reward model computes the reward for the latest non-padded token in the input sequence. + # So it's important to set the pad token ID to the padding token ID of the processing class. 
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id + reward_processing_classes[i] = reward_processing_class + self.reward_processing_classes = reward_processing_classes + + # Data collator + def data_collator(features): # No data collation is needed in GRPO + return features + + # Training arguments + self.max_prompt_length = args.max_prompt_length + self.max_prompt_length = None + if args.max_prompt_length is not None: + warnings.warn("Setting max_prompt_length is currently not supported, it has been set to None") + + self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper + self.num_generations = args.num_generations # = G in the GRPO paper + self.generation_config = GenerationConfig( + max_new_tokens=self.max_completion_length, + do_sample=True, + temperature=0.6, + top_p=0.95, + top_k=20, + pad_token_id=pad_token_id, + ) + if hasattr(self.dna_module, "get_eos_token_id"): # For InternVL + self.generation_config.eos_token_id = self.dna_module.get_eos_token_id(processing_class) + self.beta = args.beta + self.epsilon_low = args.epsilon + self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon + + # Multi-step + self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper + # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle + self._step = 0 + # Buffer the batch to reuse generated outputs across multiple updates + self._buffered_inputs = [None] * args.gradient_accumulation_steps + + # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the + # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning: + # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." 
To + # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True. + # This acts as a flag to indicate that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Initialize the metrics + self._metrics = defaultdict(list) + self.log_completions = args.log_completions + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + callbacks=callbacks, + optimizers=optimizers, + ) + + # Check if the per_device_train/eval_batch_size * num processes can be divided by the number of generations + num_processes = self.accelerator.num_processes + global_batch_size = args.per_device_train_batch_size * num_processes + possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0] + if self.num_generations not in possible_values: + raise ValueError( + f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly " + f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train " + f"batch size, the valid values for the number of generations are: {possible_values}." + ) + if self.args.eval_strategy != "no": + global_batch_size = args.per_device_eval_batch_size * num_processes + possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0] + if self.num_generations not in possible_values: + raise ValueError( + f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly " + f"divisible by the number of generations per prompt ({self.num_generations}). Given the current " + f"eval batch size, the valid values for the number of generations are: {possible_values}." 
+ ) + + # Ensure each process receives a unique seed to prevent duplicate completions when generating with + # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but + # it's safer to set it in all cases. + set_seed(args.seed, device_specific=True) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + if self.ref_model is not None: + # if self.is_deepspeed_enabled: + if is_deepspeed_zero3_enabled(): + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True) + + def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: GRPOConfig) -> PreTrainedModel: + """Enables gradient checkpointing for the model.""" + # Ensure use_cache is disabled + model.config.use_cache = False + + # Enable gradient checkpointing on the base model for PEFT + if is_peft_model(model): + model.base_model.gradient_checkpointing_enable() + # Enable gradient checkpointing for non-PEFT models + else: + if getattr(model, "language_model", None) is not None: + # For InternVL; these operations are copied from the original training script of InternVL + model.language_model.config.use_cache = False + model.dna_model.gradient_checkpointing = True + model.dna_model.encoder.gradient_checkpointing = True + model.language_model._set_gradient_checkpointing() + # This line is necessary, otherwise the `model.gradient_checkpointing_enable()` will be executed during the training process, 
leading to an error since InternVL does not support this operation. + args.gradient_checkpointing = False + else: + model.gradient_checkpointing_enable() + + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {} + use_reentrant = ( + "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"] + ) + + if use_reentrant: + model.enable_input_require_grads() + + return model + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by the `training_step` method, hence the override. + if self._signature_columns is None: + self._signature_columns = ["prompt"] + + + # Get the per-token log probabilities for the completions for the model and the reference model + def _get_per_token_logps(self, model, input_ids, attention_mask, **custom_multimodal_inputs): + logits = model(input_ids=input_ids, attention_mask=attention_mask, **custom_multimodal_inputs).logits # (B, L, V) + logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + input_ids = input_ids[:, 1:] # (B, L-1), exclude the first input ID since we don't have logits for it + # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak. 
+ per_token_logps = [] + for logits_row, input_ids_row in zip(logits, input_ids): + log_probs = logits_row.log_softmax(dim=-1) + token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1) + per_token_logps.append(token_log_prob) + return torch.stack(per_token_logps) + + + def _prepare_inputs(self, inputs): + # Simple pass-through, just like original + return inputs + + def _get_key_from_inputs(self, x, key): + ele = x.get(key, None) + assert ele is not None, f"The key {key} is not found in the input" + if isinstance(ele, list): + return [e for e in ele] + else: + return [ele] + + def _generate_and_score_completions(self, inputs: dict[str, Union[torch.Tensor, Any]], model) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device + prompts = [x["prompt"] for x in inputs] + prompts_text = self.dna_module.prepare_prompt(self.processing_class, inputs) + # Handle both pre-loaded images and image paths + batch_dna_sequences = [] + print("_generate_and_score_completions (GRPO):") + for x in inputs: + #print('---') + #print(x) + if 'dna_sequences' in x: + dnas = self._get_key_from_inputs(x, "dna_sequences") + + for dna in dnas: + # clean if desired + pass + batch_dna_sequences.append(dnas) + # NOTE: typically appends dna, so dna_sequences is all the dna in one list + # odd. 
trying this instead + + + prompt_inputs = self.dna_module.prepare_model_inputs( + self.processing_class, + model, + prompts_text, + batch_dna_sequences, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + ) + + prompt_inputs = super()._prepare_inputs(prompt_inputs) + prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"] + + # max_prompt_length is not supported yet + # if self.max_prompt_length is not None: + # prompt_ids = prompt_ids[:, -self.max_prompt_length :] + # prompt_inputs["input_ids"] = prompt_ids + # prompt_mask = prompt_mask[:, -self.max_prompt_length :] + # prompt_inputs["attention_mask"] = prompt_mask + + # Generate completions + start = time.time() + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model: + kwargs = {k: v for k, v in prompt_inputs.items() if k not in self.dna_module.get_non_generate_params()} + generate_returned_result = unwrapped_model.generate( + **kwargs, + generation_config=self.generation_config + ) + end = time.time() + print(f"Generation time: {end - start:.9f} seconds") + prompt_length = prompt_ids.size(1) + if not self.dna_module.is_embeds_input(): + prompt_completion_ids = generate_returned_result + prompt_ids = prompt_completion_ids[:, :prompt_length] + completion_ids = prompt_completion_ids[:, prompt_length:] + else: + # In this case, the input of the LLM backbone is the embedding of the combination of the image and text prompt + # So the returned result of the `generate` method only contains the completion ids + completion_ids = generate_returned_result + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + + # Mask everything after the first EOS token + # print('completion:', completion_ids) + # print('generate_returned_result', generate_returned_result, generate_returned_result.shape) + # print('prompt_inputs["input_ids"]', prompt_inputs["input_ids"], prompt_inputs["input_ids"].shape) + # print('prompt_ids', 
prompt_ids, prompt_ids.shape) + # print('prompt_length', prompt_length) + # print('prompt_completion_ids', prompt_completion_ids, prompt_completion_ids.shape) + is_eos = completion_ids == self.processing_class.eos_token_id + eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device) + eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] + sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1) + completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() + + # Concatenate prompt_mask with completion_mask for logit computation + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C) + + # Get the multimodal inputs + multimodal_keywords = self.dna_module.get_custom_multimodal_keywords() + multimodal_inputs = {k: prompt_inputs[k] if k in prompt_inputs else None for k in multimodal_keywords} + with torch.no_grad(): + # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its + # computation here, and use per_token_logps.detach() instead. 
+ if self.num_iterations > 1: + old_per_token_logps = self._get_per_token_logps( + model, prompt_completion_ids, attention_mask, **multimodal_inputs + ) + old_per_token_logps = old_per_token_logps[:, prompt_length - 1:] + else: + old_per_token_logps = None + + if self.beta == 0.0: + ref_per_token_logps = None + elif self.ref_model is not None: + ref_per_token_logps = self._get_per_token_logps( + self.ref_model, prompt_completion_ids, attention_mask, **multimodal_inputs + ) + else: + with self.accelerator.unwrap_model(model).disable_adapter(): + ref_per_token_logps = self._get_per_token_logps( + model, prompt_completion_ids, attention_mask, **multimodal_inputs + ) + if ref_per_token_logps is not None: + ref_per_token_logps = ref_per_token_logps[:, prompt_length - 1:] + + # Decode the generated completions + completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational(inputs[0]): + completions = [[{"role": "assistant", "content": completion}] for completion in completions_text] + else: + completions = completions_text + # Compute the rewards + # No need to duplicate prompts as we're not generating multiple completions per prompt + print("Reward calculation...") + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + for i, (reward_func, reward_processing_class) in enumerate( + zip(self.reward_funcs, self.reward_processing_classes) + ): + if isinstance(reward_func, PreTrainedModel): + if is_conversational(inputs[0]): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + reward_inputs = reward_processing_class( + texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = super()._prepare_inputs(reward_inputs) + with torch.inference_mode(): + rewards_per_func[:, 
i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,) + else: + # Repeat all input columns (but "prompt" and "completion") to match the number of generations + reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]} + for key in reward_kwargs: + for example in inputs: + # No need to duplicate prompts as we're not generating multiple completions per prompt + # reward_kwargs[key].extend([example[key]] * self.num_generations) + reward_kwargs[key].extend([example[key]]) + output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs) + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # Gather rewards across processes + rewards_per_func = self.accelerator.gather(rewards_per_func) + + # Sum the rewards from all reward functions + rewards = rewards_per_func.sum(dim=1) + + # Compute grouped-wise rewards + # Each group consists of num_generations completions for the same prompt + mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1) + std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1) + + # Normalize the rewards to compute the advantages + mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4) + + # Get only the local slice of advantages + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + advantages = advantages[process_slice] + + # Log the metrics + print("Logging metrics...") + completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() + self._metrics["completion_length"].append(completion_length) + + reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0) + for 
i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + reward_func_name = reward_func.config._name_or_path.split("/")[-1] + else: + reward_func_name = reward_func.__name__ + self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item()) + + self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item()) + + self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item()) + + print(self.log_completions, self.state.global_step, self.args.logging_steps, self.args.report_to) + if ( + self.log_completions + and self.state.global_step % self.args.logging_steps == 0 + and "wandb" in self.args.report_to + ): + timestamp = time.time() + + # Get the length of one of the other arrays + num_items = len(gather_object(prompts_text)) + + table = { + "step": [f"{self.state.global_step}_{timestamp}"] * num_items, # Repeat to match length + "prompt": gather_object(prompts_text), + "completion": gather_object(completions_text), + "reward": rewards.tolist(), + } + df = pd.DataFrame(table) + + if wandb.run is not None and self.accelerator.is_main_process: + wandb.log({f"completions_{self.state.global_step}_{timestamp}": wandb.Table(dataframe=df)}) + + return { + "prompt_ids": prompt_ids, + "prompt_mask": prompt_mask, + "completion_ids": completion_ids, + "completion_mask": completion_mask, + "old_per_token_logps": old_per_token_logps, + "ref_per_token_logps": ref_per_token_logps, + "advantages": advantages, + "multimodal_inputs": multimodal_inputs + } + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + if return_outputs: + raise ValueError("The GRPOTrainer does not support returning outputs") + + # Check if we need to generate new completions or use buffered ones + print("index 1") + if self.state.global_step % self.num_iterations == 0: + inputs = self._generate_and_score_completions(inputs, model) + 
self._buffered_inputs[self._step % self.args.gradient_accumulation_steps] = inputs + else: + inputs = self._buffered_inputs[self._step % self.args.gradient_accumulation_steps] + self._step += 1 + + print("index 2") + # Get the prepared inputs + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + multimodal_inputs = inputs["multimodal_inputs"] + + # Concatenate for full sequence + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + print("index 3") + # Get the current policy's log probabilities + + print("index 4") + per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, **multimodal_inputs) + # Get rid of the prompt (-1 because of the shift done in get_per_token_logps) + per_token_logps = per_token_logps[:, prompt_ids.size(1) - 1:] + + # Get the advantages from inputs + advantages = inputs["advantages"] + print("index 5") + # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its computation + # and use per_token_logps.detach() instead + old_per_token_logps = inputs["old_per_token_logps"] if self.num_iterations > 1 else per_token_logps.detach() + + # Compute the policy ratio and clipped version + coef_1 = torch.exp(per_token_logps - old_per_token_logps) + coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + print("index 6") + # Add KL penalty if beta > 0 + if self.beta > 0: + ref_per_token_logps = inputs["ref_per_token_logps"] + per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + per_token_loss = per_token_loss + self.beta * per_token_kl + + # Log KL divergence + mean_kl = 
((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + + # Compute final loss + print("Computing final loss...") + loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + + # Log clip ratio + is_clipped = (per_token_loss1 < per_token_loss2).float() + clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum() + self._metrics["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item()) + + return loss + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()} # average the metrics + logs = {**logs, **metrics} + if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"): + super().log(logs, start_time) + else: # transformers<=4.46 + super().log(logs) + self._metrics.clear() + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + tags = tags or [] + if isinstance(tags, str): + tags = [tags] + + if hasattr(self.model.config, "unsloth_version"): + tags.append("unsloth") + + citation = textwrap.dedent( + """\ + @article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GRPO", + trainer_citation=citation, + paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + paper_id="2402.03300", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) + + def _get_train_sampler(self) -> Sampler: + """Returns a sampler that ensures proper data sampling for GRPO training.""" + effective_batch_size = ( + self.args.per_device_train_batch_size + * self.accelerator.num_processes + * self.args.gradient_accumulation_steps + ) + + return RepeatRandomSampler( + data_source=self.train_dataset, + mini_repeat_count=self.num_generations, + batch_size=effective_batch_size // self.num_generations, + repeat_count=self.num_iterations, + seed=self.args.seed, + ) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + """Returns a sampler for evaluation.""" + return RepeatRandomSampler( + data_source=eval_dataset, + 
mini_repeat_count=self.num_generations, + seed=self.args.seed, + ) \ No newline at end of file diff --git a/BioReason-main/bioreason/utils/__init__.py b/BioReason-main/bioreason/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/BioReason-main/bioreason/utils/dna_utils.py b/BioReason-main/bioreason/utils/dna_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..af77089f6cfcc102f78670c68cf43e43fcb407af --- /dev/null +++ b/BioReason-main/bioreason/utils/dna_utils.py @@ -0,0 +1,12 @@ +from typing import TYPE_CHECKING, Callable, Optional, Union + +import numpy as np + +from transformers.utils import is_torch_available + +if is_torch_available(): + import torch + +DNAInput = Union[ + str, list[int], np.ndarray, "torch.Tensor", list[str], list[list[int]], list[np.ndarray], list["torch.Tensor"] +] # noqa \ No newline at end of file diff --git a/BioReason-main/data/BioReasoning_DataCuration_KEGG.ipynb b/BioReason-main/data/BioReasoning_DataCuration_KEGG.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e9972ce5daf65a4d4e604410217f69fa3c0175ff --- /dev/null +++ b/BioReason-main/data/BioReasoning_DataCuration_KEGG.ipynb @@ -0,0 +1,2575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Genetic Variant Analysis with KEGG Pathway Data\n", + "\n", + "This notebook demonstrates the process of analyzing genetic variants using KEGG pathway data and the Anthropic Claude API. The analysis creates structured reasoning paths explaining the biological mechanisms and disease relationships for genetic variants.\n", + "\n", + "## Overview\n", + "\n", + "The notebook includes functions to:\n", + "1. Load genetic variant data from TSV files\n", + "2. Process variants in batches using the Anthropic API\n", + "3. Generate detailed biological reasoning for each variant\n", + "4. 
Combine results into a comprehensive dataset\n", + "\n", + "## Requirements\n", + "\n", + "- Python 3.7+\n", + "- anthropic library\n", + "- tqdm for progress tracking\n", + "- Access to Anthropic Claude API\n", + "\n", + "## Data Format\n", + "\n", + "The input TSV file should contain columns for:\n", + "- Var_ID: Variant identifier\n", + "- ENTRY: Gene entry\n", + "- Chr: Chromosome\n", + "- Start: Position\n", + "- RefAllele: Reference allele\n", + "- AltAllele: Alternative allele\n", + "- Network Definition: Pathway information\n", + "- Gene: Gene information (JSON format)\n", + "- Disease: Associated disease (JSON format)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Installation\n", + "\n", + "Install required packages and set up the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nPQxSEejKYkk", + "outputId": "d444f6f8-90ca-4f0a-e872-082e04154c7b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: anthropic in /usr/local/lib/python3.11/dist-packages (0.50.0)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from anthropic) (4.9.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from anthropic) (1.9.0)\n", + "Requirement already satisfied: httpx<1,>=0.25.0 in /usr/local/lib/python3.11/dist-packages (from anthropic) (0.28.1)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from anthropic) (0.9.0)\n", + "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.11/dist-packages (from anthropic) (2.11.3)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from anthropic) (1.3.1)\n", + "Requirement already satisfied: typing-extensions<5,>=4.10 in 
/usr/local/lib/python3.11/dist-packages (from anthropic) (4.13.2)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.11/dist-packages (from anyio<5,>=3.5.0->anthropic) (3.10)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.25.0->anthropic) (2025.1.31)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.25.0->anthropic) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx<1,>=0.25.0->anthropic) (0.16.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->anthropic) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.33.1 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->anthropic) (2.33.1)\n", + "Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->anthropic) (0.4.0)\n" + ] + } + ], + "source": [ + "!pip install anthropic\n", + "\n", + "import os\n", + "import json\n", + "import time\n", + "import glob\n", + "import datetime\n", + "import re\n", + "from tqdm.notebook import tqdm\n", + "import anthropic\n", + "from anthropic.types.message_create_params import MessageCreateParamsNonStreaming\n", + "from anthropic.types.messages.batch_create_params import Request\n", + "\n", + "# Create directories\n", + "output_dir = \"processed_variants\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "# API key setup - replace with your preferred method\n", + "# Option 1: Set as environment variable (recommended for production)\n", + "api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "\n", + "# Option 2: For Google Colab, uncomment the following lines:\n", + "# from google.colab import userdata\n", + "# api_key = userdata.get('ANTHROPIC_API_KEY')\n", + "\n", + "# Option 3: Direct 
input (not recommended for production)\n", + "if not api_key:\n", + " api_key = input(\"Enter your Anthropic API key: \")\n", + "\n", + "# Create Anthropic client\n", + "client = anthropic.Anthropic(api_key=api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L8pHMeUrXfxW" + }, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import glob\n", + "import datetime\n", + "import re\n", + "from tqdm.notebook import tqdm\n", + "import anthropic\n", + "from anthropic.types.message_create_params import MessageCreateParamsNonStreaming\n", + "from anthropic.types.messages.batch_create_params import Request\n", + "\n", + "# Create directories\n", + "output_dir = \"processed_variants\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "# Get API key from Google Colab secrets\n", + "from google.colab import userdata\n", + "api_key = userdata.get('api_key')\n", + "if not api_key:\n", + " api_key = input(\"Enter your Anthropic API key: \")\n", + "\n", + "# Create Anthropic client\n", + "client = anthropic.Anthropic(api_key=api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading Functions\n", + "\n", + "Functions to load and process genetic variant data from TSV files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A5rPTgZWXf0z" + }, + "outputs": [], + "source": [ + "# Load the variant data\n", + "def load_variant_data(file_path):\n", + " \"\"\"Load variant data from a TSV file.\"\"\"\n", + " variants = []\n", + "\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " # Get header line\n", + " header = f.readline().strip().split('\\t')\n", + "\n", + " # Read each line and create a dictionary\n", + " for line in f:\n", + " values = line.strip().split('\\t')\n", + " variant = {header[i]: values[i] for i in range(len(header))}\n", + " variants.append(variant)\n", + "\n", + " return variants\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sdddg01zaQOo" + }, + "outputs": [], + "source": [ + "def create_variant_prompt(variant):\n", + " \"\"\"Create a prompt for analyzing a genetic variant.\"\"\"\n", + " prompt = f\"\"\"# Genetic Variant Analysis Prompt\n", + "\n", + " You are a genetics expert analyzing disease-causing mutations. For the following variant data, create a detailed reasoning path explaining the biological mechanism and disease relationship.\n", + "\n", + " ## Variant Data:\n", + " - Variant ID: {variant.get('Var_ID', 'Unknown')}\n", + " - Gene: {variant.get('ENTRY', 'Unknown')} ({\", \".join([f\"{k.split(';')[0]}\" for k in json.loads(variant.get('Gene', '{}')).values()])})\n", + " - Chromosome: {variant.get('Chr', 'Unknown')}\n", + " - Position: {variant.get('Start', 'Unknown')}\n", + " - Reference Allele: {variant.get('RefAllele', 'Unknown')}\n", + " - Alternative Allele: {variant.get('AltAllele', 'Unknown')}\n", + " - Network: {variant.get('Network Definition', 'Unknown')}\n", + " - Associated Disease: {list(json.loads(variant.get('Disease', '{}')).keys())[0] if variant.get('Disease') else 'Unknown'}\n", + "\n", + " ## Instructions\n", + " 1. 
Based on this variant data, provide a structured analysis in valid JSON format with the following components:\n", + " - Keep the complete raw_data object containing all original fields\n", + " - Generate one detailed question about the biological effect of this variant and what disease it might contribute to\n", + " - Provide a concise answer (2-3 sentences) summarizing the mechanism and disease relationship\n", + " - Develop a comprehensive reasoning path containing:\n", + " - The variant identifier\n", + " - The HGVS notation\n", + " - 8-12 sequential reasoning steps that trace the causal pathway from the genetic mutation to its cellular effects and disease manifestation\n", + " - Relevant labels for pathways, diseases, and genes\n", + "\n", + " ## Output Format\n", + " ```json\n", + " {{\n", + " \"raw_data\": {{\n", + " // Complete original data object with all fields\n", + " }},\n", + " \"question\": \"What is the biological effect of the [gene] mutation [id] ([ref]>[alt] at [position]) and what disease might it contribute to?\",\n", + " \"answer\": \"Concise 2-3 sentence answer summarizing mechanism and disease\",\n", + " \"reasoning\": {{\n", + " \"variant_id\": \"ID\",\n", + " \"hgvs\": \"Formal HGVS notation\",\n", + " \"reasoning_steps\": [\n", + " \"Step 1: Description of mutation at molecular level\",\n", + " \"Step 2: Effect on protein structure/function\",\n", + " \"Step 3: Effect on cellular pathway/process\",\n", + " // Additional steps showing causal chain\n", + " \"Final step: How this contributes to disease pathology\"\n", + " ],\n", + " \"labels\": {{\n", + " \"pathway\": [\"Pathway identifiers\"],\n", + " \"disease\": [\"Disease names\"],\n", + " \"gene\": [\"Gene names\"]\n", + " }}\n", + " }}\n", + " }}\n", + " Important notes:\n", + "\n", + " Ensure your response is VALID JSON without ANY explanatory text outside the JSON structure\n", + " Do not include markdown code blocks (```) in your response - just provide the raw JSON\n", + " Provide 
detailed, scientifically accurate reasoning steps that show the complete causal pathway\n", + " For HGVS notation, include both genomic (g.) and protein (p.) level changes\n", + "\n", + " Analyze this variant data and provide your complete analysis in valid JSON format:\n", + " \"\"\"\n", + " return prompt\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prompt Creation\n", + "\n", + "Function to create structured prompts for genetic variant analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1d8S4WHHaQSw" + }, + "outputs": [], + "source": [ + "def process_variants_in_batches(variants, batch_size=5, model=\"claude-3-7-sonnet-20250219\", max_tokens=6000):\n", + " \"\"\"Process variants in batches using the Anthropic SDK.\"\"\"\n", + " print(f\"Processing {len(variants)} variants in batches of {batch_size}\")\n", + " # Process in batches\n", + " for i in range(0, len(variants), batch_size):\n", + " batch_variants = variants[i:i+batch_size]\n", + " print(f\"Processing batch {i//batch_size + 1} with {len(batch_variants)} variants\")\n", + "\n", + " # Create batch requests\n", + " batch_requests = []\n", + " for variant in batch_variants:\n", + " # Create a custom_id (max 64 chars)\n", + " var_id = variant.get('Var_ID', 'variant')\n", + " gene = variant.get('ENTRY', '')\n", + " custom_id = f\"{var_id}_{gene}\"[:64]\n", + "\n", + " # Create the prompt\n", + " prompt = create_variant_prompt(variant)\n", + "\n", + " # Add to batch requests\n", + " batch_requests.append(\n", + " Request(\n", + " custom_id=custom_id,\n", + " params=MessageCreateParamsNonStreaming(\n", + " model=model,\n", + " max_tokens=max_tokens,\n", + " temperature=0.2, # Slightly higher temperature for reasoning variation\n", + " system=\"You are a genetics expert analyzing disease-causing mutations. Provide your analysis in VALID JSON format only, with no markdown formatting or explanatory text. 
Your JSON should contain raw_data, question, answer, and reasoning components.\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " )\n", + " )\n", + " )\n", + "\n", + " # Submit batch\n", + " print(f\"Submitting batch with {len(batch_requests)} requests...\")\n", + " batch = client.messages.batches.create(requests=batch_requests)\n", + " print(f\"Batch created with ID: {batch.id}\")\n", + " print(f\"Initial status: {batch.processing_status}\")\n", + "\n", + " # Poll for batch completion\n", + " polling_interval = 10 # seconds\n", + " while True:\n", + " # Get batch status\n", + " batch_status = client.messages.batches.retrieve(batch.id)\n", + "\n", + " # Print status\n", + " print(f\"Batch status: {batch_status.processing_status}\")\n", + " print(f\"Processing: {batch_status.request_counts.processing}, \"\n", + " f\"Succeeded: {batch_status.request_counts.succeeded}, \"\n", + " f\"Errored: {batch_status.request_counts.errored}\")\n", + "\n", + " # Exit loop if processing is complete\n", + " if batch_status.processing_status == \"ended\":\n", + " break\n", + "\n", + " # Wait before checking again\n", + " print(f\"Waiting {polling_interval} seconds...\")\n", + " time.sleep(polling_interval)\n", + "\n", + " # Process batch results\n", + " print(\"Processing batch results...\")\n", + " try:\n", + " for result in client.messages.batches.results(batch.id):\n", + " custom_id = result.custom_id\n", + "\n", + " # Extract variant ID from custom_id\n", + " variant_id = custom_id.split('_')[0]\n", + " output_file = os.path.join(output_dir, f\"{variant_id}_processed.json\")\n", + "\n", + " # Handle different result types\n", + " if result.result.type == \"succeeded\":\n", + " # Get the message content\n", + " message = result.result.message\n", + " content = message.content[0].text if message.content else \"\"\n", + "\n", + " # Extract and parse the JSON\n", + " try:\n", + " # Try direct parsing first\n", + " try:\n", + " parsed_json = 
json.loads(content)\n", + " except json.JSONDecodeError:\n", + " # Remove markdown code blocks if present\n", + " if \"```json\" in content or \"```\" in content:\n", + " content = re.sub(r'```json\\s*', '', content)\n", + " content = re.sub(r'```\\s*', '', content)\n", + "\n", + " # Extract just the JSON part\n", + " json_start = content.find('{')\n", + " json_end = content.rfind('}') + 1\n", + "\n", + " if json_start >= 0 and json_end > json_start:\n", + " json_text = content[json_start:json_end]\n", + " parsed_json = json.loads(json_text)\n", + "\n", + " # Save the parsed result\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump(parsed_json, f, indent=2)\n", + " print(f\"✓ Saved result for {variant_id}\")\n", + "\n", + " except Exception as e:\n", + " print(f\"✗ Error parsing result for {variant_id}: {e}\")\n", + " # Save the raw content\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump({\"error\": str(e), \"raw_content\": content}, f, indent=2)\n", + "\n", + " # Also save as text file for manual fixing\n", + " with open(f\"{output_file}_raw.txt\", 'w', encoding='utf-8') as f:\n", + " f.write(content)\n", + "\n", + " elif result.result.type == \"errored\":\n", + " error_message = \"Unknown error\"\n", + " if hasattr(result.result, 'error') and hasattr(result.result.error, 'message'):\n", + " error_message = result.result.error.message\n", + "\n", + " print(f\"✗ Error processing {variant_id}: {error_message}\")\n", + " # Save the error\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump({\"error\": error_message}, f, indent=2)\n", + "\n", + " except Exception as e:\n", + " print(f\"Error processing batch results: {str(e)}\")\n", + "\n", + " # Wait between batches\n", + " if i + batch_size < len(variants):\n", + " print(\"Waiting 5 seconds before next batch...\")\n", + " time.sleep(5)\n", + "\n", + " print(\"All batches processed!\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "## Batch Processing Functions\n", + "\n", + "Functions to process variants in batches using the Anthropic API." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qh3milYjUBhP" + }, + "source": [ + "## This is the version I used for the curation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rNwRB7XK9LJ9" + }, + "source": [ + "## Run for a Batch ##" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Zt0TkTir4gpg" + }, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import glob\n", + "import datetime\n", + "import re\n", + "from tqdm.notebook import tqdm\n", + "import anthropic\n", + "from anthropic.types.message_create_params import MessageCreateParamsNonStreaming\n", + "from anthropic.types.messages.batch_create_params import Request\n", + "\n", + "# Create directories\n", + "output_dir = \"processed_variants\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "# API key setup - multiple options for different environments\n", + "api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "\n", + "# For Google Colab users, uncomment these lines:\n", + "# from google.colab import userdata\n", + "# api_key = userdata.get('ANTHROPIC_API_KEY')\n", + "\n", + "if not api_key:\n", + " api_key = input(\"Enter your Anthropic API key: \")\n", + "\n", + "# Create Anthropic client\n", + "client = anthropic.Anthropic(api_key=api_key)\n", + "\n", + "# Function to load variant data\n", + "def load_variant_data(file_path):\n", + " \"\"\"Load variant data from a TSV file.\"\"\"\n", + " variants = []\n", + "\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " # Get header line\n", + " header = f.readline().strip().split('\\t')\n", + "\n", + " # Read each line and create a dictionary\n", + " for line in f:\n", + " values = line.strip().split('\\t')\n", + " if len(values) == len(header):\n", + " variant = {header[i]: values[i] 
for i in range(len(header))}\n", + " variants.append(variant)\n", + " else:\n", + " print(f\"Skipping malformed line: {line[:50]}...\")\n", + "\n", + " return variants\n", + "\n", + "# Function to create the prompt\n", + "def create_variant_prompt(variant):\n", + " \"\"\"Create a prompt for analyzing a genetic variant.\"\"\"\n", + " # Parse Gene field\n", + " gene_info = {}\n", + " gene_names = \"Unknown\"\n", + " try:\n", + " # First try to parse as JSON\n", + " if variant.get('Gene') and variant.get('Gene').startswith('{'):\n", + " gene_info = json.loads(variant.get('Gene', '{}'))\n", + " gene_names = \", \".join([g.split(';')[0] for g in gene_info.values()]) if gene_info else \"Unknown\"\n", + " else:\n", + " # If not JSON, use as is\n", + " gene_names = variant.get('Gene', 'Unknown')\n", + " except:\n", + " gene_names = variant.get('Gene', 'Unknown')\n", + "\n", + " # Parse Disease field\n", + " disease_info = {}\n", + " disease_name = \"Unknown\"\n", + " try:\n", + " # First try to parse as JSON\n", + " if variant.get('Disease') and variant.get('Disease').startswith('{'):\n", + " disease_info = json.loads(variant.get('Disease', '{}'))\n", + " disease_name = list(disease_info.keys())[0] if disease_info else \"Unknown\"\n", + " else:\n", + " # If not JSON, use as is\n", + " disease_name = variant.get('Disease', 'Unknown')\n", + " except:\n", + " disease_name = variant.get('Disease', 'Unknown')\n", + "\n", + " prompt = f\"\"\"# Genetic Variant Analysis Prompt\n", + "\n", + " You are a genetics expert analyzing disease-causing mutations. 
For the following variant data, create a detailed reasoning path explaining the biological mechanism and disease relationship.\n", + "\n", + " ## Variant Data:\n", + " - Variant ID: {variant.get('Var_ID', 'Unknown')}\n", + " - Gene: {variant.get('ENTRY', 'Unknown')} ({gene_names})\n", + " - Chromosome: {variant.get('Chr', 'Unknown')}\n", + " - Position: {variant.get('Start', 'Unknown')}\n", + " - Reference Allele: {variant.get('RefAllele', 'Unknown')}\n", + " - Alternative Allele: {variant.get('AltAllele', 'Unknown')}\n", + " - Network: {variant.get('Network Definition', 'Unknown')}\n", + " - Associated Disease: {disease_name}\n", + "\n", + " ## Instructions\n", + " 1. Based on this variant data, provide a structured analysis in valid JSON format with the following components:\n", + " - Keep the complete raw_data object containing all original fields\n", + " - Generate one detailed question about the biological effect of this variant and what disease it might contribute to\n", + " - Provide a concise answer (2-3 sentences) summarizing the mechanism and disease relationship\n", + " - Develop a comprehensive reasoning path containing:\n", + " - The variant identifier\n", + " - The HGVS notation\n", + " - 8-12 sequential reasoning steps that trace the causal pathway from the genetic mutation to its cellular effects and disease manifestation\n", + " - Relevant labels for pathways, diseases, and genes\n", + "\n", + " ## Output Format\n", + " ```json\n", + " {{\n", + " \"raw_data\": {{\n", + " // Complete original data object with all fields\n", + " }},\n", + " \"question\": \"What is the biological effect of the [gene] mutation [id] ([ref]>[alt] at [position]) and what disease might it contribute to?\",\n", + " \"answer\": \"Concise 2-3 sentence answer summarizing mechanism and disease\",\n", + " \"reasoning\": {{\n", + " \"variant_id\": \"ID\",\n", + " \"hgvs\": \"Formal HGVS notation\",\n", + " \"reasoning_steps\": [\n", + " \"Step 1: Description of mutation at 
molecular level\",\n", + " \"Step 2: Effect on protein structure/function\",\n", + " \"Step 3: Effect on cellular pathway/process\",\n", + " // Additional steps showing causal chain\n", + " \"Final step: How this contributes to disease pathology\"\n", + " ],\n", + " \"labels\": {{\n", + " \"pathway\": [\"Pathway identifiers\"],\n", + " \"disease\": [\"Disease names\"],\n", + " \"gene\": [\"Gene names\"]\n", + " }}\n", + " }}\n", + " }}\n", + " Important notes:\n", + "\n", + " Ensure your response is VALID JSON without ANY explanatory text outside the JSON structure\n", + " Do not include markdown code blocks (```) in your response - just provide the raw JSON\n", + " Provide detailed, scientifically accurate reasoning steps that show the complete causal pathway\n", + " For HGVS notation, include both genomic (g.) and protein (p.) level changes\n", + "\n", + " Analyze this variant data and provide your complete analysis in valid JSON format:\n", + " \"\"\"\n", + " return prompt\n", + "\n", + "## Function to process variants in batches\n", + "def process_variants_in_batches(variants, batch_size=5, model=\"claude-3-7-sonnet-20250219\", max_tokens=6000):\n", + " \"\"\"Process variants in batches using the Anthropic SDK.\"\"\"\n", + " print(f\"Processing {len(variants)} variants in batches of {batch_size}\")\n", + "\n", + " # Process in batches\n", + " for i in range(0, len(variants), batch_size):\n", + " batch_variants = variants[i:i+batch_size]\n", + " print(f\"Processing batch {i//batch_size + 1} with {len(batch_variants)} variants\")\n", + "\n", + " # Create batch requests\n", + " batch_requests = []\n", + " for variant in batch_variants:\n", + " # Extract the Var_ID as the unique identifier\n", + " var_id = variant.get('Var_ID', f'variant_{i}_{len(batch_requests)}')\n", + "\n", + " # Create the prompt\n", + " prompt = create_variant_prompt(variant)\n", + "\n", + " # Add to batch requests\n", + " batch_requests.append(\n", + " Request(\n", + " custom_id=var_id, # Use 
Var_ID directly as the custom_id\n", + " params=MessageCreateParamsNonStreaming(\n", + " model=model,\n", + " max_tokens=max_tokens,\n", + " temperature=0.2,\n", + " system=\"You are a genetics expert analyzing disease-causing mutations. Provide your analysis in VALID JSON format only, with no markdown formatting or explanatory text. Your JSON should contain raw_data, question, answer, and reasoning components.\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " )\n", + " )\n", + " )\n", + "\n", + " # Submit batch\n", + " print(f\"Submitting batch with {len(batch_requests)} requests...\")\n", + " batch = client.messages.batches.create(requests=batch_requests)\n", + " print(f\"Batch created with ID: {batch.id}\")\n", + " print(f\"Initial status: {batch.processing_status}\")\n", + "\n", + " # Poll for batch completion\n", + " polling_interval = 10 # seconds\n", + " while True:\n", + " # Get batch status\n", + " batch_status = client.messages.batches.retrieve(batch.id)\n", + "\n", + " # Print status\n", + " print(f\"Batch status: {batch_status.processing_status}\")\n", + " print(f\"Processing: {batch_status.request_counts.processing}, \"\n", + " f\"Succeeded: {batch_status.request_counts.succeeded}, \"\n", + " f\"Errored: {batch_status.request_counts.errored}\")\n", + "\n", + " # Exit loop if processing is complete\n", + " if batch_status.processing_status == \"ended\":\n", + " break\n", + "\n", + " # Wait before checking again\n", + " print(f\"Waiting {polling_interval} seconds...\")\n", + " time.sleep(polling_interval)\n", + "\n", + " # Process batch results\n", + " print(\"Processing batch results...\")\n", + " try:\n", + " for result in client.messages.batches.results(batch.id):\n", + " # Get the variant ID from custom_id (which should be the Var_ID)\n", + " variant_id = result.custom_id\n", + " output_file = os.path.join(output_dir, f\"{variant_id}_processed.json\")\n", + "\n", + " # Handle different result types\n", + " if 
result.result.type == \"succeeded\":\n", + " # Get the message content\n", + " message = result.result.message\n", + " content = message.content[0].text if message.content else \"\"\n", + "\n", + " # Extract and parse the JSON\n", + " try:\n", + " # Try multiple approaches to extract and parse the JSON\n", + " json_text = None\n", + " parsed_json = None\n", + "\n", + " # Try direct parsing first\n", + " try:\n", + " parsed_json = json.loads(content)\n", + " print(f\"✓ Direct JSON parsing successful for {variant_id}\")\n", + " except json.JSONDecodeError:\n", + " # Try removing markdown code blocks if present\n", + " if \"```json\" in content or \"```\" in content:\n", + " cleaned_content = re.sub(r'```json\\s*', '', content)\n", + " cleaned_content = re.sub(r'```\\s*', '', cleaned_content)\n", + " try:\n", + " parsed_json = json.loads(cleaned_content)\n", + " print(f\"✓ JSON parsing after markdown removal successful for {variant_id}\")\n", + " except json.JSONDecodeError:\n", + " pass # Will try next method\n", + "\n", + " # Try extracting just the JSON part\n", + " if not parsed_json:\n", + " json_start = content.find('{')\n", + " json_end = content.rfind('}') + 1\n", + "\n", + " if json_start >= 0 and json_end > json_start:\n", + " json_text = content[json_start:json_end]\n", + " try:\n", + " parsed_json = json.loads(json_text)\n", + " print(f\"✓ JSON extraction and parsing successful for {variant_id}\")\n", + " except json.JSONDecodeError:\n", + " # Try fixing common JSON syntax issues\n", + " fixed_json = re.sub(r'\"\\s*\"', '\", \"', json_text)\n", + " fixed_json = re.sub(r'}\\s*{', '}, {', fixed_json)\n", + " fixed_json = re.sub(r']\\s*{', '], {', fixed_json)\n", + " fixed_json = re.sub(r'}\\s*\\[', '}, [', fixed_json)\n", + " fixed_json = re.sub(r']\\s*\\[', '], [', fixed_json)\n", + "\n", + " try:\n", + " parsed_json = json.loads(fixed_json)\n", + " print(f\"✓ JSON parsing after fixing syntax successful for {variant_id}\")\n", + " except 
json.JSONDecodeError as e:\n", + " print(f\"✗ All JSON parsing methods failed for {variant_id}: {e}\")\n", + "\n", + " # Save the parsed result or error\n", + " if parsed_json:\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump(parsed_json, f, indent=2)\n", + " print(f\"✓ Saved result for {variant_id}\")\n", + " else:\n", + " # Save the full raw response for manual fixing\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump({\n", + " \"error\": \"Invalid JSON in response\",\n", + " \"raw_response\": content\n", + " }, f, indent=2)\n", + " print(f\"✗ JSON parsing error for {variant_id}, saved full raw response\")\n", + "\n", + " # Also save raw content to a text file for easier manual fixing\n", + " with open(f\"{output_file}_raw.txt\", 'w', encoding='utf-8') as f:\n", + " f.write(content)\n", + "\n", + " except Exception as e:\n", + " print(f\"✗ Error processing result for {variant_id}: {e}\")\n", + " # Save the raw content\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump({\"error\": str(e), \"raw_content\": content}, f, indent=2)\n", + "\n", + " # Also save as text file for manual fixing\n", + " with open(f\"{output_file}_raw.txt\", 'w', encoding='utf-8') as f:\n", + " f.write(content)\n", + "\n", + " elif result.result.type == \"errored\":\n", + " error_message = \"Unknown error\"\n", + " if hasattr(result.result, 'error') and hasattr(result.result.error, 'message'):\n", + " error_message = result.result.error.message\n", + "\n", + " print(f\"✗ Error processing {variant_id}: {error_message}\")\n", + " # Save the error\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " json.dump({\"error\": error_message}, f, indent=2)\n", + "\n", + " except Exception as e:\n", + " print(f\"Error processing batch results: {str(e)}\")\n", + "\n", + " # Wait between batches\n", + " if i + batch_size < len(variants):\n", + " print(\"Waiting 5 seconds before next batch...\")\n", + " 
time.sleep(5)\n", + "\n", + " print(\"All batches processed!\")\n", + "\n", + "## Function to combine all results\n", + "def combine_all_results():\n", + " \"\"\"Combine all processed results into a single JSON file.\"\"\"\n", + " all_results = []\n", + " error_count = 0\n", + "\n", + " # List all JSON files in the output directory (excluding raw text files)\n", + " json_files = [f for f in glob.glob(os.path.join(output_dir, \"*.json\"))\n", + " if not f.endswith(\"_raw.txt\")]\n", + "\n", + " print(f\"Found {len(json_files)} JSON files to combine\")\n", + "\n", + " for file_path in json_files:\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " data = json.load(f)\n", + "\n", + " # Skip files with errors\n", + " if \"error\" in data:\n", + " error_count += 1\n", + " print(f\"Skipping file with error: {os.path.basename(file_path)}\")\n", + " continue\n", + "\n", + " all_results.append(data)\n", + " print(f\"Added {os.path.basename(file_path)} to combined results\")\n", + "\n", + " except Exception as e:\n", + " print(f\"Error loading {os.path.basename(file_path)}: {e}\")\n", + " error_count += 1\n", + "\n", + " print(f\"Successfully combined {len(all_results)} results. {error_count} files had errors.\")\n", + "\n", + " # Save the combined collection\n", + " with open(\"all_variant_analyses.json\", 'w', encoding='utf-8') as f:\n", + " json.dump(all_results, f, indent=2)\n", + "\n", + " print(\"Saved all results to 'all_variant_analyses.json'\")\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete Processing Pipeline\n", + "\n", + "This section contains the complete pipeline with all functions integrated for easier execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ATfwtjWB4gr-" + }, + "outputs": [], + "source": [ + "## Main function that you will call\n", + "def process_genetic_variants(file_path, num_variants=20, batch_size=5, model=\"claude-3-7-sonnet-20250219\"):\n", + " \"\"\"\n", + " Process genetic variants from a TSV file.\n", + " Parameters:\n", + " file_path (str): Path to the TSV file containing variant data\n", + " num_variants (int): Number of variants to process (default: 20)\n", + " batch_size (int): Number of variants to process in each batch (default: 5)\n", + " model (str): Claude model to use (default: claude-3-7-sonnet-20250219)\n", + " \"\"\"\n", + " print(f\"Genetic Variant Analysis Script\")\n", + " print(f\"===============================\")\n", + " print(f\"Processing {num_variants} variants in batches of {batch_size} using {model}\")\n", + "\n", + " # Load data\n", + " print(f\"Loading variant data from {file_path}...\")\n", + " all_variants = load_variant_data(file_path)\n", + " print(f\"Loaded {len(all_variants)} variants in total\")\n", + "\n", + " # Limit to specified number of variants\n", + " variants = all_variants[:num_variants]\n", + " print(f\"Limited to the first {len(variants)} variants for processing\")\n", + "\n", + " # Process variants\n", + " process_variants_in_batches(\n", + " variants,\n", + " batch_size=batch_size,\n", + " model=model\n", + " )\n", + "\n", + " # Combine results\n", + " print(\"Combining results...\")\n", + " combine_all_results()\n", + "\n", + " print(\"Processing complete!\")\n", + " return f\"Results saved to {output_dir} and combined in all_variant_analyses.json\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Main Processing Function\n", + "\n", + "Convenience function to run the complete analysis pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BdCGzxeAFVU_", + "outputId": "5a3b6fc9-330f-4e38-b576-c3b9f1e64048" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Genetic Variant Analysis Script\n", + "===============================\n", + "Processing 440 variants in batches of 20 using claude-3-7-sonnet-20250219\n", + "Loading variant data from final_network_with_variant.tsv...\n", + "Loaded 289 variants in total\n", + "Limited to the first 289 variants for processing\n", + "Processing 289 variants in batches of 20\n", + "Processing batch 1 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_013VgvncRWMwgGiuSD3ZU1Ug\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, 
Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1161\n", + "✓ Saved result for KEGG_1161\n", + "✓ Direct JSON parsing successful for KEGG_1162\n", + "✓ Saved result for KEGG_1162\n", + "✓ Direct JSON parsing successful for KEGG_1163\n", + "✓ Saved result for KEGG_1163\n", + "✓ Direct JSON parsing successful for KEGG_1164\n", + "✓ Saved result for KEGG_1164\n", + "✓ Direct JSON parsing successful for KEGG_1165\n", + "✓ Saved result for KEGG_1165\n", + "✓ Direct JSON parsing successful for KEGG_1166\n", + "✓ Saved result for KEGG_1166\n", + "✓ Direct JSON parsing successful for KEGG_1167\n", + "✓ Saved result for KEGG_1167\n", + "✓ Direct JSON parsing successful for KEGG_1168\n", + "✓ Saved result for KEGG_1168\n", + "✓ Direct JSON parsing successful for KEGG_1169\n", + "✓ Saved result for KEGG_1169\n", + "✓ Direct JSON parsing successful for KEGG_1170\n", + "✓ Saved result for KEGG_1170\n", + "✓ Direct JSON parsing successful for KEGG_1171\n", + "✓ Saved result for KEGG_1171\n", + "✓ Direct JSON parsing successful for KEGG_1172\n", + "✓ Saved result for KEGG_1172\n", + "✓ Direct JSON parsing successful for KEGG_1173\n", + "✓ Saved result for KEGG_1173\n", + "✓ Direct JSON parsing successful for KEGG_1174\n", + "✓ Saved result for KEGG_1174\n", + "✓ Direct JSON parsing successful for KEGG_1175\n", + "✓ Saved result for KEGG_1175\n", + "✗ All JSON parsing methods failed for KEGG_1176: Extra data: line 1 column 4259 (char 4258)\n", + "✗ JSON parsing error for KEGG_1176, saved full raw response\n", + "✓ Direct JSON parsing successful for KEGG_1177\n", + "✓ Saved result for KEGG_1177\n", + "✓ Direct JSON parsing successful for KEGG_1178\n", + "✓ Saved result for KEGG_1178\n", + "✓ Direct JSON parsing successful for KEGG_1179\n", + "✓ Saved result for KEGG_1179\n", + "✓ Direct JSON parsing successful for KEGG_1180\n", + "✓ Saved result for KEGG_1180\n", + "Waiting 5 seconds before next batch...\n", + 
"Processing batch 2 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01WyxbNt22ncwGbQ1TQe2N62\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1181\n", + "✓ Saved result for KEGG_1181\n", + "✓ Direct JSON parsing successful for KEGG_1182\n", + "✓ Saved result for 
KEGG_1182\n", + "✓ Direct JSON parsing successful for KEGG_1183\n", + "✓ Saved result for KEGG_1183\n", + "✓ Direct JSON parsing successful for KEGG_1184\n", + "✓ Saved result for KEGG_1184\n", + "✓ Direct JSON parsing successful for KEGG_1185\n", + "✓ Saved result for KEGG_1185\n", + "✓ Direct JSON parsing successful for KEGG_1186\n", + "✓ Saved result for KEGG_1186\n", + "✓ Direct JSON parsing successful for KEGG_1187\n", + "✓ Saved result for KEGG_1187\n", + "✓ Direct JSON parsing successful for KEGG_1188\n", + "✓ Saved result for KEGG_1188\n", + "✓ Direct JSON parsing successful for KEGG_1189\n", + "✓ Saved result for KEGG_1189\n", + "✓ Direct JSON parsing successful for KEGG_1190\n", + "✓ Saved result for KEGG_1190\n", + "✓ Direct JSON parsing successful for KEGG_1191\n", + "✓ Saved result for KEGG_1191\n", + "✓ Direct JSON parsing successful for KEGG_1192\n", + "✓ Saved result for KEGG_1192\n", + "✓ Direct JSON parsing successful for KEGG_1193\n", + "✓ Saved result for KEGG_1193\n", + "✓ Direct JSON parsing successful for KEGG_1194\n", + "✓ Saved result for KEGG_1194\n", + "✓ Direct JSON parsing successful for KEGG_1195\n", + "✓ Saved result for KEGG_1195\n", + "✓ Direct JSON parsing successful for KEGG_1196\n", + "✓ Saved result for KEGG_1196\n", + "✓ Direct JSON parsing successful for KEGG_1197\n", + "✓ Saved result for KEGG_1197\n", + "✓ Direct JSON parsing successful for KEGG_1198\n", + "✓ Saved result for KEGG_1198\n", + "✓ Direct JSON parsing successful for KEGG_1199\n", + "✓ Saved result for KEGG_1199\n", + "✓ Direct JSON parsing successful for KEGG_1200\n", + "✓ Saved result for KEGG_1200\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 3 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01KuXnQxQkDhRrFdr2GoXJyN\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: 
in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1201\n", + "✓ Saved result for KEGG_1201\n", + "✓ Direct JSON parsing successful for KEGG_1202\n", + "✓ Saved result for KEGG_1202\n", + "✓ Direct JSON parsing successful for KEGG_1203\n", + "✓ Saved result for KEGG_1203\n", + "✓ Direct JSON parsing successful for KEGG_1204\n", + "✓ Saved result for KEGG_1204\n", + "✓ Direct JSON parsing successful for KEGG_1205\n", + "✓ Saved result for KEGG_1205\n", + "✓ Direct JSON parsing successful for KEGG_1206\n", + "✓ Saved result for KEGG_1206\n", + "✓ Direct JSON parsing successful for KEGG_1207\n", + "✓ Saved result for KEGG_1207\n", + "✓ Direct JSON parsing successful for KEGG_1208\n", + "✓ Saved result for KEGG_1208\n", + "✓ Direct JSON parsing successful for KEGG_1209\n", + "✓ Saved result for KEGG_1209\n", + "✓ Direct JSON parsing successful for KEGG_1210\n", + "✓ Saved result for KEGG_1210\n", + "✓ Direct JSON parsing successful for KEGG_1211\n", + "✓ Saved result for KEGG_1211\n", + "✓ Direct JSON parsing successful for KEGG_1212\n", + "✓ Saved result for KEGG_1212\n", + "✓ Direct JSON parsing successful for KEGG_1213\n", + 
"✓ Saved result for KEGG_1213\n", + "✓ Direct JSON parsing successful for KEGG_1214\n", + "✓ Saved result for KEGG_1214\n", + "✓ Direct JSON parsing successful for KEGG_1215\n", + "✓ Saved result for KEGG_1215\n", + "✓ Direct JSON parsing successful for KEGG_1216\n", + "✓ Saved result for KEGG_1216\n", + "✓ Direct JSON parsing successful for KEGG_1217\n", + "✓ Saved result for KEGG_1217\n", + "✓ Direct JSON parsing successful for KEGG_1218\n", + "✓ Saved result for KEGG_1218\n", + "✓ Direct JSON parsing successful for KEGG_1219\n", + "✓ Saved result for KEGG_1219\n", + "✓ Direct JSON parsing successful for KEGG_1220\n", + "✓ Saved result for KEGG_1220\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 4 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01AvcVJWBaxzqKsHJQaQ3RVT\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + 
"Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1221\n", + "✓ Saved result for KEGG_1221\n", + "✓ Direct JSON parsing successful for KEGG_1222\n", + "✓ Saved result for KEGG_1222\n", + "✓ Direct JSON parsing successful for KEGG_1223\n", + "✓ Saved result for KEGG_1223\n", + "✓ Direct JSON parsing successful for KEGG_1224\n", + "✓ Saved result for KEGG_1224\n", + "✓ Direct JSON parsing successful for KEGG_1225\n", + "✓ Saved result for KEGG_1225\n", + "✓ Direct JSON parsing successful for KEGG_1226\n", + "✓ Saved result for KEGG_1226\n", + "✓ Direct JSON parsing successful for KEGG_1227\n", + "✓ Saved result for KEGG_1227\n", + "✓ Direct JSON parsing successful for KEGG_1228\n", + "✓ Saved result for KEGG_1228\n", + "✓ Direct JSON parsing successful for KEGG_1229\n", + "✓ Saved result for KEGG_1229\n", + "✓ Direct JSON parsing successful for KEGG_1230\n", + "✓ Saved result for KEGG_1230\n", + "✓ Direct JSON parsing successful for KEGG_1231\n", + "✓ Saved result for KEGG_1231\n", + "✓ Direct JSON parsing successful for KEGG_1232\n", + "✓ Saved result for KEGG_1232\n", + "✓ Direct JSON parsing successful for KEGG_1233\n", + "✓ Saved result for KEGG_1233\n", + "✓ Direct JSON parsing successful for KEGG_1234\n", + "✓ Saved result for KEGG_1234\n", + "✓ Direct JSON parsing successful for KEGG_1235\n", + "✓ Saved result for KEGG_1235\n", + "✓ Direct JSON parsing successful for KEGG_1236\n", + "✓ Saved result for KEGG_1236\n", + "✓ Direct JSON parsing successful for KEGG_1237\n", + "✓ Saved result for KEGG_1237\n", + "✓ Direct JSON parsing successful for KEGG_1238\n", + "✓ Saved result for KEGG_1238\n", + "✓ Direct JSON parsing successful for 
KEGG_1239\n", + "✓ Saved result for KEGG_1239\n", + "✓ Direct JSON parsing successful for KEGG_1240\n", + "✓ Saved result for KEGG_1240\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 5 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01AYb2QqhD3rVnbgphZXpR74\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1241\n", + "✓ Saved result for KEGG_1241\n", + "✓ Direct JSON parsing successful for KEGG_1242\n", + "✓ Saved result for KEGG_1242\n", + "✓ Direct JSON parsing successful for KEGG_1243\n", + "✓ Saved result for KEGG_1243\n", + "✓ Direct JSON parsing successful 
for KEGG_1244\n", + "✓ Saved result for KEGG_1244\n", + "✓ Direct JSON parsing successful for KEGG_1245\n", + "✓ Saved result for KEGG_1245\n", + "✓ Direct JSON parsing successful for KEGG_1246\n", + "✓ Saved result for KEGG_1246\n", + "✓ Direct JSON parsing successful for KEGG_1247\n", + "✓ Saved result for KEGG_1247\n", + "✓ Direct JSON parsing successful for KEGG_1248\n", + "✓ Saved result for KEGG_1248\n", + "✓ Direct JSON parsing successful for KEGG_1249\n", + "✓ Saved result for KEGG_1249\n", + "✓ Direct JSON parsing successful for KEGG_1250\n", + "✓ Saved result for KEGG_1250\n", + "✓ Direct JSON parsing successful for KEGG_1251\n", + "✓ Saved result for KEGG_1251\n", + "✓ Direct JSON parsing successful for KEGG_1252\n", + "✓ Saved result for KEGG_1252\n", + "✓ Direct JSON parsing successful for KEGG_1253\n", + "✓ Saved result for KEGG_1253\n", + "✓ Direct JSON parsing successful for KEGG_1254\n", + "✓ Saved result for KEGG_1254\n", + "✓ Direct JSON parsing successful for KEGG_1255\n", + "✓ Saved result for KEGG_1255\n", + "✓ Direct JSON parsing successful for KEGG_1256\n", + "✓ Saved result for KEGG_1256\n", + "✓ Direct JSON parsing successful for KEGG_1257\n", + "✓ Saved result for KEGG_1257\n", + "✓ Direct JSON parsing successful for KEGG_1258\n", + "✓ Saved result for KEGG_1258\n", + "✓ Direct JSON parsing successful for KEGG_1259\n", + "✓ Saved result for KEGG_1259\n", + "✓ Direct JSON parsing successful for KEGG_1260\n", + "✓ Saved result for KEGG_1260\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 6 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_015ZnmQjCJc4DAtWMRLmrWow\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, 
Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1261\n", + "✓ Saved result for KEGG_1261\n", + "✓ Direct JSON parsing successful for KEGG_1262\n", + "✓ Saved result for KEGG_1262\n", + "✓ Direct JSON parsing successful for KEGG_1263\n", + "✓ Saved result for KEGG_1263\n", + "✓ Direct JSON parsing successful for KEGG_1264\n", + "✓ Saved result for KEGG_1264\n", + "✓ Direct JSON parsing successful for KEGG_1265\n", + "✓ Saved result for KEGG_1265\n", + "✓ Direct JSON parsing successful for KEGG_1266\n", + "✓ Saved result for KEGG_1266\n", + "✓ Direct JSON parsing successful for KEGG_1267\n", + "✓ Saved result for KEGG_1267\n", + "✓ Direct JSON parsing successful for KEGG_1268\n", + "✓ Saved result for KEGG_1268\n", + "✓ Direct JSON parsing successful for KEGG_1269\n", + "✓ Saved result for KEGG_1269\n", + "✓ Direct JSON parsing 
successful for KEGG_1270\n", + "✓ Saved result for KEGG_1270\n", + "✓ Direct JSON parsing successful for KEGG_1271\n", + "✓ Saved result for KEGG_1271\n", + "✓ Direct JSON parsing successful for KEGG_1272\n", + "✓ Saved result for KEGG_1272\n", + "✓ Direct JSON parsing successful for KEGG_1273\n", + "✓ Saved result for KEGG_1273\n", + "✓ Direct JSON parsing successful for KEGG_1274\n", + "✓ Saved result for KEGG_1274\n", + "✓ Direct JSON parsing successful for KEGG_1275\n", + "✓ Saved result for KEGG_1275\n", + "✓ Direct JSON parsing successful for KEGG_1276\n", + "✓ Saved result for KEGG_1276\n", + "✓ Direct JSON parsing successful for KEGG_1277\n", + "✓ Saved result for KEGG_1277\n", + "✓ Direct JSON parsing successful for KEGG_1278\n", + "✓ Saved result for KEGG_1278\n", + "✓ Direct JSON parsing successful for KEGG_1279\n", + "✓ Saved result for KEGG_1279\n", + "✓ Direct JSON parsing successful for KEGG_1280\n", + "✓ Saved result for KEGG_1280\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 7 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_013t9ZmJtiVrtsZ3jjZkw72i\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", 
+ "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1281\n", + "✓ Saved result for KEGG_1281\n", + "✓ Direct JSON parsing successful for KEGG_1282\n", + "✓ Saved result for KEGG_1282\n", + "✓ Direct JSON parsing successful for KEGG_1283\n", + "✓ Saved result for KEGG_1283\n", + "✓ Direct JSON parsing successful for KEGG_1284\n", + "✓ Saved result for KEGG_1284\n", + "✓ Direct JSON parsing successful for KEGG_1285\n", + "✓ Saved result for KEGG_1285\n", + "✓ Direct JSON parsing successful for KEGG_1286\n", + "✓ Saved result for KEGG_1286\n", + "✓ Direct JSON parsing successful for KEGG_1287\n", + "✓ Saved result for KEGG_1287\n", + "✓ Direct JSON parsing successful for KEGG_1288\n", + "✓ Saved result for KEGG_1288\n", + "✓ Direct JSON parsing successful for KEGG_1289\n", + "✓ Saved result for KEGG_1289\n", + "✓ Direct JSON parsing successful for KEGG_1290\n", + "✓ Saved result for KEGG_1290\n", + "✓ Direct JSON parsing successful for KEGG_1291\n", + "✓ Saved result for KEGG_1291\n", + "✗ All JSON parsing methods failed for KEGG_1292: Extra data: line 1 column 5072 (char 5071)\n", + "✗ JSON parsing error for KEGG_1292, saved full raw response\n", + "✓ Direct JSON parsing successful for KEGG_1293\n", + "✓ Saved result for KEGG_1293\n", + "✓ Direct JSON parsing successful for KEGG_1294\n", + "✓ Saved result for KEGG_1294\n", + "✓ Direct JSON parsing successful for KEGG_1295\n", + "✓ Saved result for KEGG_1295\n", + "✓ Direct JSON parsing successful for KEGG_1296\n", + "✓ Saved result for KEGG_1296\n", + "✓ Direct JSON parsing successful for KEGG_1297\n", + "✓ Saved result for 
KEGG_1297\n", + "✓ Direct JSON parsing successful for KEGG_1298\n", + "✓ Saved result for KEGG_1298\n", + "✓ Direct JSON parsing successful for KEGG_1299\n", + "✓ Saved result for KEGG_1299\n", + "✓ Direct JSON parsing successful for KEGG_1300\n", + "✓ Saved result for KEGG_1300\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 8 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01MydwyHdbpKxdsYiHBAQWKy\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, 
Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1301\n", + "✓ Saved result for KEGG_1301\n", + "✓ Direct JSON parsing successful for KEGG_1302\n", + "✓ Saved result for KEGG_1302\n", + "✓ Direct JSON parsing successful for KEGG_1303\n", + "✓ Saved result for KEGG_1303\n", + "✓ Direct JSON parsing successful for KEGG_1304\n", + "✓ Saved result for KEGG_1304\n", + "✓ Direct JSON parsing successful for KEGG_1305\n", + "✓ Saved result for KEGG_1305\n", + "✓ Direct JSON parsing successful for KEGG_1306\n", + "✓ Saved result for KEGG_1306\n", + "✓ Direct JSON parsing successful for KEGG_1307\n", + "✓ Saved result for KEGG_1307\n", + "✓ Direct JSON parsing successful for KEGG_1308\n", + "✓ Saved result for KEGG_1308\n", + "✓ Direct JSON parsing successful for KEGG_1309\n", + "✓ Saved result for KEGG_1309\n", + "✓ Direct JSON parsing successful for KEGG_1310\n", + "✓ Saved result for KEGG_1310\n", + "✓ Direct JSON parsing successful for KEGG_1311\n", + "✓ Saved result for KEGG_1311\n", + "✓ Direct JSON parsing successful for KEGG_1312\n", + "✓ Saved result for KEGG_1312\n", + "✓ Direct JSON parsing successful for KEGG_1313\n", + "✓ Saved result for KEGG_1313\n", + "✓ Direct JSON parsing successful for KEGG_1314\n", + "✓ Saved result for KEGG_1314\n", + "✓ Direct JSON parsing successful for KEGG_1315\n", + "✓ Saved result for KEGG_1315\n", + "✓ Direct JSON parsing successful for 
KEGG_1316\n", + "✓ Saved result for KEGG_1316\n", + "✓ Direct JSON parsing successful for KEGG_1317\n", + "✓ Saved result for KEGG_1317\n", + "✓ Direct JSON parsing successful for KEGG_1318\n", + "✓ Saved result for KEGG_1318\n", + "✓ Direct JSON parsing successful for KEGG_1319\n", + "✓ Saved result for KEGG_1319\n", + "✓ Direct JSON parsing successful for KEGG_1320\n", + "✓ Saved result for KEGG_1320\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 9 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01DmhWnyT88xsC5qMMZpNRWX\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + 
"Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1321\n", + "✓ Saved result for KEGG_1321\n", + "✓ Direct JSON parsing successful for KEGG_1322\n", + "✓ Saved result for KEGG_1322\n", + "✓ Direct JSON parsing successful for KEGG_1323\n", + "✓ Saved result for KEGG_1323\n", + "✗ All JSON parsing methods failed for KEGG_1324: Extra data: line 1 column 3176 (char 3175)\n", + "✗ JSON parsing error for KEGG_1324, saved full raw response\n", + "✓ Direct JSON parsing successful for KEGG_1325\n", + "✓ Saved result for KEGG_1325\n", + "✓ Direct JSON parsing successful for KEGG_1326\n", + "✓ Saved result for KEGG_1326\n", + "✓ Direct JSON parsing successful for KEGG_1327\n", + "✓ Saved result for KEGG_1327\n", + "✓ Direct JSON parsing successful for KEGG_1328\n", + "✓ Saved result for KEGG_1328\n", + "✓ Direct JSON parsing successful for KEGG_1329\n", + "✓ Saved result for KEGG_1329\n", + "✓ Direct JSON parsing successful for KEGG_1330\n", + "✓ Saved result for KEGG_1330\n", + "✓ Direct JSON parsing successful for KEGG_1331\n", + "✓ Saved result for KEGG_1331\n", + "✓ Direct JSON parsing successful for KEGG_1332\n", + "✓ Saved result for KEGG_1332\n", + "✓ Direct JSON parsing successful for KEGG_1333\n", + "✓ Saved result for KEGG_1333\n", + "✓ Direct JSON parsing successful for KEGG_1334\n", + "✓ Saved result for KEGG_1334\n", + "✓ Direct JSON parsing successful for KEGG_1335\n", + "✓ Saved result for KEGG_1335\n", + "✓ Direct JSON parsing successful for KEGG_1336\n", + "✓ Saved result for KEGG_1336\n", + "✓ Direct JSON parsing successful for KEGG_1337\n", + "✓ Saved result for KEGG_1337\n", + "✓ Direct JSON parsing successful for KEGG_1338\n", + "✓ Saved result for KEGG_1338\n", + "✓ Direct JSON parsing successful for KEGG_1339\n", + "✓ Saved result for KEGG_1339\n", + "✓ Direct JSON parsing successful for KEGG_1340\n", + "✓ Saved result for KEGG_1340\n", + "Waiting 5 seconds before next 
batch...\n", + "Processing batch 10 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01WXTtNdeVvu6hAachYJLekw\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1341\n", + "✓ Saved result for KEGG_1341\n", + "✓ Direct JSON parsing successful for KEGG_1342\n", + "✓ Saved result for KEGG_1342\n", + "✓ Direct JSON parsing successful for KEGG_1343\n", + "✓ Saved result for KEGG_1343\n", + "✓ Direct JSON parsing successful for KEGG_1344\n", + "✓ Saved result for KEGG_1344\n", + "✓ Direct JSON parsing successful for KEGG_1345\n", + "✓ Saved result for KEGG_1345\n", + "✓ Direct JSON parsing 
successful for KEGG_1346\n", + "✓ Saved result for KEGG_1346\n", + "✓ Direct JSON parsing successful for KEGG_1347\n", + "✓ Saved result for KEGG_1347\n", + "✓ Direct JSON parsing successful for KEGG_1348\n", + "✓ Saved result for KEGG_1348\n", + "✓ Direct JSON parsing successful for KEGG_1349\n", + "✓ Saved result for KEGG_1349\n", + "✓ Direct JSON parsing successful for KEGG_1350\n", + "✓ Saved result for KEGG_1350\n", + "✓ Direct JSON parsing successful for KEGG_1351\n", + "✓ Saved result for KEGG_1351\n", + "✓ Direct JSON parsing successful for KEGG_1352\n", + "✓ Saved result for KEGG_1352\n", + "✓ Direct JSON parsing successful for KEGG_1353\n", + "✓ Saved result for KEGG_1353\n", + "✓ Direct JSON parsing successful for KEGG_1354\n", + "✓ Saved result for KEGG_1354\n", + "✓ Direct JSON parsing successful for KEGG_1355\n", + "✓ Saved result for KEGG_1355\n", + "✓ Direct JSON parsing successful for KEGG_1356\n", + "✓ Saved result for KEGG_1356\n", + "✓ Direct JSON parsing successful for KEGG_1357\n", + "✓ Saved result for KEGG_1357\n", + "✓ Direct JSON parsing successful for KEGG_1358\n", + "✓ Saved result for KEGG_1358\n", + "✓ Direct JSON parsing successful for KEGG_1359\n", + "✓ Saved result for KEGG_1359\n", + "✓ Direct JSON parsing successful for KEGG_1360\n", + "✓ Saved result for KEGG_1360\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 11 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01EozBcgr17wzXEjkZYfi3rX\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + 
"Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1361\n", + "✓ Saved result for KEGG_1361\n", + "✓ Direct JSON parsing successful for KEGG_1362\n", + "✓ Saved result for KEGG_1362\n", + "✓ Direct JSON parsing successful for KEGG_1363\n", + "✓ Saved result for KEGG_1363\n", + "✓ Direct JSON parsing successful for KEGG_1364\n", + "✓ Saved result for KEGG_1364\n", + "✓ Direct JSON parsing successful for KEGG_1365\n", + "✓ Saved result for KEGG_1365\n", + "✓ Direct JSON parsing successful for KEGG_1366\n", + "✓ Saved result for KEGG_1366\n", + "✓ Direct JSON parsing successful for KEGG_1367\n", + "✓ Saved result for KEGG_1367\n", + "✓ Direct JSON parsing successful for KEGG_1368\n", + "✓ Saved result for KEGG_1368\n", + "✓ Direct JSON parsing successful for KEGG_1369\n", + "✓ Saved result for KEGG_1369\n", + "✓ Direct JSON parsing successful for KEGG_1370\n", + "✓ Saved result for KEGG_1370\n", + 
"✓ Direct JSON parsing successful for KEGG_1371\n", + "✓ Saved result for KEGG_1371\n", + "✓ Direct JSON parsing successful for KEGG_1372\n", + "✓ Saved result for KEGG_1372\n", + "✓ Direct JSON parsing successful for KEGG_1373\n", + "✓ Saved result for KEGG_1373\n", + "✓ Direct JSON parsing successful for KEGG_1374\n", + "✓ Saved result for KEGG_1374\n", + "✓ Direct JSON parsing successful for KEGG_1375\n", + "✓ Saved result for KEGG_1375\n", + "✓ Direct JSON parsing successful for KEGG_1376\n", + "✓ Saved result for KEGG_1376\n", + "✓ Direct JSON parsing successful for KEGG_1377\n", + "✓ Saved result for KEGG_1377\n", + "✓ Direct JSON parsing successful for KEGG_1378\n", + "✓ Saved result for KEGG_1378\n", + "✓ Direct JSON parsing successful for KEGG_1379\n", + "✓ Saved result for KEGG_1379\n", + "✓ Direct JSON parsing successful for KEGG_1380\n", + "✓ Saved result for KEGG_1380\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 12 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01VpCVdDVpCo6KxrZoKifjy8\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 
seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1381\n", + "✓ Saved result for KEGG_1381\n", + "✓ Direct JSON parsing successful for KEGG_1382\n", + "✓ Saved result for KEGG_1382\n", + "✓ Direct JSON parsing successful for KEGG_1383\n", + "✓ Saved result for KEGG_1383\n", + "✓ Direct JSON parsing successful for KEGG_1384\n", + "✓ Saved result for KEGG_1384\n", + "✓ Direct JSON parsing successful for KEGG_1385\n", + "✓ Saved result for KEGG_1385\n", + "✓ Direct JSON parsing successful for KEGG_1386\n", + "✓ Saved result for KEGG_1386\n", + "✓ Direct JSON parsing successful for KEGG_1387\n", + "✓ Saved result for KEGG_1387\n", + "✓ Direct JSON parsing successful for KEGG_1388\n", + "✓ Saved result for KEGG_1388\n", + "✓ Direct JSON parsing successful for KEGG_1389\n", + "✓ Saved result for KEGG_1389\n", + "✓ Direct JSON parsing successful for KEGG_1390\n", + "✓ Saved result for KEGG_1390\n", + "✓ Direct JSON parsing successful for KEGG_1391\n", + "✓ Saved result for KEGG_1391\n", + "✓ Direct JSON parsing successful for KEGG_1392\n", + "✓ Saved result for KEGG_1392\n", + "✓ Direct JSON parsing successful for KEGG_1393\n", + "✓ Saved result for KEGG_1393\n", + "✓ Direct JSON parsing successful for KEGG_1394\n", + "✓ Saved result for KEGG_1394\n", + "✓ Direct JSON parsing successful for KEGG_1395\n", + "✓ Saved result for KEGG_1395\n", + "✓ Direct JSON parsing successful for KEGG_1396\n", + "✓ Saved result for KEGG_1396\n", + "✓ Direct JSON parsing successful for KEGG_1397\n", + "✓ Saved result for KEGG_1397\n", + "✓ Direct JSON parsing successful for KEGG_1398\n", + "✓ Saved result for KEGG_1398\n", + "✓ Direct JSON parsing 
successful for KEGG_1399\n", + "✓ Saved result for KEGG_1399\n", + "✓ Direct JSON parsing successful for KEGG_1400\n", + "✓ Saved result for KEGG_1400\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 13 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01Aq6eQygzrFD1uckWDMF6uE\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1401\n", + "✓ Saved result 
for KEGG_1401\n", + "✓ Direct JSON parsing successful for KEGG_1402\n", + "✓ Saved result for KEGG_1402\n", + "✓ Direct JSON parsing successful for KEGG_1403\n", + "✓ Saved result for KEGG_1403\n", + "✓ Direct JSON parsing successful for KEGG_1404\n", + "✓ Saved result for KEGG_1404\n", + "✓ Direct JSON parsing successful for KEGG_1405\n", + "✓ Saved result for KEGG_1405\n", + "✓ Direct JSON parsing successful for KEGG_1406\n", + "✓ Saved result for KEGG_1406\n", + "✓ Direct JSON parsing successful for KEGG_1407\n", + "✓ Saved result for KEGG_1407\n", + "✓ Direct JSON parsing successful for KEGG_1408\n", + "✓ Saved result for KEGG_1408\n", + "✓ Direct JSON parsing successful for KEGG_1409\n", + "✓ Saved result for KEGG_1409\n", + "✓ Direct JSON parsing successful for KEGG_1410\n", + "✓ Saved result for KEGG_1410\n", + "✓ Direct JSON parsing successful for KEGG_1411\n", + "✓ Saved result for KEGG_1411\n", + "✓ Direct JSON parsing successful for KEGG_1412\n", + "✓ Saved result for KEGG_1412\n", + "✓ Direct JSON parsing successful for KEGG_1413\n", + "✓ Saved result for KEGG_1413\n", + "✓ Direct JSON parsing successful for KEGG_1414\n", + "✓ Saved result for KEGG_1414\n", + "✓ Direct JSON parsing successful for KEGG_1415\n", + "✓ Saved result for KEGG_1415\n", + "✓ Direct JSON parsing successful for KEGG_1416\n", + "✓ Saved result for KEGG_1416\n", + "✓ Direct JSON parsing successful for KEGG_1417\n", + "✓ Saved result for KEGG_1417\n", + "✓ Direct JSON parsing successful for KEGG_1418\n", + "✓ Saved result for KEGG_1418\n", + "✓ Direct JSON parsing successful for KEGG_1419\n", + "✓ Saved result for KEGG_1419\n", + "✓ Direct JSON parsing successful for KEGG_1420\n", + "✓ Saved result for KEGG_1420\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 14 with 20 variants\n", + "Submitting batch with 20 requests...\n", + "Batch created with ID: msgbatch_01JtWcxJDaeVL6SquFG9Rriv\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + 
"Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 20, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 20, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1421\n", + "✓ Saved result for KEGG_1421\n", + "✓ Direct JSON parsing successful for KEGG_1422\n", + "✓ Saved result for KEGG_1422\n", + "✓ Direct JSON parsing successful for KEGG_1423\n", + "✓ Saved result for KEGG_1423\n", + "✓ Direct JSON parsing successful for KEGG_1424\n", + "✓ Saved result for KEGG_1424\n", + "✗ All JSON parsing methods failed for KEGG_1425: Extra data: line 1 column 4488 (char 4487)\n", + "✗ JSON parsing error for KEGG_1425, saved full raw response\n", + "✗ All JSON parsing methods failed for KEGG_1426: Extra data: line 1 column 4285 (char 4284)\n", + "✗ JSON parsing error for KEGG_1426, saved full raw response\n", + "✓ Direct JSON parsing successful for KEGG_1427\n", + "✓ Saved result for KEGG_1427\n", + "✓ Direct JSON parsing successful for 
KEGG_1428\n", + "✓ Saved result for KEGG_1428\n", + "✓ Direct JSON parsing successful for KEGG_1429\n", + "✓ Saved result for KEGG_1429\n", + "✓ Direct JSON parsing successful for KEGG_1430\n", + "✓ Saved result for KEGG_1430\n", + "✓ Direct JSON parsing successful for KEGG_1431\n", + "✓ Saved result for KEGG_1431\n", + "✓ Direct JSON parsing successful for KEGG_1432\n", + "✓ Saved result for KEGG_1432\n", + "✓ Direct JSON parsing successful for KEGG_1433\n", + "✓ Saved result for KEGG_1433\n", + "✓ Direct JSON parsing successful for KEGG_1434\n", + "✓ Saved result for KEGG_1434\n", + "✓ Direct JSON parsing successful for KEGG_1435\n", + "✓ Saved result for KEGG_1435\n", + "✓ Direct JSON parsing successful for KEGG_1436\n", + "✓ Saved result for KEGG_1436\n", + "✓ Direct JSON parsing successful for KEGG_1437\n", + "✓ Saved result for KEGG_1437\n", + "✓ Direct JSON parsing successful for KEGG_1438\n", + "✓ Saved result for KEGG_1438\n", + "✓ Direct JSON parsing successful for KEGG_1439\n", + "✓ Saved result for KEGG_1439\n", + "✓ Direct JSON parsing successful for KEGG_1440\n", + "✓ Saved result for KEGG_1440\n", + "Waiting 5 seconds before next batch...\n", + "Processing batch 15 with 9 variants\n", + "Submitting batch with 9 requests...\n", + "Batch created with ID: msgbatch_01JpVNThUhArnk2wVpKbcxko\n", + "Initial status: in_progress\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 
seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: in_progress\n", + "Processing: 9, Succeeded: 0, Errored: 0\n", + "Waiting 10 seconds...\n", + "Batch status: ended\n", + "Processing: 0, Succeeded: 9, Errored: 0\n", + "Processing batch results...\n", + "✓ Direct JSON parsing successful for KEGG_1441\n", + "✓ Saved result for KEGG_1441\n", + "✓ Direct JSON parsing successful for KEGG_1442\n", + "✓ Saved result for KEGG_1442\n", + "✓ Direct JSON parsing successful for KEGG_1443\n", + "✓ Saved result for KEGG_1443\n", + "✓ Direct JSON parsing successful for KEGG_1444\n", + "✓ Saved result for KEGG_1444\n", + "✓ Direct JSON parsing successful for KEGG_1445\n", + "✓ Saved result for KEGG_1445\n", + "✓ Direct JSON parsing successful for KEGG_1446\n", + "✓ Saved result for KEGG_1446\n", + "✓ Direct JSON parsing successful for KEGG_1447\n", + "✓ Saved result for KEGG_1447\n", + "✓ Direct JSON parsing successful for KEGG_1448\n", + "✓ Saved result for KEGG_1448\n", + "✓ Direct JSON parsing successful for KEGG_1449\n", + "✓ Saved result for KEGG_1449\n", + "All batches processed!\n", + "Combining results...\n", + "Found 289 JSON files to combine\n", + "Added KEGG_1161_processed.json to combined results\n", + "Added KEGG_1162_processed.json to combined results\n", + "Added KEGG_1163_processed.json to combined results\n", + "Added KEGG_1164_processed.json to combined results\n", + "Added KEGG_1165_processed.json to combined results\n", + "Added 
KEGG_1166_processed.json to combined results\n", + "Added KEGG_1167_processed.json to combined results\n", + "Added KEGG_1168_processed.json to combined results\n", + "Added KEGG_1169_processed.json to combined results\n", + "Added KEGG_1170_processed.json to combined results\n", + "Added KEGG_1171_processed.json to combined results\n", + "Added KEGG_1172_processed.json to combined results\n", + "Added KEGG_1173_processed.json to combined results\n", + "Added KEGG_1174_processed.json to combined results\n", + "Added KEGG_1175_processed.json to combined results\n", + "Skipping file with error: KEGG_1176_processed.json\n", + "Added KEGG_1177_processed.json to combined results\n", + "Added KEGG_1178_processed.json to combined results\n", + "Added KEGG_1179_processed.json to combined results\n", + "Added KEGG_1180_processed.json to combined results\n", + "Added KEGG_1181_processed.json to combined results\n", + "Added KEGG_1182_processed.json to combined results\n", + "Added KEGG_1183_processed.json to combined results\n", + "Added KEGG_1184_processed.json to combined results\n", + "Added KEGG_1185_processed.json to combined results\n", + "Added KEGG_1186_processed.json to combined results\n", + "Added KEGG_1187_processed.json to combined results\n", + "Added KEGG_1188_processed.json to combined results\n", + "Added KEGG_1189_processed.json to combined results\n", + "Added KEGG_1190_processed.json to combined results\n", + "Added KEGG_1191_processed.json to combined results\n", + "Added KEGG_1192_processed.json to combined results\n", + "Added KEGG_1193_processed.json to combined results\n", + "Added KEGG_1194_processed.json to combined results\n", + "Added KEGG_1195_processed.json to combined results\n", + "Added KEGG_1196_processed.json to combined results\n", + "Added KEGG_1197_processed.json to combined results\n", + "Added KEGG_1198_processed.json to combined results\n", + "Added KEGG_1199_processed.json to combined results\n", + "Added KEGG_1200_processed.json to 
combined results\n", + "Added KEGG_1201_processed.json to combined results\n", + "Added KEGG_1202_processed.json to combined results\n", + "Added KEGG_1203_processed.json to combined results\n", + "Added KEGG_1204_processed.json to combined results\n", + "Added KEGG_1205_processed.json to combined results\n", + "Added KEGG_1206_processed.json to combined results\n", + "Added KEGG_1207_processed.json to combined results\n", + "Added KEGG_1208_processed.json to combined results\n", + "Added KEGG_1209_processed.json to combined results\n", + "Added KEGG_1210_processed.json to combined results\n", + "Added KEGG_1211_processed.json to combined results\n", + "Added KEGG_1212_processed.json to combined results\n", + "Added KEGG_1213_processed.json to combined results\n", + "Added KEGG_1214_processed.json to combined results\n", + "Added KEGG_1215_processed.json to combined results\n", + "Added KEGG_1216_processed.json to combined results\n", + "Added KEGG_1217_processed.json to combined results\n", + "Added KEGG_1218_processed.json to combined results\n", + "Added KEGG_1219_processed.json to combined results\n", + "Added KEGG_1220_processed.json to combined results\n", + "Added KEGG_1221_processed.json to combined results\n", + "Added KEGG_1222_processed.json to combined results\n", + "Added KEGG_1223_processed.json to combined results\n", + "Added KEGG_1224_processed.json to combined results\n", + "Added KEGG_1225_processed.json to combined results\n", + "Added KEGG_1226_processed.json to combined results\n", + "Added KEGG_1227_processed.json to combined results\n", + "Added KEGG_1228_processed.json to combined results\n", + "Added KEGG_1229_processed.json to combined results\n", + "Added KEGG_1230_processed.json to combined results\n", + "Added KEGG_1231_processed.json to combined results\n", + "Added KEGG_1232_processed.json to combined results\n", + "Added KEGG_1233_processed.json to combined results\n", + "Added KEGG_1234_processed.json to combined results\n", + 
"Added KEGG_1235_processed.json to combined results\n", + "Added KEGG_1236_processed.json to combined results\n", + "Added KEGG_1237_processed.json to combined results\n", + "Added KEGG_1238_processed.json to combined results\n", + "Added KEGG_1239_processed.json to combined results\n", + "Added KEGG_1240_processed.json to combined results\n", + "Added KEGG_1241_processed.json to combined results\n", + "Added KEGG_1242_processed.json to combined results\n", + "Added KEGG_1243_processed.json to combined results\n", + "Added KEGG_1244_processed.json to combined results\n", + "Added KEGG_1245_processed.json to combined results\n", + "Added KEGG_1246_processed.json to combined results\n", + "Added KEGG_1247_processed.json to combined results\n", + "Added KEGG_1248_processed.json to combined results\n", + "Added KEGG_1249_processed.json to combined results\n", + "Added KEGG_1250_processed.json to combined results\n", + "Added KEGG_1251_processed.json to combined results\n", + "Added KEGG_1252_processed.json to combined results\n", + "Added KEGG_1253_processed.json to combined results\n", + "Added KEGG_1254_processed.json to combined results\n", + "Added KEGG_1255_processed.json to combined results\n", + "Added KEGG_1256_processed.json to combined results\n", + "Added KEGG_1257_processed.json to combined results\n", + "Added KEGG_1258_processed.json to combined results\n", + "Added KEGG_1259_processed.json to combined results\n", + "Added KEGG_1260_processed.json to combined results\n", + "Added KEGG_1261_processed.json to combined results\n", + "Added KEGG_1262_processed.json to combined results\n", + "Added KEGG_1263_processed.json to combined results\n", + "Added KEGG_1264_processed.json to combined results\n", + "Added KEGG_1265_processed.json to combined results\n", + "Added KEGG_1266_processed.json to combined results\n", + "Added KEGG_1267_processed.json to combined results\n", + "Added KEGG_1268_processed.json to combined results\n", + "Added 
KEGG_1269_processed.json to combined results\n", + "Added KEGG_1270_processed.json to combined results\n", + "Added KEGG_1271_processed.json to combined results\n", + "Added KEGG_1272_processed.json to combined results\n", + "Added KEGG_1273_processed.json to combined results\n", + "Added KEGG_1274_processed.json to combined results\n", + "Added KEGG_1275_processed.json to combined results\n", + "Added KEGG_1276_processed.json to combined results\n", + "Added KEGG_1277_processed.json to combined results\n", + "Added KEGG_1278_processed.json to combined results\n", + "Added KEGG_1279_processed.json to combined results\n", + "Added KEGG_1280_processed.json to combined results\n", + "Added KEGG_1281_processed.json to combined results\n", + "Added KEGG_1282_processed.json to combined results\n", + "Added KEGG_1283_processed.json to combined results\n", + "Added KEGG_1284_processed.json to combined results\n", + "Added KEGG_1285_processed.json to combined results\n", + "Added KEGG_1286_processed.json to combined results\n", + "Added KEGG_1287_processed.json to combined results\n", + "Added KEGG_1288_processed.json to combined results\n", + "Added KEGG_1289_processed.json to combined results\n", + "Added KEGG_1290_processed.json to combined results\n", + "Added KEGG_1291_processed.json to combined results\n", + "Skipping file with error: KEGG_1292_processed.json\n", + "Added KEGG_1293_processed.json to combined results\n", + "Added KEGG_1294_processed.json to combined results\n", + "Added KEGG_1295_processed.json to combined results\n", + "Added KEGG_1296_processed.json to combined results\n", + "Added KEGG_1297_processed.json to combined results\n", + "Added KEGG_1298_processed.json to combined results\n", + "Added KEGG_1299_processed.json to combined results\n", + "Added KEGG_1300_processed.json to combined results\n", + "Added KEGG_1301_processed.json to combined results\n", + "Added KEGG_1302_processed.json to combined results\n", + "Added KEGG_1303_processed.json to 
combined results\n", + "Added KEGG_1304_processed.json to combined results\n", + "Added KEGG_1305_processed.json to combined results\n", + "Added KEGG_1306_processed.json to combined results\n", + "Added KEGG_1307_processed.json to combined results\n", + "Added KEGG_1308_processed.json to combined results\n", + "Added KEGG_1309_processed.json to combined results\n", + "Added KEGG_1310_processed.json to combined results\n", + "Added KEGG_1311_processed.json to combined results\n", + "Added KEGG_1312_processed.json to combined results\n", + "Added KEGG_1313_processed.json to combined results\n", + "Added KEGG_1314_processed.json to combined results\n", + "Added KEGG_1315_processed.json to combined results\n", + "Added KEGG_1316_processed.json to combined results\n", + "Added KEGG_1317_processed.json to combined results\n", + "Added KEGG_1318_processed.json to combined results\n", + "Added KEGG_1319_processed.json to combined results\n", + "Added KEGG_1320_processed.json to combined results\n", + "Added KEGG_1321_processed.json to combined results\n", + "Added KEGG_1322_processed.json to combined results\n", + "Added KEGG_1323_processed.json to combined results\n", + "Skipping file with error: KEGG_1324_processed.json\n", + "Added KEGG_1325_processed.json to combined results\n", + "Added KEGG_1326_processed.json to combined results\n", + "Added KEGG_1327_processed.json to combined results\n", + "Added KEGG_1328_processed.json to combined results\n", + "Added KEGG_1329_processed.json to combined results\n", + "Added KEGG_1330_processed.json to combined results\n", + "Added KEGG_1331_processed.json to combined results\n", + "Added KEGG_1332_processed.json to combined results\n", + "Added KEGG_1333_processed.json to combined results\n", + "Added KEGG_1334_processed.json to combined results\n", + "Added KEGG_1335_processed.json to combined results\n", + "Added KEGG_1336_processed.json to combined results\n", + "Added KEGG_1337_processed.json to combined results\n", + 
"Added KEGG_1338_processed.json to combined results\n", + "Added KEGG_1339_processed.json to combined results\n", + "Added KEGG_1340_processed.json to combined results\n", + "Added KEGG_1341_processed.json to combined results\n", + "Added KEGG_1342_processed.json to combined results\n", + "Added KEGG_1343_processed.json to combined results\n", + "Added KEGG_1344_processed.json to combined results\n", + "Added KEGG_1345_processed.json to combined results\n", + "Added KEGG_1346_processed.json to combined results\n", + "Added KEGG_1347_processed.json to combined results\n", + "Added KEGG_1348_processed.json to combined results\n", + "Added KEGG_1349_processed.json to combined results\n", + "Added KEGG_1350_processed.json to combined results\n", + "Added KEGG_1351_processed.json to combined results\n", + "Added KEGG_1352_processed.json to combined results\n", + "Added KEGG_1353_processed.json to combined results\n", + "Added KEGG_1354_processed.json to combined results\n", + "Added KEGG_1355_processed.json to combined results\n", + "Added KEGG_1356_processed.json to combined results\n", + "Added KEGG_1357_processed.json to combined results\n", + "Added KEGG_1358_processed.json to combined results\n", + "Added KEGG_1359_processed.json to combined results\n", + "Added KEGG_1360_processed.json to combined results\n", + "Added KEGG_1361_processed.json to combined results\n", + "Added KEGG_1362_processed.json to combined results\n", + "Added KEGG_1363_processed.json to combined results\n", + "Added KEGG_1364_processed.json to combined results\n", + "Added KEGG_1365_processed.json to combined results\n", + "Added KEGG_1366_processed.json to combined results\n", + "Added KEGG_1367_processed.json to combined results\n", + "Added KEGG_1368_processed.json to combined results\n", + "Added KEGG_1369_processed.json to combined results\n", + "Added KEGG_1370_processed.json to combined results\n", + "Added KEGG_1371_processed.json to combined results\n", + "Added 
KEGG_1372_processed.json to combined results\n", + "Added KEGG_1373_processed.json to combined results\n", + "Added KEGG_1374_processed.json to combined results\n", + "Added KEGG_1375_processed.json to combined results\n", + "Added KEGG_1376_processed.json to combined results\n", + "Added KEGG_1377_processed.json to combined results\n", + "Added KEGG_1378_processed.json to combined results\n", + "Added KEGG_1379_processed.json to combined results\n", + "Added KEGG_1380_processed.json to combined results\n", + "Added KEGG_1381_processed.json to combined results\n", + "Added KEGG_1382_processed.json to combined results\n", + "Added KEGG_1383_processed.json to combined results\n", + "Added KEGG_1384_processed.json to combined results\n", + "Added KEGG_1385_processed.json to combined results\n", + "Added KEGG_1386_processed.json to combined results\n", + "Added KEGG_1387_processed.json to combined results\n", + "Added KEGG_1388_processed.json to combined results\n", + "Added KEGG_1389_processed.json to combined results\n", + "Added KEGG_1390_processed.json to combined results\n", + "Added KEGG_1391_processed.json to combined results\n", + "Added KEGG_1392_processed.json to combined results\n", + "Added KEGG_1393_processed.json to combined results\n", + "Added KEGG_1394_processed.json to combined results\n", + "Added KEGG_1395_processed.json to combined results\n", + "Added KEGG_1396_processed.json to combined results\n", + "Added KEGG_1397_processed.json to combined results\n", + "Added KEGG_1398_processed.json to combined results\n", + "Added KEGG_1399_processed.json to combined results\n", + "Added KEGG_1400_processed.json to combined results\n", + "Added KEGG_1401_processed.json to combined results\n", + "Added KEGG_1402_processed.json to combined results\n", + "Added KEGG_1403_processed.json to combined results\n", + "Added KEGG_1404_processed.json to combined results\n", + "Added KEGG_1405_processed.json to combined results\n", + "Added KEGG_1406_processed.json to 
combined results\n", + "Added KEGG_1407_processed.json to combined results\n", + "Added KEGG_1408_processed.json to combined results\n", + "Added KEGG_1409_processed.json to combined results\n", + "Added KEGG_1410_processed.json to combined results\n", + "Added KEGG_1411_processed.json to combined results\n", + "Added KEGG_1412_processed.json to combined results\n", + "Added KEGG_1413_processed.json to combined results\n", + "Added KEGG_1414_processed.json to combined results\n", + "Added KEGG_1415_processed.json to combined results\n", + "Added KEGG_1416_processed.json to combined results\n", + "Added KEGG_1417_processed.json to combined results\n", + "Added KEGG_1418_processed.json to combined results\n", + "Added KEGG_1419_processed.json to combined results\n", + "Added KEGG_1420_processed.json to combined results\n", + "Added KEGG_1421_processed.json to combined results\n", + "Added KEGG_1422_processed.json to combined results\n", + "Added KEGG_1423_processed.json to combined results\n", + "Added KEGG_1424_processed.json to combined results\n", + "Skipping file with error: KEGG_1425_processed.json\n", + "Skipping file with error: KEGG_1426_processed.json\n", + "Added KEGG_1427_processed.json to combined results\n", + "Added KEGG_1428_processed.json to combined results\n", + "Added KEGG_1429_processed.json to combined results\n", + "Added KEGG_1430_processed.json to combined results\n", + "Added KEGG_1431_processed.json to combined results\n", + "Added KEGG_1432_processed.json to combined results\n", + "Added KEGG_1433_processed.json to combined results\n", + "Added KEGG_1434_processed.json to combined results\n", + "Added KEGG_1435_processed.json to combined results\n", + "Added KEGG_1436_processed.json to combined results\n", + "Added KEGG_1437_processed.json to combined results\n", + "Added KEGG_1438_processed.json to combined results\n", + "Added KEGG_1439_processed.json to combined results\n", + "Added KEGG_1440_processed.json to combined results\n", + 
"Added KEGG_1441_processed.json to combined results\n", + "Added KEGG_1442_processed.json to combined results\n", + "Added KEGG_1443_processed.json to combined results\n", + "Added KEGG_1444_processed.json to combined results\n", + "Added KEGG_1445_processed.json to combined results\n", + "Added KEGG_1446_processed.json to combined results\n", + "Added KEGG_1447_processed.json to combined results\n", + "Added KEGG_1448_processed.json to combined results\n", + "Added KEGG_1449_processed.json to combined results\n", + "Successfully combined 284 results. 5 files had errors.\n", + "Saved all results to 'all_variant_analyses.json'\n", + "Processing complete!\n", + "Results saved to processed_variants and combined in all_variant_analyses.json\n" + ] + } + ], + "source": [ + "# 2. Then run the process_genetic_variants function:\n", + "# Run the function with your parameters\n", + "result = process_genetic_variants(\n", + " file_path=\"final_network_with_variant.tsv\",\n", + " num_variants=20,\n", + " batch_size=5,\n", + " model=\"claude-3-7-sonnet-20250219\"\n", + ")\n", + "print(result)\n", + "\n", + "def process_genetic_variants(file_path, num_variants=20, batch_size=5, model=\"claude-3-7-sonnet-20250219\"):\n", + " \"\"\"\n", + " Process genetic variants from a TSV file using the Anthropic Claude API.\n", + " \n", + " Parameters:\n", + " -----------\n", + " file_path : str\n", + " Path to the TSV file containing variant data (relative to notebook location)\n", + " num_variants : int, optional\n", + " Number of variants to process (default: 20)\n", + " Set to None to process all variants in the file\n", + " batch_size : int, optional\n", + " Number of variants to process in each API batch (default: 5)\n", + " Smaller batches provide better error handling but may be slower\n", + " model : str, optional\n", + " Claude model to use (default: \"claude-3-7-sonnet-20250219\")\n", + " \n", + " Returns:\n", + " --------\n", + " str\n", + " Status message indicating completion 
and output locations\n", + " \n", + " Output Files:\n", + " -------------\n", + " - Individual analyses: saved in processed_variants/ directory\n", + " - Combined results: saved as all_variant_analyses.json\n", + " \"\"\"\n", + " print(f\"Genetic Variant Analysis Script\")\n", + " print(f\"===============================\")\n", + " print(f\"Model: {model}\")\n", + " print(f\"Batch size: {batch_size}\")\n", + " \n", + " # Load data\n", + " print(f\"Loading variant data from {file_path}...\")\n", + " try:\n", + " all_variants = load_variant_data(file_path)\n", + " print(f\"Loaded {len(all_variants)} variants in total\")\n", + " except FileNotFoundError:\n", + " return f\"Error: File '{file_path}' not found. Please check the file path.\"\n", + " except Exception as e:\n", + " return f\"Error loading data: {str(e)}\"\n", + " \n", + " # Limit to specified number of variants\n", + " if num_variants is None:\n", + " variants = all_variants\n", + " print(f\"Processing all {len(variants)} variants\")\n", + " else:\n", + " variants = all_variants[:num_variants]\n", + " print(f\"Processing the first {len(variants)} variants\")\n", + " \n", + " if not variants:\n", + " return \"Error: No variants to process\"\n", + " \n", + " # Process variants\n", + " try:\n", + " process_variants_in_batches(\n", + " variants,\n", + " batch_size=batch_size,\n", + " model=model\n", + " )\n", + " except Exception as e:\n", + " return f\"Error during processing: {str(e)}\"\n", + " \n", + " # Combine results\n", + " print(\"Combining results...\")\n", + " try:\n", + " combine_all_results()\n", + " except Exception as e:\n", + " print(f\"Warning: Error combining results: {str(e)}\")\n", + " \n", + " print(\"Processing complete!\")\n", + " return f\"Analysis complete. 
Results saved to '{output_dir}/' directory and combined in 'all_variant_analyses.json'\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage Examples\n", + "\n", + "Examples of how to run the genetic variant analysis with different parameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notes and Considerations\n", + "\n", + "### API Usage\n", + "- This notebook uses the Anthropic Claude API which requires an API key\n", + "- Processing large numbers of variants will consume significant API credits\n", + "- Consider rate limits and batch sizes based on your API plan\n", + "\n", + "### Data Requirements\n", + "- Input data should be in TSV format with required columns\n", + "- Gene and Disease fields should contain valid JSON when structured data is available\n", + "- Ensure your input file path is correct relative to the notebook location\n", + "\n", + "### Output\n", + "- Individual variant analyses are saved in the `processed_variants/` directory\n", + "- Combined results are saved as `all_variant_analyses.json`\n", + "- Failed analyses are saved with error information for debugging\n", + "\n", + "### Customization\n", + "- Adjust `num_variants` and `batch_size` parameters based on your needs\n", + "- Modify the prompt template in `create_variant_prompt()` for different analysis focuses\n", + "- Change the output directory by modifying the `output_dir` variable\n", + "\n", + "### Example 1: Basic usage with default parameters\n", + "Process first 20 variants from the KEGG dataset\n", + "```python\n", + "file_path = \"kegg_data/final_network_with_variant.tsv\"\n", + "\n", + "result = process_genetic_variants(\n", + " file_path=file_path,\n", + " num_variants=20, # Process first 20 variants\n", + " batch_size=5, # Process 5 variants per batch\n", + " model=\"claude-3-7-sonnet-20250219\"\n", + ")\n", + "print(result)\n", + "```\n", + "\n", + "### Example 2: Process more variants with larger 
batches\n", + "Copy the following into a code cell to run:\n", + "```python\n", + "result = process_genetic_variants(\n", + " file_path=file_path,\n", + " num_variants=100, # Process first 100 variants\n", + " batch_size=10, # Larger batches for efficiency\n", + " model=\"claude-3-7-sonnet-20250219\"\n", + ")\n", + "print(result)\n", + "```\n", + "\n", + "### Example 3: Process all variants in the file\n", + "Copy the following into a code cell to run (be aware of API costs):\n", + "```python\n", + "result = process_genetic_variants(\n", + " file_path=file_path,\n", + " num_variants=None, # Process all variants\n", + " batch_size=5, # Conservative batch size\n", + " model=\"claude-3-7-sonnet-20250219\"\n", + ")\n", + "print(result)\n", + "```" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/BioReason-main/data/Clinvar_Coding.ipynb b/BioReason-main/data/Clinvar_Coding.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..40e27396dd2bdda32be000db1eb372a6b144c603 --- /dev/null +++ b/BioReason-main/data/Clinvar_Coding.ipynb @@ -0,0 +1,2481 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "83c9cd1f", + "metadata": {}, + "source": [ + "## Setup and Data Preparation\n", + "\n", + "Initial setup steps to prepare the working environment and extract ClinVar data." + ] + }, + { + "cell_type": "markdown", + "id": "81a36253-9050-4d58-96cd-8238aae51e0e", + "metadata": {}, + "source": [ + "# ClinVar Coding Variants Data Processing\n", + "\n", + "This notebook processes ClinVar coding variants data by extracting additional information including gene names, gene IDs, and associated diseases from ClinVar XML records.\n", + "\n", + "## Overview\n", + "\n", + "The workflow includes:\n", + "1. 
**Data Extraction**: Filter ClinVar entries from VEP-annotated pathogenic coding variants\n", + "2. **XML Processing**: Parse ClinVar XML records to extract gene and disease information\n", + "3. **Gene Annotation**: Map gene IDs to gene names using NCBI Entrez utilities\n", + "4. **Data Integration**: Combine all information into a comprehensive dataset\n", + "\n", + "## Requirements\n", + "\n", + "- Python 3.7+\n", + "- pandas library\n", + "- xml.etree.ElementTree (built-in)\n", + "- NCBI Entrez Direct tools (for gene name mapping)\n", + "- Input data: VEP-annotated pathogenic coding variants CSV file\n", + "\n", + "## Data Structure\n", + "\n", + "The processing creates a dataset with the following key columns:\n", + "- Variant information (chromosome, position, alleles)\n", + "- ClinVar ID and significance\n", + "- Gene symbols and IDs\n", + "- Associated disease/phenotype information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb351234-50a3-4061-81ce-bdce5343e790", + "metadata": {}, + "outputs": [], + "source": [ + "# Create working directory for ClinVar data processing\n", + "import os\n", + "os.makedirs('clinvar', exist_ok=True)\n", + "print(\"✅ Created 'clinvar' directory\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "443ccab8-50a1-45ae-950c-8425eb318e93", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Navigate to clinvar directory\n", + "os.chdir('clinvar')\n", + "print(f\"📁 Current working directory: {os.getcwd()}\")\n", + "\n", + "with open('vep_pathogenic_coding.csv') as infile, open('clinvar_coding_raw.csv', 'w') as outfile:\n", + " for line in infile:\n", + " if 'ClinVar' in line:\n", + " outfile.write(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1f92675-b85c-4baa-8680-9c3776e04ac9", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract ClinVar entries from VEP-annotated pathogenic coding variants\n", + "# Note: Update 
the input file path to match your data location\n", + "input_file = \"../data/vep_pathogenic_coding.csv\" # Adjust path as needed\n", + "output_file = \"clinvar_coding_raw.csv\"\n", + "\n", + "# Use shell command to filter ClinVar entries\n", + "import subprocess\n", + "try:\n", + " result = subprocess.run(\n", + " [\"grep\", \"ClinVar\", input_file],\n", + " capture_output=True,\n", + " text=True,\n", + " check=True\n", + " )\n", + " \n", + " with open(output_file, 'w') as f:\n", + " f.write(result.stdout)\n", + " \n", + " print(f\"✅ Extracted ClinVar entries to {output_file}\")\n", + " print(f\"📊 Found {len(result.stdout.strip().split('\\n'))} ClinVar entries\")\n", + " \n", + "except subprocess.CalledProcessError:\n", + " print(f\"❌ Error: Could not find ClinVar entries in {input_file}\")\n", + " print(\"Please ensure the input file exists and contains ClinVar annotations\")\n", + "except FileNotFoundError:\n", + " print(f\"❌ Error: Input file {input_file} not found\")\n", + " print(\"Please update the input_file path to point to your VEP-annotated data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e560308-135b-4189-9146-ff50845839a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract ClinVar IDs from the filtered data (assuming ID is in column 8)\n", + "# Note: Adjust column number if your data structure is different\n", + "import pandas as pd\n", + "\n", + "try:\n", + " # Read the raw ClinVar data to determine structure\n", + " df_temp = pd.read_csv(\"clinvar_coding_raw.csv\")\n", + " print(f\"📋 Data shape: {df_temp.shape}\")\n", + " print(f\"📋 Columns: {list(df_temp.columns)}\")\n", + " \n", + " # Extract ClinVar IDs (adjust column index as needed)\n", + " # Column 8 corresponds to index 7 in Python (0-based)\n", + " if df_temp.shape[1] >= 8:\n", + " clinvar_ids = df_temp.iloc[:, 7] # 8th column (0-based index 7)\n", + " \n", + " # Save IDs to file\n", + " with open(\"Clinvar_ID.txt\", 'w') as f:\n", + " for id_val 
in clinvar_ids:\n", + " if pd.notna(id_val):\n", + " f.write(f\"{id_val}\\n\")\n", + " \n", + " print(f\"✅ Extracted {len(clinvar_ids.dropna())} ClinVar IDs to Clinvar_ID.txt\")\n", + " else:\n", + " print(f\"❌ Error: Expected at least 8 columns, found {df_temp.shape[1]}\")\n", + " \n", + "except FileNotFoundError:\n", + " print(\"❌ Error: clinvar_coding_raw.csv not found\")\n", + " print(\"Please run the previous cell first to extract ClinVar data\")\n", + "except Exception as e:\n", + " print(f\"❌ Error processing ClinVar data: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53b0dfd8-8d49-4c3f-adb4-4c6bfbffcfa9", + "metadata": {}, + "outputs": [], + "source": [ + "chmod +x Clinvar_esearch.sh\n", + "\n", + "## XML Data Retrieval\n", + "\n", + "**Note**: This step requires creating a shell script (`Clinvar_esearch.sh`) to fetch XML data from NCBI.\n", + "\n", + "The script should:\n", + "1. Read ClinVar IDs from `Clinvar_ID.txt`\n", + "2. Use NCBI Entrez Direct tools to fetch XML records\n", + "3. 
Save XML files in a `data/` subdirectory\n", + "\n", + "Example script content:\n", + "```bash\n", + "#!/bin/bash\n", + "mkdir -p data\n", + "while read -r id; do\n", + " esearch -db clinvar -query \"$id\" | efetch -format xml > \"data/${id}.xml\"\n", + " echo \"Downloaded XML for ClinVar ID: $id\"\n", + "done < Clinvar_ID.txt\n", + "```\n", + "\n", + "**Prerequisites**: Install NCBI Entrez Direct tools:\n", + "- macOS: `brew install brewsci/bio/edirect`\n", + "- Linux: Follow NCBI EDirect installation guide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0755ad6d", + "metadata": {}, + "outputs": [], + "source": [ + "# Parsing XML for Gene and Disease\n", + "\n", + "# Make the ClinVar search script executable and run it\n", + "# Note: This assumes you have created the Clinvar_esearch.sh script\n", + "\n", + "import os\n", + "import subprocess\n", + "\n", + "script_path = \"Clinvar_esearch.sh\"\n", + "\n", + "if os.path.exists(script_path):\n", + " # Make script executable\n", + " os.chmod(script_path, 0o755)\n", + " print(f\"✅ Made {script_path} executable\")\n", + " \n", + " # Optionally run the script (uncomment if you want to execute automatically)\n", + " # print(\"🚀 Running ClinVar XML download script...\")\n", + " # result = subprocess.run([f\"./{script_path}\"], capture_output=True, text=True)\n", + " # if result.returncode == 0:\n", + " # print(\"✅ XML download completed successfully\")\n", + " # else:\n", + " # print(f\"❌ Script execution failed: {result.stderr}\")\n", + "else:\n", + " print(f\"⚠️ Warning: {script_path} not found\")\n", + " print(\"Please create this script manually to download ClinVar XML data\")\n", + " print(\"See the documentation in the previous cell for script template\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d21a188b-a0dc-4af2-9b71-5a44d8cd4673", + "metadata": {}, + "outputs": [], + "source": [ + "# Import required libraries\n", + "import pandas as pd\n", + "import 
xml.etree.ElementTree as ET\n", + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "print(\"📚 Libraries imported successfully\")\n", + "print(f\"📁 Current directory: {os.getcwd()}\")\n", + "print(f\"📊 Pandas version: {pd.__version__}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1365615b-ee81-4df0-9fca-df001e9f01d4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the raw ClinVar data\n", + "try:\n", + " clinvar_raw = pd.read_csv(\"clinvar_coding_raw.csv\")\n", + " print(f\"✅ Loaded ClinVar data: {clinvar_raw.shape[0]} rows, {clinvar_raw.shape[1]} columns\")\n", + " print(f\"📋 Columns: {list(clinvar_raw.columns)[:10]}\") # Show first 10 columns\n", + " \n", + "except FileNotFoundError:\n", + " print(\"❌ Error: clinvar_coding_raw.csv not found\")\n", + " print(\"Please run the data extraction steps first\")\n", + " clinvar_raw = None\n", + "except Exception as e:\n", + " print(f\"❌ Error loading data: {e}\")\n", + " clinvar_raw = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7144ddf2-abf7-4680-b578-d4bd4b7195ea", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove unnecessary columns to streamline the dataset\n", + "# Note: Adjust column names based on your actual data structure\n", + "\n", + "if clinvar_raw is not None:\n", + " columns_to_remove = [\n", + " \"GENOMIC_MUTATION_ID\", \"N_SAMPLES\", \"TOTAL_SAMPLES\", \"FREQ\", \n", + " \"OMIM\", \"PMID\", \"AC\", \"AN\", \"AF\", \"MAF\", \"MAC\"\n", + " ]\n", + " \n", + " # Only remove columns that actually exist in the dataset\n", + " existing_columns = [col for col in columns_to_remove if col in clinvar_raw.columns]\n", + " missing_columns = [col for col in columns_to_remove if col not in clinvar_raw.columns]\n", + " \n", + " if existing_columns:\n", + " clinvar_raw = clinvar_raw.drop(columns=existing_columns)\n", + " print(f\"✅ Removed {len(existing_columns)} columns: {existing_columns}\")\n", + " \n", + " 
if missing_columns:\n", + " print(f\"ℹ️ Columns not found (skipped): {missing_columns}\")\n", + " \n", + " print(f\"📊 Remaining columns: {clinvar_raw.shape[1]}\")\n", + "else:\n", + " print(\"⚠️ Skipping column removal - data not loaded\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbffd3cd-7df3-43e2-8d73-01f54e8d1da6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSREFALTLABELSOURCECONSEQUENCEIDREVIEW_STATUSGENEsplitINT_LABEL
0chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedNaNtrain1
1chr11050449GAPathogenicClinVarmissense_variant1284257no_assertion_criteria_providedNaNtrain1
2chr11050575GCPathogenicClinVarmissense_variant18241no_assertion_criteria_providedNaNtrain1
3chr11213738GAPathogenicClinVarmissense_variant96692no_assertion_criteria_providedNaNtrain1
4chr11232279AGPathogenicClinVarinitiatior_codon_variant,missense_variant60484criteria_provided,_multiple_submitters,_no_con...NaNtrain1
.......................................
22249chrY2787412CTPathogenicClinVarmissense_variant9747no_assertion_criteria_providedNaNtrain1
22250chrY2787426CGPathogenicClinVarmissense_variant9739criteria_provided,_single_submitterNaNtrain1
22251chrY2787515CAPathogenicClinVarmissense_variant492908no_assertion_criteria_providedNaNtrain1
22252chrY2787551CTPathogenicClinVarmissense_variant9754no_assertion_criteria_providedNaNtrain1
22253chrY7063898ATPathogenicClinVarmissense_variant625467no_assertion_criteria_providedNaNtrain1
\n", + "

22254 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS REF ALT LABEL SOURCE \\\n", + "0 chr1 976215 A G Pathogenic ClinVar \n", + "1 chr1 1050449 G A Pathogenic ClinVar \n", + "2 chr1 1050575 G C Pathogenic ClinVar \n", + "3 chr1 1213738 G A Pathogenic ClinVar \n", + "4 chr1 1232279 A G Pathogenic ClinVar \n", + "... ... ... .. .. ... ... \n", + "22249 chrY 2787412 C T Pathogenic ClinVar \n", + "22250 chrY 2787426 C G Pathogenic ClinVar \n", + "22251 chrY 2787515 C A Pathogenic ClinVar \n", + "22252 chrY 2787551 C T Pathogenic ClinVar \n", + "22253 chrY 7063898 A T Pathogenic ClinVar \n", + "\n", + " CONSEQUENCE ID \\\n", + "0 missense_variant 1320032 \n", + "1 missense_variant 1284257 \n", + "2 missense_variant 18241 \n", + "3 missense_variant 96692 \n", + "4 initiatior_codon_variant,missense_variant 60484 \n", + "... ... ... \n", + "22249 missense_variant 9747 \n", + "22250 missense_variant 9739 \n", + "22251 missense_variant 492908 \n", + "22252 missense_variant 9754 \n", + "22253 missense_variant 625467 \n", + "\n", + " REVIEW_STATUS GENE split \\\n", + "0 no_assertion_criteria_provided NaN train \n", + "1 no_assertion_criteria_provided NaN train \n", + "2 no_assertion_criteria_provided NaN train \n", + "3 no_assertion_criteria_provided NaN train \n", + "4 criteria_provided,_multiple_submitters,_no_con... NaN train \n", + "... ... ... ... \n", + "22249 no_assertion_criteria_provided NaN train \n", + "22250 criteria_provided,_single_submitter NaN train \n", + "22251 no_assertion_criteria_provided NaN train \n", + "22252 no_assertion_criteria_provided NaN train \n", + "22253 no_assertion_criteria_provided NaN train \n", + "\n", + " INT_LABEL \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... 
\n", + "22249 1 \n", + "22250 1 \n", + "22251 1 \n", + "22252 1 \n", + "22253 1 \n", + "\n", + "[22254 rows x 12 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_raw\n", + "\n", + "# Preview the cleaned dataset\n", + "if clinvar_raw is not None:\n", + " print(f\"📊 Dataset shape: {clinvar_raw.shape}\")\n", + " print(f\"📋 Column names: {list(clinvar_raw.columns)}\")\n", + " print(\"\\n🔍 First few rows:\")\n", + " display(clinvar_raw.head())\n", + " \n", + " # Check for any null values\n", + " null_counts = clinvar_raw.isnull().sum()\n", + " if null_counts.sum() > 0:\n", + " print(\"\\n⚠️ Null values found:\")\n", + " print(null_counts[null_counts > 0])\n", + "else:\n", + " print(\"❌ No data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e380634b-0c22-4d1e-8520-6fc5728e7de5", + "metadata": {}, + "outputs": [], + "source": [ + "# Add new columns for gene information\n", + "if clinvar_raw is not None:\n", + " clinvar_raw['GENE_ID'] = \"\"\n", + " clinvar_raw['GENE'] = \"\"\n", + " print(\"✅ Added GENE_ID and GENE columns\")\n", + " print(f\"📊 Updated dataset shape: {clinvar_raw.shape}\")\n", + "else:\n", + " print(\"⚠️ Cannot add columns - data not loaded\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92b159f5-694d-4ee4-9616-1ebf00f71904", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSREFALTLABELSOURCECONSEQUENCEIDREVIEW_STATUSGENEsplitINT_LABELGENE_ID
0chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedtrain1
1chr11050449GAPathogenicClinVarmissense_variant1284257no_assertion_criteria_providedtrain1
2chr11050575GCPathogenicClinVarmissense_variant18241no_assertion_criteria_providedtrain1
3chr11213738GAPathogenicClinVarmissense_variant96692no_assertion_criteria_providedtrain1
4chr11232279AGPathogenicClinVarinitiatior_codon_variant,missense_variant60484criteria_provided,_multiple_submitters,_no_con...train1
..........................................
22249chrY2787412CTPathogenicClinVarmissense_variant9747no_assertion_criteria_providedtrain1
22250chrY2787426CGPathogenicClinVarmissense_variant9739criteria_provided,_single_submittertrain1
22251chrY2787515CAPathogenicClinVarmissense_variant492908no_assertion_criteria_providedtrain1
22252chrY2787551CTPathogenicClinVarmissense_variant9754no_assertion_criteria_providedtrain1
22253chrY7063898ATPathogenicClinVarmissense_variant625467no_assertion_criteria_providedtrain1
\n", + "

22254 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS REF ALT LABEL SOURCE \\\n", + "0 chr1 976215 A G Pathogenic ClinVar \n", + "1 chr1 1050449 G A Pathogenic ClinVar \n", + "2 chr1 1050575 G C Pathogenic ClinVar \n", + "3 chr1 1213738 G A Pathogenic ClinVar \n", + "4 chr1 1232279 A G Pathogenic ClinVar \n", + "... ... ... .. .. ... ... \n", + "22249 chrY 2787412 C T Pathogenic ClinVar \n", + "22250 chrY 2787426 C G Pathogenic ClinVar \n", + "22251 chrY 2787515 C A Pathogenic ClinVar \n", + "22252 chrY 2787551 C T Pathogenic ClinVar \n", + "22253 chrY 7063898 A T Pathogenic ClinVar \n", + "\n", + " CONSEQUENCE ID \\\n", + "0 missense_variant 1320032 \n", + "1 missense_variant 1284257 \n", + "2 missense_variant 18241 \n", + "3 missense_variant 96692 \n", + "4 initiatior_codon_variant,missense_variant 60484 \n", + "... ... ... \n", + "22249 missense_variant 9747 \n", + "22250 missense_variant 9739 \n", + "22251 missense_variant 492908 \n", + "22252 missense_variant 9754 \n", + "22253 missense_variant 625467 \n", + "\n", + " REVIEW_STATUS GENE split \\\n", + "0 no_assertion_criteria_provided train \n", + "1 no_assertion_criteria_provided train \n", + "2 no_assertion_criteria_provided train \n", + "3 no_assertion_criteria_provided train \n", + "4 criteria_provided,_multiple_submitters,_no_con... train \n", + "... ... ... ... \n", + "22249 no_assertion_criteria_provided train \n", + "22250 criteria_provided,_single_submitter train \n", + "22251 no_assertion_criteria_provided train \n", + "22252 no_assertion_criteria_provided train \n", + "22253 no_assertion_criteria_provided train \n", + "\n", + " INT_LABEL GENE_ID \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... ... 
\n", + "22249 1 \n", + "22250 1 \n", + "22251 1 \n", + "22252 1 \n", + "22253 1 \n", + "\n", + "[22254 rows x 13 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_raw\n", + "\n", + "# Display updated dataset with new columns\n", + "if clinvar_raw is not None:\n", + " print(f\"📊 Dataset with new columns: {clinvar_raw.shape}\")\n", + " print(f\"📋 All columns: {list(clinvar_raw.columns)}\")\n", + " display(clinvar_raw.head())\n", + "else:\n", + " print(\"❌ No data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36db716-392a-46a8-a404-d78165a4623c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import xml.etree.ElementTree as ET\n", + "import os\n", + "\n", + "# Parse ClinVar XML files to extract gene information\n", + "# This processes each ClinVar ID and extracts gene symbols and IDs from XML records\n", + "\n", + "if clinvar_raw is not None:\n", + " # Load list of ClinVar IDs\n", + " try:\n", + " with open(\"Clinvar_ID.txt\", \"r\") as f:\n", + " clinvar_ids = [line.strip() for line in f if line.strip()]\n", + " \n", + " print(f\"📋 Processing {len(clinvar_ids)} ClinVar IDs\")\n", + " \n", + " processed_count = 0\n", + " error_count = 0\n", + " \n", + " # Process each ClinVar ID\n", + " for i, clinvar_id in enumerate(clinvar_ids):\n", + " if i % 100 == 0: # Progress indicator\n", + " print(f\"📊 Processing ID {i+1}/{len(clinvar_ids)}...\")\n", + " \n", + " try:\n", + " id_int = int(clinvar_id)\n", + " xml_path = f'data/{clinvar_id}.xml'\n", + " \n", + " # Check if XML file exists\n", + " if not os.path.exists(xml_path):\n", + " print(f\"⚠️ XML file not found: {xml_path}\")\n", + " continue\n", + " \n", + " # Parse XML file\n", + " with open(xml_path, 'r', encoding='utf-8') as file:\n", + " tree = ET.parse(file)\n", + " root = tree.getroot()\n", + " \n", + " # Check for error in XML\n", + " error_element = 
root.find(\".//error\")\n", + " if error_element is not None:\n", + " # Remove entries with errors\n", + " clinvar_raw = clinvar_raw[clinvar_raw[\"ID\"] != id_int]\n", + " error_count += 1\n", + " continue\n", + " \n", + " # Extract gene information\n", + " gene_names = []\n", + " gene_ids = []\n", + " \n", + " for gene in root.findall(\".//genes/gene\"):\n", + " symbol = gene.findtext(\"symbol\")\n", + " gene_id_data = gene.findtext(\"GeneID\")\n", + " \n", + " if symbol:\n", + " gene_names.append(symbol)\n", + " if gene_id_data:\n", + " gene_ids.append(gene_id_data)\n", + " \n", + " # Join multiple entries with commas\n", + " gene_name_str = \", \".join(gene_names) if gene_names else \"\"\n", + " gene_id_str = \", \".join(gene_ids) if gene_ids else \"\"\n", + " \n", + " # Update DataFrame\n", + " mask = clinvar_raw[\"ID\"] == id_int\n", + " if mask.any():\n", + " clinvar_raw.loc[mask, \"GENE\"] = gene_name_str\n", + " clinvar_raw.loc[mask, \"GENE_ID\"] = gene_id_str\n", + " processed_count += 1\n", + " \n", + " except ET.ParseError as e:\n", + " print(f\"⚠️ XML parsing error for {clinvar_id}: {e}\")\n", + " error_count += 1\n", + " except ValueError as e:\n", + " print(f\"⚠️ Invalid ClinVar ID {clinvar_id}: {e}\")\n", + " error_count += 1\n", + " except Exception as e:\n", + " print(f\"⚠️ Unexpected error processing {clinvar_id}: {e}\")\n", + " error_count += 1\n", + " \n", + " print(f\"\\n✅ Processing complete:\")\n", + " print(f\" 📊 Successfully processed: {processed_count}\")\n", + " print(f\" ❌ Errors encountered: {error_count}\")\n", + " print(f\" 📋 Final dataset shape: {clinvar_raw.shape}\")\n", + " \n", + " except FileNotFoundError:\n", + " print(\"❌ Error: Clinvar_ID.txt not found\")\n", + " print(\"Please run the ID extraction step first\")\n", + " except Exception as e:\n", + " print(f\"❌ Error during XML processing: {e}\")\n", + "else:\n", + " print(\"⚠️ Cannot process XML files - ClinVar data not loaded\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "ae0c9d8b-1b12-40a4-82ec-c3452e9dda90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSREFALTLABELSOURCECONSEQUENCEIDREVIEW_STATUSGENEsplitINT_LABELGENE_ID
0chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedPERM1train184808
1chr11050449GAPathogenicClinVarmissense_variant1284257no_assertion_criteria_providedAGRNtrain1375790
2chr11050575GCPathogenicClinVarmissense_variant18241no_assertion_criteria_providedAGRNtrain1375790
3chr11213738GAPathogenicClinVarmissense_variant96692no_assertion_criteria_providedTNFRSF4train17293
4chr11232279AGPathogenicClinVarinitiatior_codon_variant,missense_variant60484criteria_provided,_multiple_submitters,_no_con...B3GALT6train1126792
..........................................
22249chrY2787412CTPathogenicClinVarmissense_variant9747no_assertion_criteria_providedSRYtrain16736
22250chrY2787426CGPathogenicClinVarmissense_variant9739criteria_provided,_single_submitterSRYtrain16736
22251chrY2787515CAPathogenicClinVarmissense_variant492908no_assertion_criteria_providedSRYtrain16736
22252chrY2787551CTPathogenicClinVarmissense_variant9754no_assertion_criteria_providedSRYtrain16736
22253chrY7063898ATPathogenicClinVarmissense_variant625467no_assertion_criteria_providedLOC126057105, TBL1Ytrain1126057105, 90665
\n", + "

22150 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS REF ALT LABEL SOURCE \\\n", + "0 chr1 976215 A G Pathogenic ClinVar \n", + "1 chr1 1050449 G A Pathogenic ClinVar \n", + "2 chr1 1050575 G C Pathogenic ClinVar \n", + "3 chr1 1213738 G A Pathogenic ClinVar \n", + "4 chr1 1232279 A G Pathogenic ClinVar \n", + "... ... ... .. .. ... ... \n", + "22249 chrY 2787412 C T Pathogenic ClinVar \n", + "22250 chrY 2787426 C G Pathogenic ClinVar \n", + "22251 chrY 2787515 C A Pathogenic ClinVar \n", + "22252 chrY 2787551 C T Pathogenic ClinVar \n", + "22253 chrY 7063898 A T Pathogenic ClinVar \n", + "\n", + " CONSEQUENCE ID \\\n", + "0 missense_variant 1320032 \n", + "1 missense_variant 1284257 \n", + "2 missense_variant 18241 \n", + "3 missense_variant 96692 \n", + "4 initiatior_codon_variant,missense_variant 60484 \n", + "... ... ... \n", + "22249 missense_variant 9747 \n", + "22250 missense_variant 9739 \n", + "22251 missense_variant 492908 \n", + "22252 missense_variant 9754 \n", + "22253 missense_variant 625467 \n", + "\n", + " REVIEW_STATUS GENE \\\n", + "0 no_assertion_criteria_provided PERM1 \n", + "1 no_assertion_criteria_provided AGRN \n", + "2 no_assertion_criteria_provided AGRN \n", + "3 no_assertion_criteria_provided TNFRSF4 \n", + "4 criteria_provided,_multiple_submitters,_no_con... B3GALT6 \n", + "... ... ... \n", + "22249 no_assertion_criteria_provided SRY \n", + "22250 criteria_provided,_single_submitter SRY \n", + "22251 no_assertion_criteria_provided SRY \n", + "22252 no_assertion_criteria_provided SRY \n", + "22253 no_assertion_criteria_provided LOC126057105, TBL1Y \n", + "\n", + " split INT_LABEL GENE_ID \n", + "0 train 1 84808 \n", + "1 train 1 375790 \n", + "2 train 1 375790 \n", + "3 train 1 7293 \n", + "4 train 1 126792 \n", + "... ... ... ... 
\n", + "22249 train 1 6736 \n", + "22250 train 1 6736 \n", + "22251 train 1 6736 \n", + "22252 train 1 6736 \n", + "22253 train 1 126057105, 90665 \n", + "\n", + "[22150 rows x 13 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_raw\n", + "\n", + "# Display the dataset with extracted gene information\n", + "if clinvar_raw is not None:\n", + " print(f\"📊 Dataset after gene extraction: {clinvar_raw.shape}\")\n", + " \n", + " # Show statistics\n", + " gene_filled = (clinvar_raw['GENE'] != '').sum()\n", + " gene_id_filled = (clinvar_raw['GENE_ID'] != '').sum()\n", + " \n", + " print(f\"📋 Entries with gene names: {gene_filled} ({gene_filled/len(clinvar_raw)*100:.1f}%)\")\n", + " print(f\"📋 Entries with gene IDs: {gene_id_filled} ({gene_id_filled/len(clinvar_raw)*100:.1f}%)\")\n", + " \n", + " # Show sample data\n", + " display(clinvar_raw.head(10))\n", + "else:\n", + " print(\"❌ No data to display\")" + ] + }, + { + "cell_type": "markdown", + "id": "b76910bd-aa86-4943-a0f2-dcf9756ad81d", + "metadata": {}, + "source": [ + "## Disease/Phenotype Information Extraction\n", + "\n", + "This section extracts disease and phenotype information from the ClinVar XML records. 
Each variant may be associated with multiple diseases, so the data is expanded to create one row per variant-disease combination.\n", + "\n", + "### Putting in the Disease Name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54ccd972-5804-4d63-9012-5531034d2b60", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract disease/phenotype information from ClinVar XML files\n", + "# This creates multiple rows for variants associated with multiple diseases\n", + "\n", + "if clinvar_raw is not None:\n", + " try:\n", + " # Load ClinVar IDs\n", + " with open(\"Clinvar_ID.txt\", \"r\") as f:\n", + " clinvar_ids = [line.strip() for line in f if line.strip()]\n", + " \n", + " print(f\"📋 Processing {len(clinvar_ids)} ClinVar IDs for disease extraction\")\n", + " \n", + " # Ensure ID column is integer type\n", + " clinvar_raw[\"ID\"] = clinvar_raw[\"ID\"].astype(int)\n", + " \n", + " # Create new DataFrame to store expanded data\n", + " clinvar_data = pd.DataFrame(columns=clinvar_raw.columns.tolist() + [\"Disease\"])\n", + " \n", + " processed_count = 0\n", + " disease_count = 0\n", + " \n", + " # Process each ClinVar ID\n", + " for i, clinvar_id in enumerate(clinvar_ids):\n", + " if i % 100 == 0: # Progress indicator\n", + " print(f\"📊 Processing disease info {i+1}/{len(clinvar_ids)}...\")\n", + " \n", + " try:\n", + " id_int = int(clinvar_id)\n", + " xml_path = f\"data/{clinvar_id}.xml\"\n", + " \n", + " if not os.path.exists(xml_path):\n", + " continue\n", + " \n", + " # Parse XML\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + " \n", + " # Extract all trait names (diseases/phenotypes)\n", + " trait_names = []\n", + " for trait in root.findall(\".//trait\"):\n", + " trait_name = trait.findtext(\"trait_name\")\n", + " if trait_name:\n", + " trait_names.append(trait_name)\n", + " \n", + " # Filter out 'not provided' if other traits exist\n", + " filtered_traits = [t for t in trait_names if t.lower() != \"not provided\"]\n", 
+ " if not filtered_traits and \"not provided\" in [t.lower() for t in trait_names]:\n", + " filtered_traits = [\"not provided\"]\n", + " \n", + " # If no traits found, use empty string\n", + " if not filtered_traits:\n", + " filtered_traits = [\"\"]\n", + " \n", + " # Create one row for each disease/trait\n", + " base_row = clinvar_raw[clinvar_raw[\"ID\"] == id_int]\n", + " if not base_row.empty:\n", + " for disease_name in filtered_traits:\n", + " new_row = base_row.copy()\n", + " new_row[\"Disease\"] = disease_name\n", + " clinvar_data = pd.concat([clinvar_data, new_row], ignore_index=True)\n", + " disease_count += 1\n", + " processed_count += 1\n", + " \n", + " except ET.ParseError as e:\n", + " print(f\"⚠️ XML parsing error for {clinvar_id}: {e}\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Error processing {clinvar_id}: {e}\")\n", + " \n", + " print(f\"\\n✅ Disease extraction complete:\")\n", + " print(f\" 📊 Variants processed: {processed_count}\")\n", + " print(f\" 🔬 Total variant-disease pairs: {disease_count}\")\n", + " print(f\" 📋 Final dataset shape: {clinvar_data.shape}\")\n", + " \n", + " # Save intermediate results\n", + " clinvar_data.to_csv(\"clinvar_with_disease.csv\", sep='\\t', index=False)\n", + " print(\"💾 Saved results to clinvar_with_disease.csv\")\n", + " \n", + " except FileNotFoundError:\n", + " print(\"❌ Error: Required files not found\")\n", + " print(\"Please ensure Clinvar_ID.txt exists and XML files are downloaded\")\n", + " clinvar_data = None\n", + " except Exception as e:\n", + " print(f\"❌ Error during disease extraction: {e}\")\n", + " clinvar_data = None\n", + "else:\n", + " print(\"⚠️ Cannot extract diseases - ClinVar data not loaded\")\n", + " clinvar_data = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "277445cd-72b9-44a4-a257-49cd3202e501", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSREFALTLABELSOURCECONSEQUENCEIDREVIEW_STATUSGENEsplitINT_LABELGENE_IDDisease
0chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedPERM1train184808Renal tubular epithelial cell apoptosis
1chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedPERM1train184808Neutrophil inclusion bodies
2chr11050449GAPathogenicClinVarmissense_variant1284257no_assertion_criteria_providedAGRNtrain1375790Congenital myasthenic syndrome 8
3chr11050575GCPathogenicClinVarmissense_variant18241no_assertion_criteria_providedAGRNtrain1375790Congenital myasthenic syndrome 8
4chr11213738GAPathogenicClinVarmissense_variant96692no_assertion_criteria_providedTNFRSF4train17293Combined immunodeficiency due to OX40 deficiency
.............................................
32680chrY2787412CTPathogenicClinVarmissense_variant9747no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1
32681chrY2787426CGPathogenicClinVarmissense_variant9739criteria_provided,_single_submitterSRYtrain16736not provided
32682chrY2787515CAPathogenicClinVarmissense_variant492908no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1
32683chrY2787551CTPathogenicClinVarmissense_variant9754no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1
32684chrY7063898ATPathogenicClinVarmissense_variant625467no_assertion_criteria_providedLOC126057105, TBL1Ytrain1126057105, 90665Deafness, Y-linked 2
\n", + "

32685 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS REF ALT LABEL SOURCE CONSEQUENCE ID \\\n", + "0 chr1 976215 A G Pathogenic ClinVar missense_variant 1320032 \n", + "1 chr1 976215 A G Pathogenic ClinVar missense_variant 1320032 \n", + "2 chr1 1050449 G A Pathogenic ClinVar missense_variant 1284257 \n", + "3 chr1 1050575 G C Pathogenic ClinVar missense_variant 18241 \n", + "4 chr1 1213738 G A Pathogenic ClinVar missense_variant 96692 \n", + "... ... ... .. .. ... ... ... ... \n", + "32680 chrY 2787412 C T Pathogenic ClinVar missense_variant 9747 \n", + "32681 chrY 2787426 C G Pathogenic ClinVar missense_variant 9739 \n", + "32682 chrY 2787515 C A Pathogenic ClinVar missense_variant 492908 \n", + "32683 chrY 2787551 C T Pathogenic ClinVar missense_variant 9754 \n", + "32684 chrY 7063898 A T Pathogenic ClinVar missense_variant 625467 \n", + "\n", + " REVIEW_STATUS GENE split \\\n", + "0 no_assertion_criteria_provided PERM1 train \n", + "1 no_assertion_criteria_provided PERM1 train \n", + "2 no_assertion_criteria_provided AGRN train \n", + "3 no_assertion_criteria_provided AGRN train \n", + "4 no_assertion_criteria_provided TNFRSF4 train \n", + "... ... ... ... \n", + "32680 no_assertion_criteria_provided SRY train \n", + "32681 criteria_provided,_single_submitter SRY train \n", + "32682 no_assertion_criteria_provided SRY train \n", + "32683 no_assertion_criteria_provided SRY train \n", + "32684 no_assertion_criteria_provided LOC126057105, TBL1Y train \n", + "\n", + " INT_LABEL GENE_ID \\\n", + "0 1 84808 \n", + "1 1 84808 \n", + "2 1 375790 \n", + "3 1 375790 \n", + "4 1 7293 \n", + "... ... ... \n", + "32680 1 6736 \n", + "32681 1 6736 \n", + "32682 1 6736 \n", + "32683 1 6736 \n", + "32684 1 126057105, 90665 \n", + "\n", + " Disease \n", + "0 Renal tubular epithelial cell apoptosis \n", + "1 Neutrophil inclusion bodies \n", + "2 Congenital myasthenic syndrome 8 \n", + "3 Congenital myasthenic syndrome 8 \n", + "4 Combined immunodeficiency due to OX40 deficiency \n", + "... ... 
\n", + "32680 46,XY sex reversal 1 \n", + "32681 not provided \n", + "32682 46,XY sex reversal 1 \n", + "32683 46,XY sex reversal 1 \n", + "32684 Deafness, Y-linked 2 \n", + "\n", + "[32685 rows x 14 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_data\n", + "\n", + "# Display the dataset with disease information\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " print(f\"📊 Dataset with diseases: {clinvar_data.shape}\")\n", + " \n", + " # Show disease statistics\n", + " disease_counts = clinvar_data['Disease'].value_counts()\n", + " print(f\"\\n🔬 Disease distribution (top 10):\")\n", + " print(disease_counts.head(10))\n", + " \n", + " # Show sample data\n", + " print(\"\\n🔍 Sample data:\")\n", + " display(clinvar_data.head())\n", + "else:\n", + " print(\"❌ No disease data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6b1c6dc-33ed-4f57-a385-29816f4c9984", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(2749)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Count entries with 'not provided' disease information\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " not_provided_count = (clinvar_data[\"Disease\"] == \"not provided\").sum()\n", + " total_count = len(clinvar_data)\n", + " \n", + " print(f\"📊 Entries with 'not provided' disease: {not_provided_count}\")\n", + " print(f\"📊 Total entries: {total_count}\")\n", + " print(f\"📊 Percentage: {not_provided_count/total_count*100:.1f}%\")\n", + "else:\n", + " print(\"❌ Cannot calculate statistics - data not available\")" + ] + }, + { + "cell_type": "markdown", + "id": "8a7513ee-96b2-4c7d-8678-0195eb826aa5", + "metadata": {}, + "source": [ + "## Gene ID to Gene Name Mapping\n", + "\n", + "This section converts gene IDs to human-readable gene names using NCBI 
Entrez utilities.\n", + "\n", + "**Prerequisites**: NCBI Entrez Direct tools must be installed:\n", + "- macOS: `brew install brewsci/bio/edirect`\n", + "- Linux: Follow NCBI EDirect installation guide\n", + "\n", + "The process:\n", + "1. Extract unique gene IDs from the dataset\n", + "2. Use `esummary` to fetch gene descriptions from NCBI\n", + "3. Create a mapping dictionary\n", + "4. Apply the mapping to add gene names to the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0d3632-d11e-4429-bb50-5eb9ba55d424", + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "# Extract unique gene IDs and create mapping file\n", + "# This prepares the gene ID list for NCBI lookup\n", + "\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " # Extract all unique gene IDs\n", + " all_gene_ids = set()\n", + " \n", + " for gene_id_str in clinvar_data['GENE_ID'].dropna():\n", + " if gene_id_str.strip(): # Skip empty strings\n", + " # Split comma-separated IDs\n", + " ids = [gid.strip() for gid in gene_id_str.split(',') if gid.strip()]\n", + " all_gene_ids.update(ids)\n", + " \n", + " # Save unique gene IDs to file\n", + " with open(\"gene_id.txt\", 'w') as f:\n", + " for gene_id in sorted(all_gene_ids):\n", + " f.write(f\"{gene_id}\\n\")\n", + " \n", + " print(f\"✅ Extracted {len(all_gene_ids)} unique gene IDs to gene_id.txt\")\n", + " \n", + " # Create the shell script for NCBI lookup\n", + " script_content = '''#!/bin/bash\n", + "\n", + "input_file=\"gene_id.txt\"\n", + "output_file=\"gene_id_to_name.json\"\n", + "\n", + "# Check if input file exists\n", + "if [ ! -f \"$input_file\" ]; then\n", + " echo \"❌ Error: $input_file not found\"\n", + " exit 1\n", + "fi\n", + "\n", + "# Check if EDirect tools are available\n", + "if ! 
command -v esummary &> /dev/null; then\n", + " echo \"❌ Error: NCBI EDirect tools not found\"\n", + " echo \"Please install: brew install brewsci/bio/edirect (macOS)\"\n", + " exit 1\n", + "fi\n", + "\n", + "echo \"🚀 Starting gene ID to name mapping...\"\n", + "\n", + "# Start JSON object\n", + "echo \"{\" > \"$output_file\"\n", + "\n", + "first_entry=true\n", + "total_lines=$(wc -l < \"$input_file\")\n", + "current_line=0\n", + "\n", + "while IFS= read -r gene_id; do\n", + " # Skip empty lines\n", + " [[ -z \"$gene_id\" ]] && continue\n", + " \n", + " current_line=$((current_line + 1))\n", + " \n", + " # Progress indicator\n", + " if (( current_line % 50 == 0 )); then\n", + " echo \"📊 Processing $current_line/$total_lines gene IDs...\"\n", + " fi\n", + " \n", + " # Fetch gene description using Entrez Direct\n", + " description=$(esummary -db gene -id \"$gene_id\" 2>/dev/null | xtract -pattern DocumentSummary -element Description)\n", + " \n", + " # Handle empty description\n", + " if [ -z \"$description\" ]; then\n", + " description=\"Unknown\"\n", + " fi\n", + " \n", + " # JSON escape quotes and other special characters\n", + " description=$(printf '%s' \"$description\" | sed 's/\"/\\\\\"/g')\n", + " \n", + " # Add comma if not the first entry\n", + " if [ \"$first_entry\" = true ]; then\n", + " first_entry=false\n", + " else\n", + " echo \",\" >> \"$output_file\"\n", + " fi\n", + " \n", + " # Append key-value pair\n", + " echo \" \\\"$gene_id\\\": \\\"$description\\\"\" >> \"$output_file\"\n", + " \n", + "done < \"$input_file\"\n", + "\n", + "# Close JSON object\n", + "echo \"\" >> \"$output_file\"\n", + "echo \"}\" >> \"$output_file\"\n", + "\n", + "echo \"✅ Gene ID to name mapping completed\"\n", + "echo \"💾 Results saved to $output_file\"\n", + "'''\n", + " \n", + " # Write the script\n", + " with open(\"gene_mapping.sh\", 'w') as f:\n", + " f.write(script_content)\n", + " \n", + " # Make executable\n", + " os.chmod(\"gene_mapping.sh\", 0o755)\n", + " \n", + 
" print(\"✅ Created gene_mapping.sh script\")\n", + " print(\"\\n🚀 To run the gene mapping:\")\n", + " print(\" ./gene_mapping.sh\")\n", + " print(\"\\n⚠️ Note: This requires NCBI EDirect tools to be installed\")\n", + " \n", + "else:\n", + " print(\"⚠️ Cannot create gene mapping - data not available\")" + ] + }, + { + "cell_type": "markdown", + "id": "1957ef57-1af8-46a1-8d1b-147f6b423619", + "metadata": {}, + "source": [ + "## Apply Gene Name Mapping\n", + "\n", + "Load the gene ID to name mapping and apply it to the dataset to add human-readable gene names.\n", + "\n", + "Read json and add it to the clinvar_data df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b39be718-c0ae-4aae-b1d8-d0c872947ec2", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Load gene ID to name mapping and apply to dataset\n", + "\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " try:\n", + " # Load gene ID → name mapping\n", + " with open(\"gene_id_to_name.json\", \"r\") as f:\n", + " gene_id_dict = json.load(f)\n", + " \n", + " print(f\"✅ Loaded mapping for {len(gene_id_dict)} gene IDs\")\n", + " \n", + " # Function to convert gene IDs to gene names\n", + " def get_gene_names(gene_id_str):\n", + " if pd.isna(gene_id_str) or not gene_id_str.strip():\n", + " return \"\"\n", + " \n", + " gene_ids = [gid.strip() for gid in gene_id_str.split(\",\") if gid.strip()]\n", + " gene_names = []\n", + " \n", + " for gid in gene_ids:\n", + " gene_name = gene_id_dict.get(gid, f\"Unknown_ID_{gid}\")\n", + " gene_names.append(gene_name)\n", + " \n", + " return \" | \".join(gene_names)\n", + " \n", + " # Apply mapping to create gene names column\n", + " print(\"📊 Applying gene name mapping...\")\n", + " clinvar_data[\"GENE_Name\"] = clinvar_data[\"GENE_ID\"].apply(get_gene_names)\n", + " \n", + " # Statistics\n", + " mapped_count = (clinvar_data[\"GENE_Name\"] != \"\").sum()\n", + " print(f\"✅ Gene names mapped for 
{mapped_count} entries ({mapped_count/len(clinvar_data)*100:.1f}%)\")\n", + " \n", + " # Show sample mappings\n", + " sample_data = clinvar_data[clinvar_data[\"GENE_Name\"] != \"\"][[\"GENE_ID\", \"GENE_Name\"]].head()\n", + " if not sample_data.empty:\n", + " print(\"\\n🔍 Sample gene ID to name mappings:\")\n", + " for _, row in sample_data.iterrows():\n", + " print(f\" {row['GENE_ID']} → {row['GENE_Name'][:100]}{'...' if len(row['GENE_Name']) > 100 else ''}\")\n", + " \n", + " except FileNotFoundError:\n", + " print(\"❌ Error: gene_id_to_name.json not found\")\n", + " print(\"Please run the gene mapping script first: ./gene_mapping.sh\")\n", + " # Create empty column as fallback\n", + " clinvar_data[\"GENE_Name\"] = \"\"\n", + " except json.JSONDecodeError as e:\n", + " print(f\"❌ Error parsing JSON mapping file: {e}\")\n", + " clinvar_data[\"GENE_Name\"] = \"\"\n", + " except Exception as e:\n", + " print(f\"❌ Error applying gene mapping: {e}\")\n", + " clinvar_data[\"GENE_Name\"] = \"\"\n", + "else:\n", + " print(\"⚠️ Cannot apply gene mapping - data not available\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7a44c2-7823-47c1-b268-22a1815ffd09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
CHROMPOSREFALTLABELSOURCECONSEQUENCEIDREVIEW_STATUSGENEsplitINT_LABELGENE_IDDiseaseGENE_Name
0chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedPERM1train184808Renal tubular epithelial cell apoptosisPPARGC1 and ESRR induced regulator, muscle 1
1chr1976215AGPathogenicClinVarmissense_variant1320032no_assertion_criteria_providedPERM1train184808Neutrophil inclusion bodiesPPARGC1 and ESRR induced regulator, muscle 1
2chr11050449GAPathogenicClinVarmissense_variant1284257no_assertion_criteria_providedAGRNtrain1375790Congenital myasthenic syndrome 8agrin
3chr11050575GCPathogenicClinVarmissense_variant18241no_assertion_criteria_providedAGRNtrain1375790Congenital myasthenic syndrome 8agrin
4chr11213738GAPathogenicClinVarmissense_variant96692no_assertion_criteria_providedTNFRSF4train17293Combined immunodeficiency due to OX40 deficiencyTNF receptor superfamily member 4
................................................
32680chrY2787412CTPathogenicClinVarmissense_variant9747no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1sex determining region Y
32681chrY2787426CGPathogenicClinVarmissense_variant9739criteria_provided,_single_submitterSRYtrain16736not providedsex determining region Y
32682chrY2787515CAPathogenicClinVarmissense_variant492908no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1sex determining region Y
32683chrY2787551CTPathogenicClinVarmissense_variant9754no_assertion_criteria_providedSRYtrain1673646,XY sex reversal 1sex determining region Y
32684chrY7063898ATPathogenicClinVarmissense_variant625467no_assertion_criteria_providedLOC126057105, TBL1Ytrain1126057105, 90665Deafness, Y-linked 2P300/CBP strongly-dependent group 1 enhancer G...
\n", + "

32685 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS REF ALT LABEL SOURCE CONSEQUENCE ID \\\n", + "0 chr1 976215 A G Pathogenic ClinVar missense_variant 1320032 \n", + "1 chr1 976215 A G Pathogenic ClinVar missense_variant 1320032 \n", + "2 chr1 1050449 G A Pathogenic ClinVar missense_variant 1284257 \n", + "3 chr1 1050575 G C Pathogenic ClinVar missense_variant 18241 \n", + "4 chr1 1213738 G A Pathogenic ClinVar missense_variant 96692 \n", + "... ... ... .. .. ... ... ... ... \n", + "32680 chrY 2787412 C T Pathogenic ClinVar missense_variant 9747 \n", + "32681 chrY 2787426 C G Pathogenic ClinVar missense_variant 9739 \n", + "32682 chrY 2787515 C A Pathogenic ClinVar missense_variant 492908 \n", + "32683 chrY 2787551 C T Pathogenic ClinVar missense_variant 9754 \n", + "32684 chrY 7063898 A T Pathogenic ClinVar missense_variant 625467 \n", + "\n", + " REVIEW_STATUS GENE split \\\n", + "0 no_assertion_criteria_provided PERM1 train \n", + "1 no_assertion_criteria_provided PERM1 train \n", + "2 no_assertion_criteria_provided AGRN train \n", + "3 no_assertion_criteria_provided AGRN train \n", + "4 no_assertion_criteria_provided TNFRSF4 train \n", + "... ... ... ... \n", + "32680 no_assertion_criteria_provided SRY train \n", + "32681 criteria_provided,_single_submitter SRY train \n", + "32682 no_assertion_criteria_provided SRY train \n", + "32683 no_assertion_criteria_provided SRY train \n", + "32684 no_assertion_criteria_provided LOC126057105, TBL1Y train \n", + "\n", + " INT_LABEL GENE_ID \\\n", + "0 1 84808 \n", + "1 1 84808 \n", + "2 1 375790 \n", + "3 1 375790 \n", + "4 1 7293 \n", + "... ... ... \n", + "32680 1 6736 \n", + "32681 1 6736 \n", + "32682 1 6736 \n", + "32683 1 6736 \n", + "32684 1 126057105, 90665 \n", + "\n", + " Disease \\\n", + "0 Renal tubular epithelial cell apoptosis \n", + "1 Neutrophil inclusion bodies \n", + "2 Congenital myasthenic syndrome 8 \n", + "3 Congenital myasthenic syndrome 8 \n", + "4 Combined immunodeficiency due to OX40 deficiency \n", + "... 
... \n", + "32680 46,XY sex reversal 1 \n", + "32681 not provided \n", + "32682 46,XY sex reversal 1 \n", + "32683 46,XY sex reversal 1 \n", + "32684 Deafness, Y-linked 2 \n", + "\n", + " GENE_Name \n", + "0 PPARGC1 and ESRR induced regulator, muscle 1 \n", + "1 PPARGC1 and ESRR induced regulator, muscle 1 \n", + "2 agrin \n", + "3 agrin \n", + "4 TNF receptor superfamily member 4 \n", + "... ... \n", + "32680 sex determining region Y \n", + "32681 sex determining region Y \n", + "32682 sex determining region Y \n", + "32683 sex determining region Y \n", + "32684 P300/CBP strongly-dependent group 1 enhancer G... \n", + "\n", + "[32685 rows x 15 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display final dataset with all extracted information\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " print(f\"📊 Final dataset shape: {clinvar_data.shape}\")\n", + " print(f\"📋 Columns: {list(clinvar_data.columns)}\")\n", + " \n", + " # Data completeness statistics\n", + " print(\"\\n📈 Data Completeness:\")\n", + " for col in ['GENE', 'GENE_ID', 'GENE_Name', 'Disease']:\n", + " if col in clinvar_data.columns:\n", + " filled_count = (clinvar_data[col] != '').sum()\n", + " print(f\" {col}: {filled_count}/{len(clinvar_data)} ({filled_count/len(clinvar_data)*100:.1f}%)\")\n", + " \n", + " # Sample data\n", + " print(\"\\n🔍 Sample data:\")\n", + " display(clinvar_data.head())\n", + " \n", + " # Memory usage\n", + " memory_mb = clinvar_data.memory_usage(deep=True).sum() / 1024 / 1024\n", + " print(f\"\\n💾 Dataset memory usage: {memory_mb:.1f} MB\")\n", + "else:\n", + " print(\"❌ No final data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c545ae83-5cd1-4e29-87fd-69389bdb153f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'P300/CBP strongly-dependent group 1 enhancer GRCh37_chrY:6931456-6932655| transducin beta like 1 
Y-linked'" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show example of gene name mapping\n", + "if 'clinvar_data' in locals() and clinvar_data is not None and len(clinvar_data) > 32684:\n", + " example_gene_name = clinvar_data.iloc[32684]['GENE_Name']\n", + " example_gene_id = clinvar_data.iloc[32684]['GENE_ID']\n", + " \n", + " print(f\"🔍 Example gene mapping for row 32684:\")\n", + " print(f\" Gene ID: {example_gene_id}\")\n", + " print(f\" Gene Name: {example_gene_name}\")\n", + "else:\n", + " # Show any available example\n", + " if 'clinvar_data' in locals() and clinvar_data is not None and not clinvar_data.empty:\n", + " # Find first row with gene name data\n", + " example_row = clinvar_data[clinvar_data['GENE_Name'] != ''].iloc[0] if (clinvar_data['GENE_Name'] != '').any() else clinvar_data.iloc[0]\n", + " \n", + " print(f\"🔍 Example gene mapping:\")\n", + " print(f\" Gene ID: {example_row.get('GENE_ID', 'N/A')}\")\n", + " print(f\" Gene Name: {example_row.get('GENE_Name', 'N/A')}\")\n", + " else:\n", + " print(\"❌ No data available for example\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a214c29d-a4f1-4af6-a914-e6b4a14a1c49", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Save the final processed dataset\n", + "if 'clinvar_data' in locals() and clinvar_data is not None:\n", + " output_file = \"clinvar_with_disease.csv\"\n", + " \n", + " try:\n", + " clinvar_data.to_csv(output_file, index=False)\n", + " \n", + " print(f\"✅ Final dataset saved to {output_file}\")\n", + " print(f\"📊 Saved {len(clinvar_data)} records with {len(clinvar_data.columns)} columns\")\n", + " \n", + " # File size\n", + " file_size = os.path.getsize(output_file) / 1024 / 1024\n", + " print(f\"💾 File size: {file_size:.1f} MB\")\n", + " \n", + " # Summary of what was accomplished\n", + " print(\"\\n🎯 Processing Summary:\")\n", + " print(f\" ✓ Extracted 
ClinVar coding variants\")\n", + " print(f\" ✓ Parsed XML records for gene information\")\n", + " print(f\" ✓ Mapped diseases/phenotypes\")\n", + " print(f\" ✓ Added human-readable gene names\")\n", + " print(f\" ✓ Created comprehensive dataset\")\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Error saving dataset: {e}\")\n", + "else:\n", + " print(\"⚠️ No data available to save\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c4c1f4-4b87-4624-8f8a-c568e40b2e63", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "\n", + "# Optional: Clean up temporary XML data directory\n", + "# Uncomment the following lines if you want to remove the XML files to save space\n", + "\n", + "if os.path.exists(\"data\") and os.path.isdir(\"data\"):\n", + " # Count files before cleanup\n", + " xml_files = [f for f in os.listdir(\"data\") if f.endswith('.xml')]\n", + " \n", + " print(f\"🗂️ Found {len(xml_files)} XML files in data directory\")\n", + " \n", + " # Uncomment to actually remove the directory\n", + " # shutil.rmtree(\"data\")\n", + " # print(\"🗑️ Removed temporary XML data directory\")\n", + " \n", + " print(\"ℹ️ XML files preserved. 
Uncomment the cleanup code to remove them.\")\n", + "else:\n", + " print(\"ℹ️ No XML data directory found to clean up\")" + ] + }, + { + "cell_type": "markdown", + "id": "c08beea6-6ff7-4900-a8b8-8a719db36189", + "metadata": {}, + "source": [ + "## Processing Complete ✅\n", + "\n", + "The ClinVar coding variants have been successfully processed with the following enhancements:\n", + "\n", + "### Generated Files:\n", + "- `clinvar_coding_raw.csv` - Raw ClinVar entries extracted from VEP data\n", + "- `Clinvar_ID.txt` - List of ClinVar IDs for processing\n", + "- `gene_id.txt` - Unique gene IDs for name mapping\n", + "- `gene_id_to_name.json` - Gene ID to name mapping dictionary\n", + "- `clinvar_with_disease.csv` - **Final comprehensive dataset**\n", + "\n", + "### Dataset Features:\n", + "- **Variant Information**: Genomic coordinates, alleles, and annotations\n", + "- **Gene Data**: Symbols, IDs, and human-readable names\n", + "- **Disease/Phenotype**: Associated conditions and clinical significance\n", + "- **Expanded Format**: One row per variant-disease combination\n", + "\n", + "### Next Steps:\n", + "1. **Quality Control**: Review the data for completeness and accuracy\n", + "2. **Analysis**: Use the dataset for downstream genetic analysis\n", + "3. **Integration**: Combine with other datasets as needed\n", + "4. 
**Documentation**: Update metadata and create data dictionary\n", + "\n", + "### File Cleanup:\n", + "- XML files in `data/` directory can be removed to save space\n", + "- Intermediate files can be archived or removed as needed" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason-main/data/Clinvar_SNV_Non_SNV.ipynb b/BioReason-main/data/Clinvar_SNV_Non_SNV.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..62657133d3efd260415bab1cfbf958e9551843e3 --- /dev/null +++ b/BioReason-main/data/Clinvar_SNV_Non_SNV.ipynb @@ -0,0 +1,3425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ClinVar SNV and Non-SNV Processing Pipeline\n", + "\n", + "This notebook processes ClinVar genetic variants to create machine learning datasets for variant effect prediction. See `Clinvar_SNV_Non_SNV_README.md` for detailed documentation.\n", + "\n", + "## Quick Start\n", + "\n", + "1. Update file paths in the configuration section\n", + "2. Ensure all dependencies are installed\n", + "3. Run cells in order\n", + "4. Monitor progress and memory usage\n", + "\n", + "**⚠️ Important**: This pipeline requires significant computational resources and storage space." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Update these paths for your environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# File paths (update these for your system)\n", + "CONFIG = {\n", + " # Input data\n", + " 'clinvar_vcf': 'data/clinvar_grch38.vcf.gz',\n", + " 'reference_genome': 'data/reference/GRCh38.fa',\n", + " 'hgnc_mapping': 'data/hgnc_complete_set.txt',\n", + " \n", + " # VEP configuration\n", + " 'vep_root': '/path/to/vep',\n", + " 'vep_cache': '/path/to/vep/cache',\n", + " \n", + " # Output paths\n", + " 'output_dir': 'output',\n", + " 'temp_dir': 'temp',\n", + " \n", + " # Processing parameters\n", + " 'window_size': 4096,\n", + " 'max_variant_size': 64,\n", + " 'num_threads': 8,\n", + " 'batch_size': 100000\n", + "}\n", + "\n", + "SCRATCH_DIR = '/your/scratch/directory' # Update this to your scratch directory\n", + "\n", + "# Create output directories\n", + "for dir_path in [CONFIG['output_dir'], CONFIG['temp_dir']]:\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + "print(\"Configuration loaded. Please verify all paths are correct:\")\n", + "for key, value in CONFIG.items():\n", + " if 'path' in key or 'dir' in key:\n", + " exists = os.path.exists(value) if not key.endswith('dir') else True\n", + " status = \"✅\" if exists else \"❌\"\n", + " print(f\" {status} {key}: {value}\")\n", + " \n", + "print(\"\\n📝 Update CONFIG dictionary above with your actual file paths\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ClinVar SNV and Non-SNV Variant Processing Pipeline\n", + "\n", + "This notebook processes ClinVar genetic variants (both SNVs and non-SNVs) to create a comprehensive machine learning dataset for variant effect prediction. 
The pipeline includes:\n", + "\n", + "## Overview\n", + "\n", + "1. **Data Processing**: Download and process ClinVar VCF data using VEP (Variant Effect Predictor)\n", + "2. **Sequence Window Extraction**: Generate 4096bp genomic windows centered on variants\n", + "3. **Feature Engineering**: Extract pathogenicity, disease associations, and gene information\n", + "4. **Dataset Creation**: Build training/test datasets with disjoint disease splits\n", + "5. **Quality Control**: Comprehensive statistics and validation\n", + "\n", + "## Key Features\n", + "\n", + "- **Genomic Windows**: 4096bp sequences with centered mutations\n", + "- **Variant Types**: Both SNVs and structural variants (insertions, deletions, etc.)\n", + "- **Clinical Annotations**: Pathogenicity classification and disease associations\n", + "- **Gene Mapping**: Integration with HGNC gene nomenclature\n", + "- **Disjoint Splits**: Train/test splits ensuring no disease overlap\n", + "\n", + "## Requirements\n", + "\n", + "- **Computational Resources**: High-memory system (recommended for large datasets)\n", + "- **Software Dependencies**: VEP, Python libraries (pandas, pysam, pyarrow, hgvs)\n", + "- **Reference Data**: GRCh38 genome assembly, HGNC gene mapping\n", + "- **Storage**: Sufficient space for intermediate files (~100GB+)\n", + "\n", + "## Output\n", + "\n", + "Final datasets suitable for:\n", + "- Variant effect prediction models\n", + "- Pathogenicity classification\n", + "- Disease association studies\n", + "- Genomic language model training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initial Setup (For HPC/Cluster Environments)\n", + "\n", + "**Note**: This section contains setup instructions for high-performance computing environments. 
Adapt paths and module loading commands for your specific system.\n", + "\n", + "### Prerequisites Installation\n", + "If running on a cluster, you may need to download Python wheels and reference data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download required Python packages and reference data\n", + "# Adjust paths and module loading for your specific environment\n", + "\n", + "# Example for cluster environments:\n", + "# module load python gcc arrow postgresql\n", + "\n", + "# Create directory for Python wheels (adjust path as needed)\n", + "# mkdir -p /path/to/your/pywheels\n", + "# pip download hgvs -d /path/to/your/pywheels\n", + "\n", + "# Download HGNC gene mapping data\n", + "# wget -O hgnc_complete_set.txt \"https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt\"\n", + "\n", + "print(\"Setup instructions provided above. Adjust paths for your environment.\")\n", + "print(\"Required data:\")\n", + "print(\"- HGNC complete gene set\")\n", + "print(\"- Python packages: hgvs, pandas, pyarrow, pysam, tqdm\")\n", + "print(\"- VEP installation with cache\")\n", + "print(\"- GRCh38 reference genome\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "**For cluster/HPC environments**: Configure virtual environment and load required modules.\n", + "**For local environments**: Ensure all dependencies are installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Environment setup for cluster/HPC systems\n", + "# Adjust module loading and paths for your specific environment\n", + "\n", + "# Example cluster setup:\n", + "\"\"\"\n", + "# Create virtual environment\n", + "python -m venv /tmp/clinvar_env\n", + "\n", + "# Load required modules (adjust for your system)\n", + "module load python gcc arrow postgresql\n", + "module load perl samtools tabix bcftools mariadb\n", + "\n", + "# Activate virtual environment\n", + "source /tmp/clinvar_env/bin/activate\n", + "\n", + "# Install packages\n", + "pip install notebook pandas pyarrow pysam hgvs tqdm networkx\n", + "\n", + "# Start Jupyter (for remote access)\n", + "jupyter notebook --no-browser --ip=$(hostname -f) --port=8888\n", + "\"\"\"\n", + "\n", + "# For local environments, ensure these packages are installed:\n", + "required_packages = [\n", + " 'pandas>=1.3.0',\n", + " 'pyarrow>=5.0.0', \n", + " 'pysam>=0.19.0',\n", + " 'hgvs>=1.5.0',\n", + " 'tqdm>=4.60.0',\n", + " 'networkx>=2.6.0'\n", + "]\n", + "\n", + "print(\"Required packages:\")\n", + "for pkg in required_packages:\n", + " print(f\" - {pkg}\")\n", + " \n", + "print(\"\\nFor VEP processing, also required:\")\n", + "print(\" - VEP (Ensembl Variant Effect Predictor)\")\n", + "print(\" - BCFtools, SAMtools, Tabix\")\n", + "print(\" - Reference genome and VEP cache files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/localscratch/naimerja.43836119.0/clinvar_env/bin/python\n" + ] + } + ], + "source": [ + "!which python\n", + "# Verify Python environment and core dependencies\n", + "import sys\n", + "import subprocess\n", + "\n", + "print(f\"Python executable: {sys.executable}\")\n", + "print(f\"Python 
version: {sys.version}\")\n", + "\n", + "# Check for required packages\n", + "try:\n", + " import pandas as pd\n", + " import pyarrow as pa\n", + " import pysam\n", + " import hgvs\n", + " import tqdm\n", + " import networkx as nx\n", + " \n", + " print(\"\\n✅ Core dependencies available:\")\n", + " print(f\" - pandas: {pd.__version__}\")\n", + " print(f\" - pyarrow: {pa.__version__}\")\n", + " print(f\" - pysam: {pysam.__version__}\")\n", + " print(f\" - hgvs: {hgvs.__version__}\")\n", + " print(f\" - networkx: {nx.__version__}\")\n", + " \n", + "except ImportError as e:\n", + " print(f\"❌ Missing dependency: {e}\")\n", + " print(\"Please install required packages first\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install required packages\n", + "# Adjust installation method based on your environment\n", + "\n", + "# For environments with pre-downloaded wheels:\n", + "# !pip install --no-index --find-links /path/to/pywheels hgvs\n", + "# !pip install --no-index tqdm pandas pyarrow\n", + "\n", + "# For standard environments:\n", + "# !pip install hgvs tqdm pandas pyarrow pysam networkx\n", + "\n", + "print(\"Package installation commands provided above.\")\n", + "print(\"Choose the appropriate method for your environment:\")\n", + "print(\" - Standard: pip install \")\n", + "print(\" - Offline: pip install --no-index --find-links \")\n", + "print(\" - Conda: conda install \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View possible fields from clinvar\n", + "\n", + "## ClinVar VCF Data Exploration\n", + "\n", + "Examine the structure and metadata of the ClinVar VCF file to understand available annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "##fileformat=VCFv4.1\n", + "##FILTER=\n", + "##fileDate=2025-04-29\n", + "##source=ClinVar\n", + "##reference=GRCh38\n", + "##ID=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##bcftools_viewVersion=1.19+htslib-1.18\n", + "##bcftools_viewCommand=view -h /scratch/naimerja/DNASNVData113/clinvar_data/clinvar_grch38.vcf.gz; Date=Fri May 9 12:41:08 2025\n", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" + ] + } + ], + "source": [ + "# Explore ClinVar VCF file structure\n", + "# Update the file path to point to your ClinVar VCF file\n", + "\n", + "import subprocess\n", + "import os\n", + "\n", + "# Example VCF file path (update for your data)\n", + "vcf_file = \"data/clinvar_grch38.vcf.gz\" # Update this path\n", + "\n", + "# Check if file exists\n", + "if 
os.path.exists(vcf_file):\n", + " try:\n", + " # View VCF header to understand available fields\n", + " result = subprocess.run(\n", + " [\"bcftools\", \"view\", \"-h\", vcf_file],\n", + " capture_output=True, text=True, check=True\n", + " )\n", + " \n", + " print(\"ClinVar VCF Header (first 50 lines):\")\n", + " print(\"=\" * 50)\n", + " header_lines = result.stdout.split('\\n')[:50]\n", + " for line in header_lines:\n", + " print(line)\n", + " \n", + " except (subprocess.CalledProcessError, FileNotFoundError) as e:\n", + " print(f\"Error reading VCF file: {e}\")\n", + " print(\"Please ensure bcftools is installed and VCF file path is correct\")\n", + "else:\n", + " print(f\"VCF file not found: {vcf_file}\")\n", + " print(\"Please update the file path to point to your ClinVar VCF file\")\n", + " print(\"Download from: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/\")\n", + "\n", + "print(\"\\nKey ClinVar INFO fields to look for:\")\n", + "print(\"- CLNSIG: Clinical significance\")\n", + "print(\"- CLNDN: Disease name\")\n", + "print(\"- GENEINFO: Gene information\")\n", + "print(\"- CLNREVSTAT: Review status\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "VEP to clean raw clinvar vcf to cleaned coding only vcf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2) Point to your VEP install and cache, and wire up Perl libs:\n", + "import os\n", + "\n", + "os.environ['VEP_ROOT'] = 'SCRATCH_DIR/DNASNVData113/clinvar_data/vep-code-113'\n", + "os.environ['VEP_CACHE'] = 'SCRATCH_DIR/DNASNVData113/clinvar_data/vep-cache-113'\n", + "os.environ['PERL5LIB'] = 'SCRATCH_DIR/perl5/lib/perl5:' + os.environ.get('PERL5LIB','')\n", + "# prepend VEP_ROOT onto the existing PATH\n", + "os.environ['PATH'] = os.environ['VEP_ROOT'] + ':' + os.environ.get('PATH','')\n", + "\n", + "# now this will actually show your full, correct PATH:\n", + "!echo $PATH\n", + "!which bash\n", + "!which 
vep\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "/usr/bin/time -v $VEP_ROOT/vep \\\n", + " --input_file SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_grch38.vcf.gz \\\n", + " --output_file SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf \\\n", + " --cache \\\n", + " --dir_cache $VEP_CACHE \\\n", + " --offline \\\n", + " --fasta $VEP_CACHE/homo_sapiens/113_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa \\\n", + " --species homo_sapiens \\\n", + " --assembly GRCh38 \\\n", + " --vcf \\\n", + " --hgvs \\\n", + " --pick \\\n", + " --fork 48 \\\n", + " --force_overwrite \\\n", + " --verbose \\\n", + " --coding_only\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: VEP Processing\n", + "\n", + "Process ClinVar VCF through VEP to add annotations and filter for coding variants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "import hgvs.edit as HEdit\n", + "from hgvs.parser import Parser\n", + "from hgvs.exceptions import HGVSError\n", + "from hgvs.enums import Datum\n", + "import hgvs.location as loc\n", + "\n", + "from collections import Counter\n", + "from concurrent.futures import ProcessPoolExecutor\n", + "from tqdm import tqdm\n", + "\n", + "def is_coding_pos(pos):\n", + " \"\"\"\n", + " Return True if the given position is within the translated CDS.\n", + " Excludes:\n", + " - intronic offsets (BaseOffsetPosition.is_intronic)\n", + " - 5′ UTR (datum=CDS_START and base < 1)\n", + " - 3′ UTR (datum=CDS_END)\n", + " \"\"\"\n", + " p = pos.start if hasattr(pos, \"start\") else pos\n", + " if isinstance(p, loc.BaseOffsetPosition):\n", + " dbg = f\"(base={p.base}, datum={p.datum}, offset={p.offset})\"\n", + " if p.is_intronic:\n", + " return False\n", + " if p.datum == Datum.CDS_START and p.base < 1:\n", + " return False\n", + " if 
def _init_worker(idx):
    """Initialise per-process state for the worker pool.

    Runs once in each worker process: builds one HGVS ``Parser`` and stores
    the HGVSc column index so ``_classify_line`` can reuse both for every line.
    """
    global parser, hgvsc_idx
    parser = Parser()
    hgvsc_idx = idx


def _csq_fields_from_header(line):
    """Return the CSQ sub-field names declared by a VCF header line.

    Returns ``None`` when *line* is not the CSQ ``##INFO`` declaration or
    carries no ``Format: `` section.
    """
    # NOTE(review): the header test below was reconstructed from a garbled
    # statement; it assumes the usual VEP declaration
    # ``##INFO=<ID=CSQ,...,Description="... Format: A|B|C">`` -- confirm.
    if not line.startswith("##INFO=<ID=CSQ"):
        return None
    _, found, tail = line.partition("Format: ")
    if not found:
        return None
    return tail.split('">')[0].strip().split("|")


def _extract_hgvsc(line, idx):
    """Return field *idx* of the first CSQ entry on a VCF data line.

    Returns ``""`` when the line has fewer than 8 tab-separated columns, has
    no ``CSQ=`` key in INFO, or its first CSQ entry has no field at *idx*.
    """
    cols = line.rstrip("\n").split("\t")
    if len(cols) < 8:
        return ""
    for kv in cols[7].split(";"):
        if kv.startswith("CSQ="):
            # first allele in CSQ, then the requested field
            fields = kv.split("=", 1)[1].split(",")[0].split("|")
            # A short entry is reported as "no HGVSc" instead of raising
            # IndexError, which would abort the whole worker pool.
            return fields[idx] if idx < len(fields) else ""
    return ""


def _classify_line(line):
    """Classify one VCF data line by the edit type of its HGVSc annotation.

    Returns a 3-tuple ``(key, coding, extra)``:

    * ``("unmatched", None, "")`` when no HGVSc string is present;
    * ``("unmatched", None, hgvsc)`` when the HGVS parser rejects the string;
    * ``(edit_key, is_coding, None)`` otherwise, where ``edit_key`` is the
      edit type, suffixed with ``_single``/``_range`` for del/dup/inv.

    Relies on the ``parser`` and ``hgvsc_idx`` globals set by ``_init_worker``.
    """
    hfull = _extract_hgvsc(line, hgvsc_idx)
    if not hfull:
        return ("unmatched", None, "")

    # parse HGVS
    try:
        var = parser.parse_hgvs_variant(hfull)
    except HGVSError:
        return ("unmatched", None, hfull)

    edit = var.posedit.edit
    pos = var.posedit.pos

    # get 1-based start/end
    if hasattr(pos, "start") and hasattr(pos, "end"):
        start = pos.start.base
        end = pos.end.base
    else:
        start = end = pos.base

    # generic type key
    etype = edit.type  # attribute, not method
    if etype in ("del", "dup", "inv"):
        key = f"{etype}_{'single' if start == end else 'range'}"
    else:
        key = etype  # covers sub, ins, delins, etc.

    # coding vs noncoding
    coding = is_coding_pos(pos)

    return (key, coding, None)


def scan_hgvsc_types(vcf_path, max_workers=24):
    """Tally HGVSc edit types in a VEP-annotated VCF and print a report.

    Args:
        vcf_path: Path to an uncompressed VCF whose header declares CSQ.
        max_workers: Number of worker processes used to parse HGVS strings.

    Raises:
        RuntimeError: If no CSQ header declaration is found.
        ValueError: If the CSQ declaration has no ``HGVSc`` field.
    """
    # 1) + 2) one pass: find the CSQ header and count data lines for the
    # progress bar. The file is closed deterministically by the ``with``.
    csq_fields = None
    total = 0
    with open(vcf_path) as f:
        for line in f:
            if not line.startswith("#"):
                total += 1
            elif csq_fields is None:
                csq_fields = _csq_fields_from_header(line)
    if not csq_fields:
        raise RuntimeError("Couldn't find CSQ header in VCF")
    if "HGVSc" not in csq_fields:
        raise ValueError(
            f"CSQ header has no 'HGVSc' field (found: {csq_fields})"
        )
    idx = csq_fields.index("HGVSc")

    coding_counts = Counter()
    noncoding_counts = Counter()
    unmatched_counts = Counter()

    # 3) parallel processing
    with open(vcf_path) as vcf, ProcessPoolExecutor(
        max_workers=max_workers,
        initializer=_init_worker,
        initargs=(idx,)
    ) as exe:
        # only non-header lines
        lines = (ln for ln in vcf if not ln.startswith("#"))
        for key, coding, extra in tqdm(
            exe.map(_classify_line, lines, chunksize=1000),
            total=total,
            desc="Scanning variants"
        ):
            if key == "unmatched":
                unmatched_counts[extra] += 1
            elif coding:
                coding_counts[key] += 1
            else:
                noncoding_counts[key] += 1

    # 4) report
    print("\n=== Coding-region variants ===")
    for name, cnt in coding_counts.most_common():
        print(f"  {name}: {cnt}")

    print("\n=== Non-coding variants (UTR & intronic) ===")
    for name, cnt in noncoding_counts.most_common():
        print(f"  {name}: {cnt}")

    print("\n=== Unmatched HGVSc patterns ===")
    for h, cnt in unmatched_counts.most_common():
        print(f"  {h}: {cnt}")


if __name__ == "__main__":
    scan_hgvsc_types(
        "SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf",
        max_workers=24
    )
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "import os\n", + "import pandas as pd\n", + "# Use 24 threads for PyArrow encoding\n", + "os.environ[\"ARROW_NUM_THREADS\"] = \"24\"\n", + "\n", + "import pysam\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "from tqdm import tqdm\n", + "\n", + "def get_window(genome, chrom, pos0, window_size=4096, pad_char=\"N\"):\n", + " \"\"\"\n", + " Fetch exactly `window_size` bases centered at 0-based pos0\n", + " from the pysam.FastaFile `genome`, padding with `pad_char`.\n", + " \"\"\"\n", + " half = window_size // 2\n", + " start = pos0 - half\n", + " end = start + window_size\n", + "\n", + " parts = []\n", + " chrom_len = genome.get_reference_length(chrom)\n", + "\n", + " # left padding\n", + " if start < 0:\n", + " parts.append(pad_char * -start)\n", + " fetch_start = 0\n", + " else:\n", + " fetch_start = start\n", + "\n", + " # fetch middle\n", + " fetch_end = min(end, chrom_len)\n", + " parts.append(genome.fetch(chrom, fetch_start, fetch_end))\n", + "\n", + " # right padding\n", + " if fetch_end < end:\n", + " parts.append(pad_char * (end - fetch_end))\n", + "\n", + " return \"\".join(parts)\n", + "\n", + "\n", + "def main(vcf_path, genome_fasta_path, out_parquet_path):\n", + " use_cols = [\"symbol\", \"name\", \"entrez_id\"]\n", + " hgnc_df = pd.read_csv(\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/hgnc_complete_set.txt\",\n", + " sep=\"\\t\", usecols=use_cols,\n", + " dtype={\"entrez_id\": \"Int64\"}\n", + " )\n", + " # build a dict mapping Entrez ID → approved name\n", + " gene_desc_map = dict(zip(\n", + " hgnc_df[\"entrez_id\"].astype(str), # ensure keys are strings if your gene_id is str\n", + " hgnc_df[\"name\"]\n", + " ))\n", + "\n", + " missing_genes = 0\n", + " # definitions\n", + " PATHOGENIC_ALLOWED = {\n", + " \"pathogenic\",\n", + " \"pathogenic/likely_pathogenic\",\n", + " \"likely_pathogenic\",\n", + " \"benign\",\n", + " 
\"likely_benign\",\n", + " \"benign/likely_benign\",\n", + " }\n", + "\n", + " REVIEW_STATUS_ALLOWED = {\n", + " \"criteria_provided,_multiple_submitters,_no_conflicts\",\n", + " \"reviewed_by_expert_panel\",\n", + " \"practice_guideline\",\n", + " }\n", + "\n", + " # 0) explicitly remove any old output\n", + " try:\n", + " os.remove(out_parquet_path)\n", + " except FileNotFoundError:\n", + " pass\n", + "\n", + " # count variants for progress bar\n", + " total = sum(1 for line in open(vcf_path) if not line.startswith(\"#\"))\n", + "\n", + " # open the genomic FASTA\n", + " genome = pysam.FastaFile(genome_fasta_path)\n", + " fasta_contigs = set(genome.references) # <<< build this once\n", + "\n", + " # prepare for Parquet writing\n", + " writer = None\n", + " batch = {col: [] for col in (\n", + " \"clinvar_id\",\n", + " \"original_window\",\n", + " \"mutated_window\",\n", + " \"cleaned_pathogenicity\",\n", + " \"disease_name\",\n", + " \"gene_name\",\n", + " \"gene_desc\",\n", + " \"chromosome\",\n", + " \"chromosome_position\",\n", + " \"variant_type\",\n", + " \"clinvar_link\",\n", + " \"gene_id\",\n", + " \"mutation_instruction\",\n", + " \"pathogenicity\",\n", + " \"review_status\"\n", + " )}\n", + " batch_size = 100_000\n", + "\n", + " def flush_batch():\n", + " nonlocal writer, batch\n", + " table = pa.Table.from_pydict(batch)\n", + " if writer is None:\n", + " writer = pq.ParquetWriter(\n", + " out_parquet_path,\n", + " table.schema,\n", + " compression=\"snappy\",\n", + " use_dictionary=True\n", + " )\n", + " writer.write_table(table)\n", + " for col in batch:\n", + " batch[col].clear()\n", + "\n", + " # process VCF\n", + " with open(vcf_path) as vf:\n", + " for line in tqdm(vf, total=total, desc=\"Writing Parquet\"):\n", + " if line.startswith(\"#\"):\n", + " continue\n", + " cols = line.rstrip(\"\\n\").split(\"\\t\")\n", + " chrom, pos1, clinvar_id, ref, alt = cols[:5]\n", + "\n", + " # --- SKIP if this contig is not in your FASTA --- or mitochondrial 
chromosome (keeps only nuclear chromosomes as in Evo2)\n", + " if chrom not in fasta_contigs or chrom == \"MT\":\n", + " continue\n", + "\n", + " # Skip variants too large to fit sensibly in a 4 096 bp window\n", + " MAX_EDIT = 64 # 64 bp\n", + " if len(ref) > MAX_EDIT or len(alt) > MAX_EDIT:\n", + " continue\n", + "\n", + "\n", + " info = {\n", + " kv.split(\"=\", 1)[0]: kv.split(\"=\", 1)[1]\n", + " for kv in cols[7].split(\";\") if \"=\" in kv\n", + " }\n", + "\n", + " # mutation instruction\n", + " instr = f\"{ref}>{alt}\"\n", + "\n", + " # extract 4096-bp window\n", + " pos0 = int(pos1) - 1\n", + " orig_win = get_window(genome, chrom, pos0, window_size=4096)\n", + "\n", + " # apply REF→ALT at center\n", + " half = 4096 // 2\n", + " i0 = half\n", + " i1 = half + len(ref)\n", + " mut_win = orig_win[:i0] + alt + orig_win[i1:]\n", + " # enforce fixed length\n", + " if len(mut_win) < 4096:\n", + " mut_win = mut_win.ljust(4096, \"N\")\n", + " elif len(mut_win) > 4096:\n", + " mut_win = mut_win[:4096]\n", + "\n", + " # pathogenicity, disease, variant type\n", + " path = info.get(\"CLNSIG\", \"\").lower()\n", + " dis = info.get(\"CLNDN\", \"\")\n", + " gene_info = info.get(\"GENEINFO\", \"\")\n", + "\n", + " #filter out variants with no gene info\n", + " if gene_info ==\"\":\n", + " missing_genes +=1\n", + " continue\n", + " else:\n", + " gene_name = gene_info.split(\":\")[0]\n", + " gene_id = gene_info.split(\":\")[1]\n", + "\n", + "\n", + " vart = \"SNV\" if len(ref) == 1 == len(alt) else \"non_SNV\"\n", + " rev_stat = info.get(\"CLNREVSTAT\", \"\").lower()\n", + "\n", + " # filter for pathogenic/(|)likely pathogenic or benign/(|)likely benign only\n", + " # only keep if ANY of the pipe-delimited terms is in our allowed set\n", + " terms = path.split(\"|\")\n", + " if not any(term in PATHOGENIC_ALLOWED for term in terms):\n", + " continue\n", + "\n", + " # filter for review status\n", + " if rev_stat not in REVIEW_STATUS_ALLOWED:\n", + " continue\n", + "\n", + " if 
\"pathogenic\" in path:\n", + " clean_pathogenicity = \"pathogenic\"\n", + " elif \"benign\" in path:\n", + " clean_pathogenicity = \"benign\"\n", + " else:\n", + " raise ValueError(f\"Unknown pathogenicity: {path}\")\n", + "\n", + "\n", + " # collect row\n", + " batch[\"clinvar_id\"].append(clinvar_id)\n", + " batch[\"mutation_instruction\"].append(instr)\n", + " batch[\"original_window\"].append(orig_win)\n", + " batch[\"mutated_window\"].append(mut_win)\n", + " batch[\"pathogenicity\"].append(path)\n", + " batch[\"cleaned_pathogenicity\"].append(clean_pathogenicity)\n", + " batch[\"disease_name\"].append(dis)\n", + " batch[\"variant_type\"].append(vart)\n", + " batch[\"review_status\"].append(rev_stat)\n", + " batch[\"gene_name\"].append(gene_name)\n", + " batch[\"gene_id\"].append(gene_id)\n", + " batch[\"chromosome\"].append(chrom)\n", + " batch[\"chromosome_position\"].append(pos1) # 1-based position on chromosome\n", + " batch[\"gene_desc\"].append(gene_desc_map.get(gene_id))\n", + " batch[\"clinvar_link\"].append(f\"https://www.ncbi.nlm.nih.gov/clinvar/variation/{clinvar_id}/\")\n", + "\n", + " # flush when batch is full\n", + " if len(batch[\"mutation_instruction\"]) >= batch_size:\n", + " flush_batch()\n", + "\n", + " # final flush & close\n", + " if batch[\"mutation_instruction\"]:\n", + " flush_batch()\n", + " if writer is not None:\n", + " writer.close()\n", + "\n", + " print(\"Finished writing →\", out_parquet_path)\n", + " print(f\"# Removed due to missing gene info: {missing_genes}\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main(\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf\",\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/\"\n", + " \"vep-cache-113/homo_sapiens/113_GRCh38/\"\n", + " \"Homo_sapiens.GRCh38.dna.toplevel.fa\",\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/\"\n", + " \"clinvar_windowed_4096.parquet\"\n", + " )\n", + "\n", + "# note to visually inspect the dna sequences and modified 
sequences go to https://www.ncbi.nlm.nih.gov/gdv/browser/genome/?id=GCF_000001405.40 and then click tools and then sequence text view" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Hereditary_factor_VIII_deficiency_disease|not_provided'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['clinvar_id']=='10152']['disease_name'][342667]\n", + "# https://www.ncbi.nlm.nih.gov/clinvar/variation/10152/\n", + "# shows that only diseases with stars are included in the associated diseases (since hemophelia not included)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[print(x) for x in (df[(df['pathogenicity']=='pathogenic') & df['disease_name'].str.contains(r'\\|')]['clinvar_link'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On login node upload table to huggingface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-index huggingface-hub\n", + "from huggingface_hub import HfApi\n", + "import os\n", + "import glob\n", + "\n", + "# 0) config\n", + "repo_id = \"wanglab/bioR_tasks\" # your dataset repo\n", + "repo_type = \"dataset\"\n", + "subfolder = \"variant_effect_non_snv_and_snv\"\n", + "local_dir = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "\n", + "api = HfApi()\n", + "\n", + "# 1) list all files in that subfolder\n", + "all_files = api.list_repo_files(repo_id, repo_type=repo_type)\n", + "old_files = [f for f in all_files if f.startswith(subfolder + \"/\")]\n", + "\n", + "print(f\"Will delete {len(old_files)} old files:\")\n", + "for f in old_files:\n", + " print(\" \", f)\n", + "\n", + "# 2) delete them (one commit per file, or you can batch by reusing the same commit_message)\n", + "for f in 
old_files:\n", + " api.delete_file(\n", + " path_in_repo = f,\n", + " repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"remove old dataset file\"\n", + " )\n", + "\n", + "# 3) upload your single Parquet file\n", + "new_file = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "basename = os.path.basename(new_file)\n", + "dest_path = f\"{subfolder}/{basename}\"\n", + "\n", + "print(f\"Uploading {new_file!r} to {repo_id}/{dest_path} …\")\n", + "api.upload_file(\n", + " path_or_fileobj = new_file,\n", + " path_in_repo = dest_path,\n", + " repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"add updated parquet {basename}\"\n", + ")\n", + "\n", + "print(\"Done! Your dataset has been updated on the Hub.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-index huggingface-hub\n", + "from huggingface_hub import HfApi\n", + "import os\n", + "import glob\n", + "\n", + "# 0) config\n", + "repo_id = \"wanglab/bioR_tasks\" # your dataset repo\n", + "repo_type = \"dataset\"\n", + "subfolder = \"variant_effect_non_snv_and_snv\"\n", + "local_dir = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "\n", + "api = HfApi()\n", + "\n", + "# 1) list all files in that subfolder\n", + "all_files = api.list_repo_files(repo_id, repo_type=repo_type)\n", + "old_files = [f for f in all_files if f.startswith(subfolder + \"/\")]\n", + "\n", + "\n", + "import io\n", + "\n", + "# Upload cleaned DataFrame\n", + "buffer = io.BytesIO()\n", + "final_df.to_parquet(buffer, index=False)\n", + "buffer.seek(0)\n", + "\n", + "# Construct cleaned filename by appending '_cleaned'\n", + "basename = os.path.splitext(os.path.basename(local_dir))[0] + \"_cleaned.parquet\"\n", + "dest_path = f\"{subfolder}/{basename}\"\n", + "\n", + "print(f\"Uploading cleaned DataFrame to {repo_id}/{dest_path} …\")\n", + 
"api.upload_file(\n", + " path_or_fileobj=buffer,\n", + " path_in_repo=dest_path,\n", + " repo_id=repo_id,\n", + " repo_type=repo_type,\n", + " commit_message=f\"add cleaned parquet {basename}\"\n", + ")\n", + "\n", + "print(\"Done! Cleaned DataFrame uploaded.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "read table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "→ Discovering data under '/scratch/naimerja/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet'\n", + "→ Scanning & reading all fragments in parallel …\n", + "→ Converting to pandas DataFrame…\n", + "✅ Loaded 342,689 rows in 3.3s\n", + "DataFrame shape: (342689, 15)\n", + "Memory usage: 3.18 GB\n" + ] + } + ], + "source": [ + "#!/usr/bin/env python3\n", + "import time, os\n", + "import pandas as pd\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "import pyarrow.dataset as ds\n", + "\n", + "from tqdm import tqdm\n", + "\n", + "def load_parquet_to_pandas(parquet_dir, num_threads=24):\n", + " # configure PyArrow global thread pool\n", + " pa.set_cpu_count(num_threads)\n", + " pa.set_io_thread_count(num_threads)\n", + "\n", + " start = time.time()\n", + " print(f\"→ Discovering data under {parquet_dir!r}\")\n", + "\n", + " # Option A: use the ParquetDataset API\n", + " # dataset = pq.ParquetDataset(parquet_dir) # older PyArrow\n", + " # table = dataset.read(use_threads=True) # uses all threads by default\n", + "\n", + " # Option B (recommended): use the Dataset API\n", + " dataset = ds.dataset(parquet_dir, format=\"parquet\")\n", + " print(\"→ Scanning & reading all fragments in parallel …\")\n", + " # to_table will read all row-groups/files in parallel (use_threads defaults to True) :contentReference[oaicite:0]{index=0}\n", + " table = dataset.to_table()\n", + "\n", + " print(\"→ Converting to pandas DataFrame…\")\n", + " df = 
table.to_pandas()\n", + "\n", + " end = time.time()\n", + " print(f\"✅ Loaded {len(df):,} rows in {end - start:.1f}s\")\n", + " print(f\"DataFrame shape: {df.shape}\")\n", + " print(f\"Memory usage: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB\")\n", + "\n", + " return df\n", + "\n", + "if __name__ == \"__main__\":\n", + " PARQUET_DIR = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + " df = load_parquet_to_pandas(PARQUET_DIR, num_threads=24)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create final training dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import re\n", + "\n", + "#list of 50 questions\n", + "\n", + "question_synonyms = {\n", + " \"A genetic variant on chromosome , position , affects the gene (). Is this variant benign or pathogenic? If pathogenic, what disease(s) does it cause?\",\n", + " \"A mutation at chromosome position on chromosome in gene (): benign or pathogenic? If pathogenic, which disease(s) is it linked to?\",\n", + " \"Considering the variant on chromosome , location , involving gene (), would you classify it as benign or pathogenic? What disease(s), if any, does a pathogenic variant indicate?\",\n", + " \"Is the genetic mutation found on chromosome at position , within the gene (), considered benign or pathogenic? If pathogenic, specify the associated disease(s).\",\n", + " \"Assess the clinical significance (benign or pathogenic) of the variant at chromosome , position , gene (). What disease(s) is it linked to if pathogenic?\",\n", + " \"Does the genetic variant at chromosome , position , impacting gene (), appear benign or pathogenic? If pathogenic, name the associated disease(s).\",\n", + " \"Variant in gene (), located at chromosome position : benign or pathogenic? 
What disease(s) does it cause if pathogenic?\",\n", + " \"Gene () variant at chromosome , position —is it benign or pathogenic? If pathogenic, what are the associated condition(s)?\",\n", + " \"A genetic alteration at chromosome , position , in gene ()—benign or pathogenic? If pathogenic, which disease(s) is involved?\",\n", + " \"Chromosome , position , gene (): Is this mutation clinically benign or pathogenic? If pathogenic, identify the related disease(s).\",\n", + " \"Does the variant on chromosome at location affecting gene () have a clinical significance of benign or pathogenic? If pathogenic, what disease(s) is associated?\",\n", + " \"Mutation at chromosome , position , within (): benign or pathogenic? If pathogenic, indicate the disease(s).\",\n", + " \"Evaluate this variant at chromosome , position , gene (): benign or pathogenic? If pathogenic, what are the disease connection(s)?\",\n", + " \"Gene mutation in () at chromosome , position —is it benign or pathogenic? If pathogenic, specify the disease(s).\",\n", + " \"Located at chromosome position , the variant affecting gene ()—benign or pathogenic? If pathogenic, which disease(s) does it relate to?\",\n", + " \"Is the chromosome , position variant in () clinically benign or pathogenic? If pathogenic, what condition(s) is associated?\",\n", + " \"Clinical significance of chromosome , position , gene (): benign or pathogenic? Name the disease(s) if pathogenic.\",\n", + " \"Is the genetic variant on chromosome , position , gene (), benign or pathogenic? If pathogenic, what disease(s) is indicated?\",\n", + " \"Regarding the variant at chromosome and position , affecting gene (): benign or pathogenic? If pathogenic, what are the associated illness(es)?\",\n", + " \"The mutation in gene () at chromosome , position —clinically benign or pathogenic? If pathogenic, identify the related disease(s).\",\n", + " \"Assess the variant on chromosome , position , impacting (): is it benign or pathogenic? 
If pathogenic, specify the associated condition(s).\",\n", + " \"Variant in (), chromosome , position —is this benign or pathogenic? If pathogenic, what disease(s) is linked?\",\n", + " \"Clinical impact (benign or pathogenic) of the variant at chromosome , location , gene (): what disease(s) if pathogenic?\",\n", + " \"The chromosome , position genetic variant in gene (): benign or pathogenic? If pathogenic, indicate disease(s).\",\n", + " \"Determine if the mutation at chromosome , position in gene () is benign or pathogenic. If pathogenic, what disease(s) is associated?\",\n", + " \"Is chromosome , position , gene () variant benign or pathogenic? If pathogenic, what condition(s) is it related to?\",\n", + " \"The mutation impacting () on chromosome at position : benign or pathogenic? Name the associated disease(s) if pathogenic.\",\n", + " \"Variant at chromosome , position , gene (): clinically benign or pathogenic? If pathogenic, specify the disease(s) involved.\",\n", + " \"Chromosome , position , gene (): benign or pathogenic variant? If pathogenic, what are the linked illness(es)?\",\n", + " \"A genetic variant at chromosome , position , affecting gene ()—is it benign or pathogenic? If pathogenic, identify the associated disorder(s).\",\n", + " \"Mutation found at chromosome position , gene (): benign or pathogenic? If pathogenic, indicate the relevant disease(s).\",\n", + " \"Benign or pathogenic: chromosome , position , gene () variant? Disease(s) if pathogenic?\",\n", + " \"Evaluate if the mutation on chromosome at position in () is benign or pathogenic. Disease name(s) if pathogenic?\",\n", + " \"Clinical classification of chromosome , position , gene (): benign or pathogenic? Disease(s) if pathogenic?\",\n", + " \"Variant chromosome , position , gene (): benign or pathogenic? Disease(s)?\",\n", + " \"Variant on chromosome , at position , affecting (): is it benign or pathogenic? 
If pathogenic, specify the associated disease(s).\",\n", + " \"Does the chromosome mutation at position within gene () classify as benign or pathogenic? If pathogenic, indicate the related illness(es).\",\n", + " \"Determine whether the variant at chromosome , position , in gene () is benign or pathogenic. If pathogenic, identify the relevant disease(s).\",\n", + " \"Gene () variant at chromosome position on chromosome : benign or pathogenic? If pathogenic, what disease(s) is it associated with?\",\n", + " \"Considering the genetic mutation at chromosome , position , impacting (): is it clinically benign or pathogenic? Name the associated disease(s) if pathogenic.\",\n", + " \"Evaluate the clinical significance of the mutation at chromosome , position in gene (): benign or pathogenic? What disease(s) does a pathogenic variant suggest?\",\n", + " \"Is the variant located on chromosome at position , gene (), benign or pathogenic? If pathogenic, specify the disease(s) linked.\",\n", + " \"Classify the chromosome variant at position affecting gene () as benign or pathogenic. If pathogenic, which disease(s) is associated?\",\n", + " \"For chromosome , position , gene (): benign or pathogenic mutation? If pathogenic, what are the associated disease(s)?\",\n", + " \"Is the genetic change at chromosome , position , within gene () benign or pathogenic? Name the disease(s) if pathogenic.\",\n", + " \"Does the variant impacting () on chromosome , position , classify as benign or pathogenic? If pathogenic, what disease(s) is it associated with?\",\n", + " \"Variant at chromosome position , chromosome , gene (): benign or pathogenic? If pathogenic, what condition(s) does it relate to?\",\n", + " \"Regarding the variant found on chromosome at position in gene (): is it benign or pathogenic? If pathogenic, identify the disease(s).\",\n", + " \"The genetic variant at chromosome , position , affecting gene (): benign or pathogenic? 
Disease name(s) if pathogenic?\",\n", + " \"Clinically, how would you classify the variant at chromosome , position , gene (): benign or pathogenic? If pathogenic, specify the associated illness(es).\"\n", + "}\n", + "\n", + "question_df = pd.DataFrame({'question': list(question_synonyms)})\n", + "question_df.index.name = 'question_number'\n", + "\n", + "# copy the df to training_df\n", + "training_df = df.copy()\n", + "training_df = training_df.rename(columns={'original_window': 'reference_sequence', 'mutated_window': 'mutated_sequence'})\n", + "training_df['question_number'] = np.random.randint(0, 50, size=len(training_df)) # generate random question number between 0 and 49 inclusive\n", + "\n", + "# merge the training_df with the question_df\n", + "training_df = pd.merge(training_df, question_df, on='question_number', how='left')\n", + "\n", + "# drop the question_number column\n", + "training_df = training_df.drop(columns=['question_number'])\n", + "\n", + "def fill_placeholders(row):\n", + " q = row['question']\n", + " # always replace these\n", + " q = q.replace('', str(row['chromosome']))\n", + " q = q.replace('', str(row['chromosome_position']))\n", + " q = q.replace('', row['gene_name'])\n", + " \n", + " # gene_full_name may be None\n", + " if pd.notnull(row['gene_desc']):\n", + " q = q.replace('', row['gene_desc'])\n", + " else:\n", + " # remove the entire \"()\" including surrounding space\n", + " q = re.sub(r'\\s*\\(\\s*\\s*\\)', '', q)\n", + " \n", + " return q\n", + "\n", + "training_df['question'] = training_df.apply(fill_placeholders, axis=1)\n", + "\n", + "\n", + "\n", + "def format_answer(row):\n", + " path = row['cleaned_pathogenicity']\n", + " disease = row['disease_name']\n", + " \n", + " # If disease_name is exactly 'not_provided' or 'not_specified'\n", + " if disease in ('not_provided', 'not_specified', 'not_specified|not_provided', 'not_provided|not_specified'):\n", + " return path\n", + " \n", + " # Split on '|' into a list and drop 
'not_provided'\n", + " diseases = [d for d in disease.split('|') if d != 'not_provided']\n", + " \n", + " # Handle 'not_specified': note it, then drop it\n", + " unspecified = 'not_specified' in diseases\n", + " diseases = [d for d in diseases if d != 'not_specified']\n", + " \n", + " # Sort the disease names alphabetically\n", + " diseases = sorted(diseases)\n", + " \n", + " # If unspecified, append the note as an element at the end\n", + " if unspecified:\n", + " diseases.append('likely other unspecified diseases')\n", + " \n", + " # Represent diseases as a Python-style list literal\n", + " disease_text = str(diseases) # e.g. \"['DiseaseA', 'DiseaseB']\"\n", + " \n", + " # Build the answer, adding semicolon only for pathogenic\n", + " if path == 'pathogenic' and diseases:\n", + " return f\"{path}; {disease_text}\"\n", + " else:\n", + " return path\n", + "\n", + "# Apply to your DataFrame\n", + "training_df['answer'] = training_df.apply(format_answer, axis=1)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "disease_name\n", + "not_provided 73241\n", + "not_specified|not_provided 6405\n", + "not_provided|not_specified 5466\n", + "Inborn_genetic_diseases|not_provided 2289\n", + "not_provided|Inborn_genetic_diseases 1929\n", + " ... 
\n", + "not_provided|VAMP7-related_disorder 1\n", + "46,XY_sex_reversal_1|not_provided 1\n", + "Hereditary_factor_VIII_deficiency_disease|Thrombophilia,_X-linked,_due_to_factor_8_defect|not_provided 1\n", + "Mendelian_susceptibility_to_mycobacterial_diseases_due_to_complete_ISG15_deficiency|not_specified|not_provided 1\n", + "not_provided|not_specified|Mendelian_susceptibility_to_mycobacterial_diseases_due_to_complete_ISG15_deficiency 1\n", + "Name: count, Length: 87193, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_df['disease_name'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/x86-64-v3, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic\n", + "Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/networkx-3.4.2+computecanada-py3-none-any.whl\n", + "Installing collected packages: networkx\n", + "Successfully installed networkx-3.4.2+computecanada\n" + ] + } + ], + "source": [ + "!pip install --no-index networkx\n", + "import networkx as nx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "disjoint diseases" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting split assignment...\n", + "Step 1/5: Building graph & disease→row index mapping (excluding specials)...\n", + " → Built graph with 13326 nodes, 48265 edges in 1.0s\n", + "Step 2/5: Checking for existing disconnected components...\n", + " → Found 3099 components; skipping node removal.\n", + "Step 5/5: Assigning rows to splits…\n", + "Done! 
import pandas as pd
import networkx as nx
import itertools
import numpy as np
import re
import time
from tqdm import tqdm
from math import comb
import multiprocessing as mp
from functools import partial
from collections import defaultdict

# Disease labels that are ignored when building the disease graph; rows that
# carry only these labels are distributed at random in the final step.
special_diseases = {"not_provided", "not_specified", "Inborn_genetic_diseases", "See_cases"}


def _rows_covered(component, disease_to_rows):
    """Return the set of row positions that mention any disease in `component`."""
    rows = set()
    for disease in component:
        rows |= disease_to_rows.get(disease, set())
    return rows


def _evaluate_subset(subset, G, disease_to_rows, train_frac):
    """
    Worker that scores the removal of one node subset from the disease graph.

    A copy of `G` has `subset` removed. If the remainder has fewer than two
    connected components the removal is useless and None is returned.
    Otherwise the two components covering the most rows are treated as
    train (largest) and test (second largest), and the score is the absolute
    distance between their train share and `train_frac` (lower is better).

    Returns (score, subset, components) or None.
    """
    H = G.copy()
    H.remove_nodes_from(subset)
    ccs = list(nx.connected_components(H))
    if len(ccs) < 2:
        return None

    # unique row counts per component, largest first
    sizes = sorted((len(_rows_covered(comp, disease_to_rows)) for comp in ccs), reverse=True)
    train_count, test_count = sizes[0], sizes[1]
    total = train_count + test_count
    if total == 0:
        # no rows behind either component: nothing to score
        return None
    score = abs(train_count / total - train_frac)
    return (score, subset, ccs)


def assign_disjoint_splits(
    df: pd.DataFrame,
    special_diseases: set,
    train_frac: float = 0.9,
    max_remove: int = 3,
    random_state: int = 42,
    n_procs: int = 24
) -> tuple[pd.DataFrame, dict]:
    """
    Add a 'split' column (0=train, 1=test) to a copy of `df`.

    A graph is built whose nodes are the disease names found in
    df['disease_name'] ('|'-separated) that are not in `special_diseases`;
    two diseases are linked when they occur in the same row.

    - If the graph already has >= 2 connected components, whole components
      are greedily packed (largest first) into train until the share of
      non-special rows would exceed `train_frac`; everything else is test.
    - If the graph is connected, subsets of 1..max_remove nodes are removed
      (searched in parallel with `n_procs` processes) until some removal
      disconnects it; the two largest resulting components become train/test.
    - Rows matched to neither side (only special diseases, or after a node
      removal only dropped / left-over-component diseases) are assigned at
      random so the overall train share approaches `train_frac`.

    NOTE(review): no stratification is performed, so SNV/non-SNV and
    pathogenic/benign proportions are not guaranteed to match across splits.
    NOTE(review): "dropped" diseases are only ignored for grouping; their
    rows are kept, so such a disease can still occur in both splits.

    Returns (df_out, info) where info has:
      'dropped_nodes'     - diseases removed from the graph ([] if none),
      'dropped_row_count' - number of distinct rows mentioning a dropped disease,
      'achieved_frac'     - fraction of rows with split == 0 (train).

    Raises ValueError if the graph is connected and no removal of up to
    `max_remove` nodes splits it into at least two components.

    Prints progress at every major step.
    """
    rng = np.random.RandomState(random_state)
    start_time = time.time()
    print("Starting split assignment...")

    # 1) Build graph and disease→rows mapping
    print("Step 1/5: Building graph & disease→row index mapping (excluding specials)...")
    G = nx.Graph()
    disease_to_rows = defaultdict(set)
    for idx, name_str in enumerate(df['disease_name']):
        names = name_str.split('|')
        non_special = [d for d in names if d not in special_diseases]
        for d in non_special:
            disease_to_rows[d].add(idx)
            G.add_node(d)
        for u, v in itertools.combinations(non_special, 2):
            G.add_edge(u, v)
    elapsed = time.time() - start_time
    print(f" → Built graph with {G.number_of_nodes()} nodes, {G.number_of_edges()} edges in {elapsed:.1f}s")

    # 2) Check connectivity
    print("Step 2/5: Checking for existing disconnected components...")
    comps = list(nx.connected_components(G))
    if not comps:
        # every row carries only special diseases: nothing to keep disjoint
        print(" → No non-special diseases found; all rows will be assigned at random.")
        train_comp, test_comp, dropped = set(), set(), []
    elif len(comps) >= 2:
        print(f" → Found {len(comps)} components; skipping node removal.")
        # rows-per-component sets, largest component first
        comp_rows = [(comp, _rows_covered(comp, disease_to_rows)) for comp in comps]

        # total non-special rows
        total_ns_rows = len(set().union(*(rows for _, rows in comp_rows)))
        target_train_ns = train_frac * total_ns_rows

        comp_rows.sort(key=lambda x: len(x[1]), reverse=True)

        # greedy pack to hit target_train_ns; the first (largest) component is
        # always taken so that train is never empty
        train_comp = set()
        train_rows = set()
        for comp, rows_set in comp_rows:
            if len(train_rows | rows_set) <= target_train_ns or not train_rows:
                train_comp |= comp
                train_rows |= rows_set

        all_nodes = set(G.nodes())
        test_comp = all_nodes - train_comp
        dropped = []
    else:
        # 3) Removal search
        print("Step 3/5: Graph is connected; searching for node removals…")
        best = {'score': float('inf')}
        all_nodes = list(G.nodes())
        worker = partial(_evaluate_subset,
                         G=G,
                         disease_to_rows=disease_to_rows,
                         train_frac=train_frac)
        for k in range(1, max_remove + 1):
            total_combs = comb(len(all_nodes), k)
            print(f" → Trying removals of size {k} ({total_combs} combos)…")
            with mp.Pool(processes=n_procs) as pool:
                for result in tqdm(pool.imap_unordered(worker, itertools.combinations(all_nodes, k)),
                                   total=total_combs,
                                   desc=f" size={k}"):
                    if not result:
                        continue
                    score, subset, ccs = result
                    if score < best['score']:
                        best.update(score=score, subset=subset, components=ccs)
            elapsed_k = time.time() - start_time
            print(f" → Done size-{k} in {elapsed_k:.1f}s; best score = {best['score']:.4f}")
            # stop at the smallest removal size that disconnects the graph
            if best['score'] < float('inf'):
                break

        if 'subset' not in best:
            raise ValueError(
                f"No removal of up to {max_remove} disease node(s) disconnects the graph; "
                "increase max_remove or extend special_diseases."
            )

        dropped = list(best['subset'])
        comps = best['components']

        # 4) select two largest comps
        print("Step 4/5: Selecting two largest components for train/test…")
        comp_counts = [(comp, _rows_covered(comp, disease_to_rows)) for comp in comps]
        comp_counts.sort(key=lambda x: len(x[1]), reverse=True)
        train_comp, test_comp = comp_counts[0][0], comp_counts[1][0]

    # 5) Assign rows
    print("Step 5/5: Assigning rows to splits…")

    def which_split(dlist):
        # train membership wins; None marks rows to be filled at random below
        non_special = [d for d in dlist if d not in special_diseases]
        if any(d in train_comp for d in non_special):
            return 0
        if any(d in test_comp for d in non_special):
            return 1
        return None

    df_out = df.copy()
    df_out['split'] = df_out['disease_name'].str.split('|').apply(which_split)

    # fill None rows to move the overall train share toward train_frac
    mask_none = df_out['split'].isna()
    n_none = int(mask_none.sum())
    n_train_desired = int(train_frac * len(df_out))
    n_current_train = int((df_out['split'] == 0).sum())
    # clamp to the number of free rows so the assignment array always has
    # exactly n_none entries
    n_to_train = min(n_none, max(0, n_train_desired - n_current_train))
    assign = np.array([0] * n_to_train + [1] * (n_none - n_to_train))
    rng.shuffle(assign)
    df_out.loc[mask_none, 'split'] = assign
    df_out['split'] = df_out['split'].astype(int)

    # split is 0 for train, so the train share is the share of zeros
    # (the plain mean of the column would be the *test* share)
    achieved_train_frac = float((df_out['split'] == 0).mean())

    total_elapsed = time.time() - start_time
    print(f"Done! Total time: {total_elapsed:.1f}s; achieved train fraction = {achieved_train_frac:.4f}")

    info = {
        'dropped_nodes': dropped,
        # distinct rows, so a row naming two dropped diseases counts once
        'dropped_row_count': len(_rows_covered(dropped, disease_to_rows)),
        'achieved_frac': achieved_train_frac
    }
    return df_out, info


# ── Usage ──
new_df, report = assign_disjoint_splits(
    training_df,
    special_diseases,
    train_frac=0.9,
    max_remove=3,
    random_state=42,
    n_procs=24
)
print("Dropped diseases:", report['dropped_nodes'])
print("Rows dropped:", report['dropped_row_count'])
print(f"Final train fraction: {report['achieved_frac']:.3f}")

# ── Inspection cells ──
new_df['split'].value_counts()

new_df[new_df['split']==1]['disease_name'].unique()
# ── Inspection cell ──
new_df['split'].value_counts()

import pandas as pd

# new_df is expected to carry a 'split' column (0=train, 1=test)


def print_ratio_stats(df, split_label):
    """
    Print class counts and ratios for the rows of `df` in one split.

    For the rows where df['split'] == split_label, prints a header with the
    row count, then the value counts and count/row-count ratios of the
    'cleaned_pathogenicity' column followed by the same for 'variant_type'.
    """
    subset = df[df['split'] == split_label]
    n_rows = len(subset)
    print(f"\n=== Split {split_label} (n={n_rows}) ===")

    # (column, counts header, ratios header) — pathogenicity first, then
    # SNV vs. non-SNV
    sections = (
        ('cleaned_pathogenicity', "\nPathogenicity counts:", "\nPathogenicity ratios:"),
        ('variant_type', "\nVariant-type counts:", "\nVariant-type ratios:"),
    )
    for column, counts_header, ratios_header in sections:
        counts = subset[column].value_counts()
        ratios = counts / n_rows
        print(counts_header)
        print(counts.to_string())
        print(ratios_header)
        print(ratios.to_string())


# Per-split summaries
for label in (0, 1):  # 0 = train, 1 = test
    print_ratio_stats(new_df, label)

# Row-normalised cross-tabs for a side-by-side view
print("\nCross-tab: split × pathogenicity")
print(pd.crosstab(new_df['split'], new_df['cleaned_pathogenicity'], normalize='index'))

print("\nCross-tab: split × variant_type")
print(pd.crosstab(new_df['split'], new_df['variant_type'], normalize='index'))


# ── Build the per-split / per-variant-type frames ──
final_df = new_df.copy()[['question', 'answer', 'reference_sequence', 'mutated_sequence', 'split', 'variant_type', 'cleaned_pathogenicity']]

# if len(final_df['variant_type'].value_counts().keys().tolist()) > 2:
#     raise ValueError("variant_type has more than 2 values, should just be SNV and non_SNV")

# train/test frames, without the now-redundant 'split' column
is_train = final_df['split'] == 0
is_test = final_df['split'] == 1
train_split_df = final_df[is_train].drop('split', axis=1)
test_split_df = final_df[is_test].drop('split', axis=1)

# SNV / non-SNV subsets of each split, without the 'variant_type' column
train_is_snv = train_split_df['variant_type'] == 'SNV'
train_is_non_snv = train_split_df['variant_type'] == 'non_SNV'
test_is_snv = test_split_df['variant_type'] == 'SNV'
test_is_non_snv = test_split_df['variant_type'] == 'non_SNV'

snv_train_split_df = train_split_df[train_is_snv].drop('variant_type', axis=1)
non_snv_train_split_df = train_split_df[train_is_non_snv].drop('variant_type', axis=1)
snv_test_split_df = test_split_df[test_is_snv].drop('variant_type', axis=1)
non_snv_test_split_df = test_split_df[test_is_non_snv].drop('variant_type', axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswerreference_sequencemutated_sequencecleaned_pathogenicity
0Assess the variant on chromosome 1, position 9...benignGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...benign
1Gene SAMD11 (sterile alpha motif domain contai...benignTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA...TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA...benign
2The mutation in gene SAMD11 (sterile alpha mot...benignCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...benign
3Determine whether the variant at chromosome 1,...benignGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...benign
4Variant on chromosome 1, at position 935779, a...benignCCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG...CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG...benign
..................
342678Variant at chromosome X, position 155524483, g...benignGTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA...GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA...benign
342680Mutation at chromosome X, position 155900534, ...benignAGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG...AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG...benign
342681Does the variant on chromosome X at location 1...benignCAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC...CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC...benign
342685Assess the clinical significance (benign or pa...benignTTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA...TTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA...benign
342686Is the variant located on chromosome Y at posi...benignAGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG...AGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG...benign
\n", + "

32454 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " question answer \\\n", + "0 Assess the variant on chromosome 1, position 9... benign \n", + "1 Gene SAMD11 (sterile alpha motif domain contai... benign \n", + "2 The mutation in gene SAMD11 (sterile alpha mot... benign \n", + "3 Determine whether the variant at chromosome 1,... benign \n", + "4 Variant on chromosome 1, at position 935779, a... benign \n", + "... ... ... \n", + "342678 Variant at chromosome X, position 155524483, g... benign \n", + "342680 Mutation at chromosome X, position 155900534, ... benign \n", + "342681 Does the variant on chromosome X at location 1... benign \n", + "342685 Assess the clinical significance (benign or pa... benign \n", + "342686 Is the variant located on chromosome Y at posi... benign \n", + "\n", + " reference_sequence \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "1 TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "4 CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG... \n", + "... ... \n", + "342678 GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA... \n", + "342680 AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG... \n", + "342681 CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC... \n", + "342685 TTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA... \n", + "342686 AGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG... \n", + "\n", + " mutated_sequence \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "1 TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "4 CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG... \n", + "... ... \n", + "342678 GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA... \n", + "342680 AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG... \n", + "342681 CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC... 
# save all the final dataframes to parquet files
snv_train_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/snv_train_split_df.parquet')
non_snv_train_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/non_snv_train_split_df.parquet')
snv_test_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/snv_test_split_df.parquet')
non_snv_test_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/non_snv_test_split_df.parquet')

# now upload to huggingface
# Install into the interpreter that is running this code (portable
# replacement for the notebook-only `!pip install --no-index huggingface-hub`);
# a failed install is not fatal here, the import below will report it.
import subprocess
import sys
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--no-index", "huggingface-hub"],
    check=False,
)
from huggingface_hub import HfApi
import os
import glob

# 0) config
repo_id = "wanglab/bioR_tasks"  # your dataset repo
repo_type = "dataset"
subfolder = "task4-variant_effect_non_snv_and_snv_with_split"
# NOTE(review): despite its name this is a single parquet file, and it is the
# windowed ClinVar file rather than one of the four split files saved above —
# confirm this is the file that should land in `subfolder`.
local_dir = "SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet"

api = HfApi()

# 1) list all files in that subfolder
all_files = api.list_repo_files(repo_id, repo_type=repo_type)
old_files = [f for f in all_files if f.startswith(subfolder + "/")]

print(f"Will delete {len(old_files)} old files:")
for f in old_files:
    print(" ", f)

# 2) delete them (one commit per file)
for f in old_files:
    api.delete_file(
        path_in_repo=f,
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message="remove old dataset file"
    )

# 3) upload your single Parquet file
new_file = local_dir  # single source of truth for the upload path
basename = os.path.basename(new_file)
dest_path = f"{subfolder}/{basename}"

print(f"Uploading {new_file!r} to {repo_id}/{dest_path} …")
api.upload_file(
    path_or_fileobj=new_file,
    path_in_repo=dest_path,
    repo_id=repo_id,
    repo_type=repo_type,
    commit_message=f"add updated parquet {basename}"
)

print("Done! Your dataset has been updated on the Hub.")
If pathogenic, specify the associated condition(s).',\n", + " 'answer': 'benign',\n", + " 'reference_sequence': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAG
ACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCGGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCA
GATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'mutated_sequence': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCG
GCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCAGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCA
ATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_training_df.iloc[0][['question', 'answer', 'reference_sequence', 'mutated_sequence']].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clinvar_id': '1170208',\n", + " 'original_window': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCT
GTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCGGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAA
AAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'mutated_window': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCAC
CCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCAGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTG
GGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'cleaned_pathogenicity': 'benign',\n", + " 'disease_name': 'SAMD11-related_disorder|not_provided',\n", + " 'gene_name': 'SAMD11',\n", + " 'gene_desc': 'sterile alpha motif domain containing 11',\n", + " 'chromosome': '1',\n", + " 'chromosome_position': '930204',\n", + " 'variant_type': 'SNV',\n", + " 'clinvar_link': 'https://www.ncbi.nlm.nih.gov/clinvar/variation/1170208/',\n", + " 'gene_id': '148398',\n", + " 'mutation_instruction': 'G>A',\n", + " 'pathogenicity': 'benign',\n", + " 'review_status': 'criteria_provided,_multiple_submitters,_no_conflicts'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "benign 80\n", + "pathogenic; ['Intellectual_disability,_X-linked_102'] 1\n", + "pathogenic; ['Familial_adenomatous_polyposis_2', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + 
"pathogenic; ['Familial_thoracic_aortic_aneurysm_and_aortic_dissection', 'Hereditary_cancer-predisposing_syndrome', 'Juvenile_polyposis_syndrome'] 1\n", + "pathogenic; ['Familial_cancer_of_breast', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + "pathogenic; ['Bardet-Biedl_syndrome_2', 'Retinitis_pigmentosa_74'] 1\n", + "pathogenic; ['Early-onset_retinal_dystrophy', 'Leber_congenital_amaurosis', 'Leber_congenital_amaurosis_8', 'Pigmented_paravenous_retinochoroidal_atrophy', 'Retinal_dystrophy', 'Retinitis_pigmentosa_12'] 1\n", + "pathogenic; ['Autosomal_recessive_limb-girdle_muscular_dystrophy_type_2E'] 1\n", + "pathogenic; ['Childhood_Onset_Dystonias', 'Dystonia,_childhood-onset,_with_optic_atrophy_and_basal_ganglia_abnormalities', 'MECR-related_disorder', 'Mitochondrial_disease', 'Optic_atrophy'] 1\n", + "pathogenic; ['Autoimmune_thyroid_disease,_susceptibility_to,_3', 'Iodotyrosyl_coupling_defect'] 1\n", + "pathogenic; ['Duchenne_muscular_dystrophy'] 1\n", + "pathogenic; ['Ataxia-telangiectasia_syndrome', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + "pathogenic; ['Autosomal_dominant_nonsyndromic_hearing_loss_6', 'Cataract_41', 'Type_2_diabetes_mellitus', 'Wolfram-like_syndrome', 'Wolfram_syndrome_1'] 1\n", + "pathogenic; ['Autosomal_recessive_limb-girdle_muscular_dystrophy_type_2B', 'Distal_myopathy_with_anterior_tibial_onset', 'Miyoshi_muscular_dystrophy_1'] 1\n", + "pathogenic; ['Breast-ovarian_cancer,_familial,_susceptibility_to,_1'] 1\n", + "pathogenic; ['Arterial_calcification,_generalized,_of_infancy,_2', 'Autosomal_recessive_inherited_pseudoxanthoma_elasticum', 'Pseudoxanthoma_elasticum,_forme_fruste'] 1\n", + "pathogenic; ['Hereditary_cancer-predisposing_syndrome', 'Juvenile_polyposis_syndrome'] 1\n", + "pathogenic; ['Monogenic_diabetes'] 1\n", + "pathogenic; ['Autosomal_recessive_osteopetrosis_1'] 1\n", + "pathogenic; ['Autosomal_dominant_nonsyndromic_hearing_loss_11', 'Autosomal_recessive_nonsyndromic_hearing_loss_2', 'Rare_genetic_deafness', 
'Retinal_dystrophy', 'Usher_syndrome_type_1'] 1\n", + "pathogenic; ['Wilson_disease'] 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_df['answer'].sample(100).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "visualization of table" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clinvar_idoriginal_windowmutated_windowcleaned_pathogenicitydisease_namevariant_typeclinvar_linkmutation_instructionpathogenicityreview_status
01170208GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Abenigncriteria_provided,_multiple_submitters,_no_con...
21170010CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
31170044GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...benignnot_provided|SAMD11-related_disorderSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
51170011AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG...AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Gbenigncriteria_provided,_multiple_submitters,_no_con...
71169668GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC...GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC...benignnot_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
.................................
342875522717TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA...TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA...benignMitochondrial_disease|not_specifiedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Abenigncriteria_provided,_multiple_submitters,_no_con...
34287865510CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC...CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC...benignLeber_optic_atrophy|Leigh_syndrome|Mitochondri...SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...T>Cbenignreviewed_by_expert_panel
342905140592AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC...AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC...benignFamilial_cancer_of_breast|Mitochondrial_diseas...SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>Gbenignreviewed_by_expert_panel
342907235623TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT...TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT...benignLeigh_syndrome|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>Gbenigncriteria_provided,_multiple_submitters,_no_con...
342909252455AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC...AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC...benignnot_specified|Leigh_syndromeSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Cbenigncriteria_provided,_multiple_submitters,_no_con...
\n", + "

93800 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " clinvar_id original_window \\\n", + "0 1170208 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "2 1170010 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 1170044 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "5 1170011 AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG... \n", + "7 1169668 GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC... \n", + "... ... ... \n", + "342875 522717 TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA... \n", + "342878 65510 CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC... \n", + "342905 140592 AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC... \n", + "342907 235623 TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT... \n", + "342909 252455 AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC... \n", + "\n", + " mutated_window \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "5 AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG... \n", + "7 GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC... \n", + "... ... \n", + "342875 TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA... \n", + "342878 CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC... \n", + "342905 AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC... \n", + "342907 TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT... \n", + "342909 AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC... \n", + "\n", + " cleaned_pathogenicity \\\n", + "0 benign \n", + "2 benign \n", + "3 benign \n", + "5 benign \n", + "7 benign \n", + "... ... \n", + "342875 benign \n", + "342878 benign \n", + "342905 benign \n", + "342907 benign \n", + "342909 benign \n", + "\n", + " disease_name variant_type \\\n", + "0 SAMD11-related_disorder|not_provided SNV \n", + "2 SAMD11-related_disorder|not_provided SNV \n", + "3 not_provided|SAMD11-related_disorder SNV \n", + "5 SAMD11-related_disorder|not_provided SNV \n", + "7 not_provided SNV \n", + "... ... ... 
\n", + "342875 Mitochondrial_disease|not_specified SNV \n", + "342878 Leber_optic_atrophy|Leigh_syndrome|Mitochondri... SNV \n", + "342905 Familial_cancer_of_breast|Mitochondrial_diseas... SNV \n", + "342907 Leigh_syndrome|not_provided SNV \n", + "342909 not_specified|Leigh_syndrome SNV \n", + "\n", + " clinvar_link \\\n", + "0 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "2 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "3 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "5 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "7 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "... ... \n", + "342875 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342878 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342905 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342907 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342909 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "\n", + " mutation_instruction pathogenicity \\\n", + "0 G>A benign \n", + "2 C>T benign \n", + "3 C>T benign \n", + "5 C>G benign \n", + "7 C>T benign \n", + "... ... ... \n", + "342875 G>A benign \n", + "342878 T>C benign \n", + "342905 A>G benign \n", + "342907 A>G benign \n", + "342909 G>C benign \n", + "\n", + " review_status \n", + "0 criteria_provided,_multiple_submitters,_no_con... \n", + "2 criteria_provided,_multiple_submitters,_no_con... \n", + "3 criteria_provided,_multiple_submitters,_no_con... \n", + "5 criteria_provided,_multiple_submitters,_no_con... \n", + "7 criteria_provided,_multiple_submitters,_no_con... \n", + "... ... \n", + "342875 criteria_provided,_multiple_submitters,_no_con... \n", + "342878 reviewed_by_expert_panel \n", + "342905 reviewed_by_expert_panel \n", + "342907 criteria_provided,_multiple_submitters,_no_con... \n", + "342909 criteria_provided,_multiple_submitters,_no_con... 
\n", + "\n", + "[93800 rows x 10 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['pathogenicity']=='benign']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clinvar_idoriginal_windowmutated_windowcleaned_pathogenicitydisease_namevariant_typeclinvar_linkmutation_instructionpathogenicityreview_status
421185392TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA...TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA...benignMendelian_susceptibility_to_mycobacterial_dise...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...T>TAbenigncriteria_provided,_multiple_submitters,_no_con...
67666960TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG...TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG...pathogenicCongenital_myasthenic_syndrome|Congenital_myas...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>GGGGCCpathogenic/likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
69970311ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG...ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG...pathogenicCongenital_myasthenic_syndrome_8non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>GCpathogenic/likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
80930633GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA...GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA...pathogenicPresynaptic_congenital_myasthenic_syndrome|Con...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CGCTCCGGCCAGTGCCAGGGTCGAGGTGAGCGGCTCCCCCGGGGGA...likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
90263160TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT...TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT...benignnot_provided|not_specified|Congenital_myasthen...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CCT>Cbenigncriteria_provided,_multiple_submitters,_no_con...
.................................
3428449654TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC...TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC...pathogenicMitochondrial_disease|Mitochondrial_complex_IV...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...TTTTTTCTTCGCAGGA>Tlikely_pathogenicreviewed_by_expert_panel
3428459656CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA...CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA...pathogenicMitochondrial_disease|Mitochondrial_complex_IV...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>AClikely_pathogenicreviewed_by_expert_panel
342876693440ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG...ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG...pathogenicMitochondrial_myopathy_with_reversible_cytochr...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CA>Clikely_pathogenicreviewed_by_expert_panel
342895800503ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG...ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG...pathogenicMitochondrial_diseasenon_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CTA>Clikely_pathogenicreviewed_by_expert_panel
3429019686TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA...TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA...pathogenicMitochondrial_disease|Parkinsonism/MELAS_overl...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...AAATT>Alikely_pathogenicreviewed_by_expert_panel
\n", + "

36097 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " clinvar_id original_window \\\n", + "42 1185392 TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA... \n", + "67 666960 TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG... \n", + "69 970311 ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG... \n", + "80 930633 GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA... \n", + "90 263160 TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT... \n", + "... ... ... \n", + "342844 9654 TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC... \n", + "342845 9656 CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA... \n", + "342876 693440 ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG... \n", + "342895 800503 ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG... \n", + "342901 9686 TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA... \n", + "\n", + " mutated_window \\\n", + "42 TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA... \n", + "67 TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG... \n", + "69 ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG... \n", + "80 GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA... \n", + "90 TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT... \n", + "... ... \n", + "342844 TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC... \n", + "342845 CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA... \n", + "342876 ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG... \n", + "342895 ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG... \n", + "342901 TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA... \n", + "\n", + " cleaned_pathogenicity \\\n", + "42 benign \n", + "67 pathogenic \n", + "69 pathogenic \n", + "80 pathogenic \n", + "90 benign \n", + "... ... \n", + "342844 pathogenic \n", + "342845 pathogenic \n", + "342876 pathogenic \n", + "342895 pathogenic \n", + "342901 pathogenic \n", + "\n", + " disease_name variant_type \\\n", + "42 Mendelian_susceptibility_to_mycobacterial_dise... non_SNV \n", + "67 Congenital_myasthenic_syndrome|Congenital_myas... 
non_SNV \n", + "69 Congenital_myasthenic_syndrome_8 non_SNV \n", + "80 Presynaptic_congenital_myasthenic_syndrome|Con... non_SNV \n", + "90 not_provided|not_specified|Congenital_myasthen... non_SNV \n", + "... ... ... \n", + "342844 Mitochondrial_disease|Mitochondrial_complex_IV... non_SNV \n", + "342845 Mitochondrial_disease|Mitochondrial_complex_IV... non_SNV \n", + "342876 Mitochondrial_myopathy_with_reversible_cytochr... non_SNV \n", + "342895 Mitochondrial_disease non_SNV \n", + "342901 Mitochondrial_disease|Parkinsonism/MELAS_overl... non_SNV \n", + "\n", + " clinvar_link \\\n", + "42 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "67 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "69 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "80 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "90 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "... ... \n", + "342844 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342845 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342876 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342895 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342901 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "\n", + " mutation_instruction \\\n", + "42 T>TA \n", + "67 G>GGGGCC \n", + "69 G>GC \n", + "80 CGCTCCGGCCAGTGCCAGGGTCGAGGTGAGCGGCTCCCCCGGGGGA... \n", + "90 CCT>C \n", + "... ... \n", + "342844 TTTTTTCTTCGCAGGA>T \n", + "342845 A>AC \n", + "342876 CA>C \n", + "342895 CTA>C \n", + "342901 AAATT>A \n", + "\n", + " pathogenicity \\\n", + "42 benign \n", + "67 pathogenic/likely_pathogenic \n", + "69 pathogenic/likely_pathogenic \n", + "80 likely_pathogenic \n", + "90 benign \n", + "... ... \n", + "342844 likely_pathogenic \n", + "342845 likely_pathogenic \n", + "342876 likely_pathogenic \n", + "342895 likely_pathogenic \n", + "342901 likely_pathogenic \n", + "\n", + " review_status \n", + "42 criteria_provided,_multiple_submitters,_no_con... 
\n", + "67 criteria_provided,_multiple_submitters,_no_con... \n", + "69 criteria_provided,_multiple_submitters,_no_con... \n", + "80 criteria_provided,_multiple_submitters,_no_con... \n", + "90 criteria_provided,_multiple_submitters,_no_con... \n", + "... ... \n", + "342844 reviewed_by_expert_panel \n", + "342845 reviewed_by_expert_panel \n", + "342876 reviewed_by_expert_panel \n", + "342895 reviewed_by_expert_panel \n", + "342901 reviewed_by_expert_panel \n", + "\n", + "[36097 rows x 10 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['variant_type']=='non_SNV']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "variant_type\n", + "SNV 306816\n", + "non_SNV 36097\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['variant_type'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['likely_benign', 'benign', 'benign/likely_benign', 'pathogenic',\n", + " 'pathogenic/likely_pathogenic', 'likely_pathogenic',\n", + " 'pathogenic|drug_response', 'likely_pathogenic|drug_response',\n", + " 'benign/likely_benign|other', 'likely_benign|other', 'benign|other',\n", + " 'pathogenic/likely_pathogenic|other', 'pathogenic|other',\n", + " 'benign|association', 'likely_benign|drug_response|other',\n", + " 'pathogenic/likely_pathogenic|risk_factor', 'benign|drug_response',\n", + " 'benign/likely_benign|drug_response|other',\n", + " 'likely_pathogenic|risk_factor', 'pathogenic|risk_factor',\n", + " 'benign/likely_benign|drug_response', 'benign|risk_factor',\n", + " 'likely_benign|association', 'benign/likely_benign|other|risk_factor',\n", + " 'benign/likely_benign|association', 'likely_pathogenic|affects',\n", + " 
'likely_pathogenic|other', 'benign/likely_benign|risk_factor',\n", + " 'likely_pathogenic|association',\n", + " 'pathogenic/likely_pathogenic|association',\n", + " 'benign|confers_sensitivity', 'likely_benign|risk_factor'],\n", + " dtype='object', name='pathogenicity')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['pathogenicity'].value_counts().keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total variants: 3,493,400\n", + "\n", + "Variant type counts:\n" + ] + }, + { + "data": { + "text/plain": [ + "variant_type\n", + "SNV 3226063\n", + "non_SNV 267337\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Pathogenicity counts:\n" + ] + }, + { + "data": { + "text/plain": [ + "pathogenicity\n", + "not_pathogenic 3043681\n", + "pathogenic 449719\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Top 10 disease names:\n" + ] + }, + { + "data": { + "text/plain": [ + "disease_name\n", + "not_provided 861927\n", + "not_specified 719547\n", + "Inborn_genetic_diseases 133139\n", + "Hereditary_cancer-predisposing_syndrome 47592\n", + "Cardiovascular_phenotype 25149\n", + "Primary_ciliary_dyskinesia 17996\n", + "Inborn_genetic_diseases|not_provided 16863\n", + "not_specified|not_provided 16518\n", + "not_provided|Inborn_genetic_diseases 15874\n", + "not_provided|not_specified 14489\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Length‐difference (alt − ref) distribution:\n" + ] + }, + { + "data": { + "text/plain": [ + 
"len_diff\n", + "-2046 1\n", + "-2037 1\n", + "-2032 1\n", + "-2031 1\n", + "-2030 1\n", + " ..\n", + " 1951 1\n", + " 1989 1\n", + " 1992 1\n", + " 2004 1\n", + " 2019 1\n", + "Name: count, Length: 1266, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "SNVs: 3,226,063 → Transitions: 2,104,260 Transversions: 1,121,803\n", + "\n", + "Original‐window GC content (sample):\n" + ] + }, + { + "data": { + "text/plain": [ + "count 10000.000000\n", + "mean 0.471380\n", + "std 0.094873\n", + "min 0.244629\n", + "25% 0.389404\n", + "50% 0.461914\n", + "75% 0.548340\n", + "max 0.744385\n", + "Name: orig_gc, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Mutated‐window GC content (sample):\n" + ] + }, + { + "data": { + "text/plain": [ + "count 10000.000000\n", + "mean 0.471290\n", + "std 0.094818\n", + "min 0.244385\n", + "25% 0.389404\n", + "50% 0.461792\n", + "75% 0.548157\n", + "max 0.744385\n", + "Name: mut_gc, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Non-SNV events by net length change:\n", + " Insertions (len_diff>0) : 86,857\n", + " Deletions (len_diff<0) : 169,730\n", + " Balanced Delins (len_diff=0) : 10,750\n" + ] + } + ], + "source": [ + "# ─── Basic cohort statistics ─────────────────────────────────\n", + "\n", + "print(f\"Total variants: {len(df):,}\\n\")\n", + "\n", + "# Variant type\n", + "print(\"Variant type counts:\")\n", + "display(df['variant_type'].value_counts())\n", + "\n", + "# Pathogenicity\n", + "print(\"\\nPathogenicity counts:\")\n", + "display(df['pathogenicity'].value_counts())\n", + "\n", + "# Top diseases\n", + "print(\"\\nTop 10 disease names:\")\n", + "display(df['disease_name']\n", + " .replace('', 'Unknown') # collapse blanks\n", + 
" .value_counts()\n", + " .head(10))\n", + "\n", + "# ─── Indel vs. SNP breakdown ────────────────────────────────\n", + "\n", + "# parse ref/alt lengths\n", + "ref_alt = df['mutation_instruction'].str.split('>', expand=True)\n", + "df['ref_len'] = ref_alt[0].str.len().astype(int)\n", + "df['alt_len'] = ref_alt[1].str.len().astype(int)\n", + "df['len_diff'] = df['alt_len'] - df['ref_len']\n", + "\n", + "print(\"\\nLength‐difference (alt − ref) distribution:\")\n", + "display(df['len_diff']\n", + " .value_counts()\n", + " .sort_index())\n", + "\n", + "# ─── Transition / transversion in SNVs ─────────────────────\n", + "\n", + "# only look at true SNVs (ref_len==alt_len==1)\n", + "snv = df[(df['variant_type']=='SNV') & (df['len_diff']==0)].copy()\n", + "def is_transition(instr):\n", + " pur = {'A','G'}\n", + " pyr = {'C','T'}\n", + " r,a = instr.split('>')\n", + " return (r in pur and a in pur) or (r in pyr and a in pyr)\n", + "\n", + "snv['is_transition'] = snv['mutation_instruction'].map(is_transition)\n", + "t1 = snv['is_transition'].sum()\n", + "t2 = (~snv['is_transition']).sum()\n", + "print(f\"\\nSNVs: {len(snv):,} → Transitions: {t1:,} Transversions: {t2:,}\\n\")\n", + "\n", + "# ─── GC‐content in windows (sampled) ─────────────────────────\n", + "\n", + "# sampling to speed up\n", + "sample = df.sample(min(len(df), 10000), random_state=0)\n", + "def gc_frac(s): return (s.count('G')+s.count('C'))/len(s)\n", + "\n", + "sample['orig_gc'] = sample['original_window'].map(gc_frac)\n", + "sample['mut_gc' ] = sample['mutated_window'].map(gc_frac)\n", + "\n", + "print(\"Original‐window GC content (sample):\")\n", + "display(sample['orig_gc'].describe())\n", + "\n", + "print(\"\\nMutated‐window GC content (sample):\")\n", + "display(sample['mut_gc'].describe())\n", + "\n", + "\n", + "# ─── Better Non-SNV event breakdown ────────────────────────────────\n", + "\n", + "non_snv = df[df['variant_type'] != 'SNV']\n", + "\n", + "# counts\n", + "n_ins = (non_snv['len_diff'] > 
0).sum()\n", + "n_del = (non_snv['len_diff'] < 0).sum()\n", + "n_bal = ((non_snv['len_diff']==0) & (non_snv['ref_len']>1)).sum()\n", + "\n", + "print(\"Non-SNV events by net length change:\")\n", + "print(f\" Insertions (len_diff>0) : {n_ins:,}\")\n", + "print(f\" Deletions (len_diff<0) : {n_del:,}\")\n", + "print(f\" Balanced Delins (len_diff=0) : {n_bal:,}\")\n", + "\n", + "# catch any explicit VCF-style inversions () if they exist\n", + "n_inv = df['mutation_instruction'].str.contains('').sum()\n", + "if n_inv:\n", + " print(f\" Inversions : {n_inv:,}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mutation_instructionoriginal_windowmutated_windowpathogenicitydisease_namevariant_typeref_lenalt_lenlen_diffabs_len_diff
370378A>CGAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC...GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC...not_pathogenicHereditary_cancer-predisposing_syndromeSNV1100
47441C>ATCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT...TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT...not_pathogenicnot_specifiedSNV1100
2370658C>GACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA...ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA...not_pathogenicDICER1-related_tumor_predispositionSNV1100
2479341C>ATGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC...TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC...not_pathogenicnot_specifiedSNV1100
2340733G>ATAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG...TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG...not_pathogenicnot_specifiedSNV1100
312980C>TGTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG...GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG...not_pathogenicIntellectual_developmental_disorder_with_micro...SNV1100
1829920T>GGAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT...GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT...not_pathogenicDilated_cardiomyopathy_1DD|Cardiovascular_phen...SNV1100
315617C>TTCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC...TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC...not_pathogenicnot_providedSNV1100
2279534C>TTTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT...TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT...not_pathogenicnot_providedSNV1100
2536550C>TGGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG...GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG...not_pathogenicnot_provided|Hereditary_cancer-predisposing_sy...SNV1100
\n", + "
" + ], + "text/plain": [ + " mutation_instruction \\\n", + "370378 A>C \n", + "47441 C>A \n", + "2370658 C>G \n", + "2479341 C>A \n", + "2340733 G>A \n", + "312980 C>T \n", + "1829920 T>G \n", + "315617 C>T \n", + "2279534 C>T \n", + "2536550 C>T \n", + "\n", + " original_window \\\n", + "370378 GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC... \n", + "47441 TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT... \n", + "2370658 ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA... \n", + "2479341 TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC... \n", + "2340733 TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG... \n", + "312980 GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG... \n", + "1829920 GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT... \n", + "315617 TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC... \n", + "2279534 TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT... \n", + "2536550 GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG... \n", + "\n", + " mutated_window pathogenicity \\\n", + "370378 GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC... not_pathogenic \n", + "47441 TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT... not_pathogenic \n", + "2370658 ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA... not_pathogenic \n", + "2479341 TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC... not_pathogenic \n", + "2340733 TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG... not_pathogenic \n", + "312980 GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG... not_pathogenic \n", + "1829920 GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT... not_pathogenic \n", + "315617 TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC... not_pathogenic \n", + "2279534 TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT... not_pathogenic \n", + "2536550 GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG... 
not_pathogenic \n", + "\n", + " disease_name variant_type \\\n", + "370378 Hereditary_cancer-predisposing_syndrome SNV \n", + "47441 not_specified SNV \n", + "2370658 DICER1-related_tumor_predisposition SNV \n", + "2479341 not_specified SNV \n", + "2340733 not_specified SNV \n", + "312980 Intellectual_developmental_disorder_with_micro... SNV \n", + "1829920 Dilated_cardiomyopathy_1DD|Cardiovascular_phen... SNV \n", + "315617 not_provided SNV \n", + "2279534 not_provided SNV \n", + "2536550 not_provided|Hereditary_cancer-predisposing_sy... SNV \n", + "\n", + " ref_len alt_len len_diff abs_len_diff \n", + "370378 1 1 0 0 \n", + "47441 1 1 0 0 \n", + "2370658 1 1 0 0 \n", + "2479341 1 1 0 0 \n", + "2340733 1 1 0 0 \n", + "312980 1 1 0 0 \n", + "1829920 1 1 0 0 \n", + "315617 1 1 0 0 \n", + "2279534 1 1 0 0 \n", + "2536550 1 1 0 0 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check which contigs in the VEP VCF are missing from the FASTA, and count the variants that fall on them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '6' is not defined in the header.
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '11' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '12' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '13' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '14' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '15' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '16' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'X' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'Y' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In VCF but not in FASTA: ['NT_113889.1', 'NT_187633.1', 'NT_187661.1', 'NT_187693.1', 'NW_009646201.1']\n", + "In both VCF and FASTA: ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '7' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '11' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '12' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '13' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '14' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '15' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '16' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'X' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Counts of variants on missing contigs:\n", + " NT_113889.1: 1\n", + " NT_187633.1: 10\n", + " NT_187661.1: 8\n", + " NT_187693.1: 10\n", + " NW_009646201.1: 1\n", + "\n", + "Total variants on contigs present in both VCF and FASTA: 3494465\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n" + ] + } + ], + "source": [ + "import pysam\n", + "from collections import Counter\n", + "\n", + "vcf_path = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf\"\n", + "fasta_path = \"SCRATCH_DIR/DNASNVData113/clinvar_data/vep-cache-113/homo_sapiens/113_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa\"\n", + "\n", + "# 1) open VCF\n", + "vcf = pysam.VariantFile(vcf_path)\n", + "\n", + "# 2) get contigs from header if present, else from records\n", + "vcf_contigs = set(vcf.header.contigs)\n", + "if not vcf_contigs:\n", + " vcf_contigs = { rec.contig for rec in vcf }\n", + " vcf = pysam.VariantFile(vcf_path) # reopen to iterate again\n", + "\n", + "# 3) open FASTA and get its contigs\n", + "fa = pysam.FastaFile(fasta_path)\n", + "fasta_contigs = set(fa.references)\n", + "\n", + "# 4) compute sets\n", + "missing = sorted(vcf_contigs - fasta_contigs)\n", + "common = sorted(vcf_contigs & fasta_contigs)\n", + "\n", + "print(\"In VCF but not in FASTA:\", missing)\n", + "print(\"In both VCF and FASTA:\", common)\n", + "\n", + "# 5) count variants by category\n", + "counts_missing = Counter()\n", + "counts_common = 0\n", + "\n", + "for rec in vcf:\n", + " chrom = rec.contig\n", + " if chrom in missing:\n", + " counts_missing[chrom] += 1\n", + " elif chrom in fasta_contigs:\n", + " counts_common += 1\n", + "\n", + "# 6) report\n", + "print(\"\\nCounts of variants on missing contigs:\")\n", + "for contig in missing:\n", + " print(f\" {contig}: {counts_missing[contig]}\")\n", + "\n", + "print(f\"\\nTotal variants on contigs present in both VCF and FASTA: {counts_common}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BioReason-main/data/Dataset Figures.ipynb b/BioReason-main/data/Dataset Figures.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c6eaff194c30a73420c93362b7c6ad1d4f507d71 --- /dev/null +++ b/BioReason-main/data/Dataset Figures.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b73f6a17-b1f5-4f8f-9493-0a0363095b09", + "metadata": {}, + "source": [ + "# Making the Dataset Figures" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f58fac67-18d9-414d-94ea-73549d5acbd7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a446e28e-acee-4159-82e6-8bcbc7e14bf6", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2d8d17687c24dd08017894671c9e692", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0%| | 0.00/2.82k [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bars = ['Splits', 'Disease Categories']\n", + "split_subcategories = ['Train', 'Test', 'Validation']\n", + "disease_subcategories = ['Neurodegenerative', 'Cancer', 'Metabolic', 'Other']\n", + "\n", + "# Create data arrays\n", + "split_counts = np.array([len(ds['train']), len(ds['test']), len(ds['val'])])\n", + "disease_counts = np.array([41.3, 28.2, 12.7, 17.8])\n", + "\n", + "# Convert to percentage\n", + "split_perc = split_counts / split_counts.sum() * 100\n", + "disease_perc = disease_counts / disease_counts.sum() * 100\n", + "\n", + "fig, ax = plt.subplots()\n", + "bar_height = 0.35\n", + "y = np.arange(len(bars)) # [0, 1]\n", + "\n", + "# Plot \"Splits\" as a horizontal stacked bar\n", + "left = 0\n", + "for i, val in 
enumerate(split_perc):\n", + " ax.barh(y[0], val, left=left, height=bar_height, label=split_subcategories[i])\n", + " left += val\n", + "\n", + "# Plot \"Disease Categories\" as a horizontal stacked bar\n", + "left = 0\n", + "for i, val in enumerate(disease_perc):\n", + " ax.barh(y[1], val, left=left, height=bar_height, label=disease_subcategories[i])\n", + " left += val\n", + "\n", + "# Labels and legend\n", + "ax.set_yticks(y)\n", + "ax.set_yticklabels(bars)\n", + "ax.set_xlabel('Percentage')\n", + "ax.set_title('Percent Stacked Bar Graph (Horizontal)')\n", + "ax.legend(loc='lower right', bbox_to_anchor=(1.05, 0))\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig(\"stacked_bar_graph.svg\", format=\"svg\") # Save as SVG\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4fc5111e-22c0-4a4f-a1cb-b8bb206fbb94", + "metadata": {}, + "source": [ + "# Task 2 and 5 Disease Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b4505210-a97b-4f94-a5cd-48d05df1bc32", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'variant_effect_coding')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "7a9f4934-0ac8-4114-b8a1-7dc20826151e", + "metadata": {}, + "outputs": [], + "source": [ + "disease = (ds['train']['answer'] + ds['test']['answer'])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "5da68fbb-9f0f-4c09-99ab-793ad2940706", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'] + ds['test']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"VEP_1_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + 
] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "0414a333-64df-40d8-9b3d-a0fff5bfcb1e", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'task5_variant_effect_non_snv')" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "880652b7-4986-4fb8-8c20-5bcb57547ca2", + "metadata": {}, + "outputs": [], + "source": [ + "answer = (ds['train']['answer'] + ds['test']['answer'])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "254bb39e-8bb7-43bc-b5f9-d82ad10b2093", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'] + ds['test']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"VEP_Non_SNV_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "89f19d13-93a0-46e2-98e2-55e25f4ee399", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'kegg_variant_2k')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b7fbb940-ac12-4d29-9d33-acdf46307008", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"KEGG_disease_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "9a22e4fa-f1d0-42c8-b31b-96259dfacb11", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "1b62117f-c833-4f54-b334-18a199a6a40d", + "metadata": {}, + "source": [ + "ChatGPT classified these answers into the 8 categories. Now I am creating the stacked bar plots" + ] + }, + { + "cell_type": "markdown", + "id": "6a634362-7681-4302-887d-771c30a2b7d0", + "metadata": {}, + "source": [ + "Keyword Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "020b6f57-7147-4a28-9f46-bf54b35c68c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Reuse the improved keyword-based classification function from earlier\n", + "\n", + "def keyword_based_classify(disease):\n", + " disease_lower = disease.lower()\n", + "\n", + " if any(keyword in disease_lower for keyword in [\n", + " \"alzheimer\", \"parkinson\", \"neuro\", \"ataxia\", \"epilepsy\", \"intellectual\", \"cerebellar\",\n", + " \"brain\", \"dystonia\", \"charcot\", \"myasthenic\", \"autism\", \"spinocerebellar\",\n", + " \"als\", \"developmental\", \"leuko\", \"hypotonia\", \"encephalopathy\", \"seizure\",\n", + " \"microcephaly\", \"prion\", \"huntington\", \"myopathy\", \"spinal\", \"sma\"]):\n", + " return \"Neurological & Neurodevelopmental\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"cancer\", \"leukemia\", \"lymphoma\", \"tumor\", \"carcinoma\", \"adenocarcinoma\",\n", + " \"sarcoma\", \"neoplasm\", \"pheochromocytoma\", \"adenoma\", \"glioblastoma\", \"melanoma\"]):\n", + " return \"Oncological / Cancer\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"metabolic\", \"glycogen\", \"storage\", \"diabetes\", \"hypercholesterolemia\",\n", + " \"hypophosphatasia\", \"acyl\", \"cobalamin\", \"lipodystrophy\", \"maple syrup\",\n", + " \"homocystinuria\", \"porphyria\", \"gaucher\", \"phgdh\", \"thyroid\", \"adrenal\",\n", + " \"lipoprotein\", \"hypomagnesemia\", \"coenzyme\",
\"desmosterolosis\", \"biogenesis\",\n", + " \"hemochromatosis\", \"mitochondrial\", \"pyruvate\", \"oxidative\", \"ketosis\",\n", + " \"aldosteronism\", \"cushing\", \"lesch\", \"dyshormonogenesis\"]):\n", + " return \"Metabolic / Mitochondrial\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"cardio\", \"heart\", \"aortic\", \"arrhythmia\", \"ventricular\", \"artery\", \"hypertension\",\n", + " \"thrombocythemia\", \"fibrillation\", \"cardiomyopathy\", \"vascular\", \"anemia\",\n", + " \"thrombocytopenia\", \"myelofibrosis\", \"blood\", \"hypotension\", \"fanconi\"]):\n", + " return \"Cardiovascular & Hematological\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"immunodeficiency\", \"scid\", \"autoimmune\", \"inflammation\", \"inflammatory\",\n", + " \"neutropenia\", \"immune\", \"lymphoproliferation\", \"cytokine\", \"common_variable\",\n", + " \"deficiency\", \"immunologic\"]):\n", + " return \"Immunological & Hematopoietic\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"ehlers\", \"dysplasia\", \"dystrophy\", \"muscular\", \"osteogenesis\", \"fibrochondrogenesis\",\n", + " \"connective\", \"skeletal\", \"bone\", \"myopathy\", \"chondrodysplasia\", \"hypochondroplasia\",\n", + " \"marfan\"]):\n", + " return \"Musculoskeletal & Connective Tissue\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"deafness\", \"hearing\", \"retinitis\", \"macular\", \"amaurosis\", \"dystrophy\",\n", + " \"cone-rod\", \"stargardt\", \"vision\", \"optic\", \"blindness\", \"retina\", \"eye\",\n", + " \"corneal\", \"cataract\"]):\n", + " return \"Sensory Disorders\"\n", + "\n", + " elif disease_lower == \"benign\":\n", + " return \"Benign\"\n", + "\n", + " else:\n", + " return \"Other / Multisystem / Syndromic\"\n", + "\n", + "# Reclassify the diseases in the dataframe\n", + "disease_df[\"Keyword_Category\"] = disease_df[\"Disease\"].apply(keyword_based_classify)\n", 
+ "\n", + "# Save to file\n", + "keyword_classified_path = \"/mnt/data/VEP_Non_SNV_labels_keyword_classified.tsv\"\n", + "disease_df.to_csv(keyword_classified_path, sep=\"\\t\", index=False)\n", + "\n", + "tools.display_dataframe_to_user(name=\"Keyword Classified VEP Non-SNV\", dataframe=disease_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787f83ea-3a0a-4066-b1b8-2f98eeaf38cd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6faccd-40f3-45c5-8400-7d9b77ea3a96", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "d3ea9b94-c604-46e1-8b6f-cb2e867f23f3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA8YAAAHqCAYAAADGYFelAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAx25JREFUeJzs3XlcTfn/B/DXbd9XWqS0K4oQRpZKTJTEyL6UGsbXTmSXPftOjKVikL2xG0vWLIUsI1tT1gxCFNFyfn94dH6udvu4r+fjcR8z95zP+Xze59xzj9738zmfIxEEQQARERERERGRjJL71gEQERERERERfUtMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIxJpkRGRkIikYgvBQUFVK5cGT179sT9+/e/dXif7OrVq5gwYQJSU1PLvM2ZM2fQtm1bmJmZQVlZGYaGhmjQoAGCg4Olyi1duhSRkZGfN+AimJubo1WrVl+0jQkTJkAikZRaLiAgQOp8kZeXR+XKldGhQwdcuXLli8b4PnNzc6k4NDQ0UL9+faxZs0aqnJubG9zc3D6qjWnTpiEmJubTg/3A06dP0alTJxgYGEAikaBNmzbFlnVzcxP3UU5ODpqamrC2tkb79u2xZcsW5OfnF9rG3NwcAQEBnz3u79XOnTvh4+MDQ0NDKCkpQU9PDx4eHli3bh1ycnLKXd/X+l5/LgXX8PJc4z6nhQsXQiKRwMHB4Zu0/zV8zPewrNavX4/58+d/vmA/0X/t/CeiL0vhWwdA9C1ERETAzs4Or1+/xrFjxxAWFoajR4/i8uXLUFdX/9bhfbSrV69i4sSJcHNzg7m5eanld+/ejdatW8PNzQ0zZ86EsbEx0tLSkJCQgOjoaMyZM0csu3TpUlSoUEGmkhAAUFVVxeHDhwEAubm5uHXrFqZMmQIXFxckJSXBxMTkq8TRsGFDzJ49GwBw7949zJ49G/7+/sjKysL//ve/T65/2rRp8PPzKzFx/RiTJ0/G9u3bsXr1alhZWUFPT6/E8paWlli3bh0AICsrCykpKYiJiUH79u3RuHFj7Ny5E9ra2mL57du3Q0tL67PG/D0SBAGBgYGIjIyEl5cX5s6dC1NTU2RkZCA2NhZ9+/bFkydPMG
jQoHLV+1/7Xnt7e+PUqVMwNjb+Ju2vXr0aAPD333/jzJkzqF+//jeJ40sr7/ewrNavX48rV65g8ODBnznij/NfO/+J6MtiYkwyycHBAc7OzgAAd3d35OXlYfLkyYiJiUHXrl0/qe5Xr15BTU3tc4T5xc2cORMWFhbYv38/FBT+/3LQqVMnzJw58xtG9v2Qk5PDTz/9JL5v1KgRzMzM4OHhgd27d6N3796f3EZeXh5yc3OhrKxcbBkdHR2pOJo1a4YqVapg7ty5nyUx/lKuXLkCKyurMn+vVFVVpfYTAH799VdEREQgMDAQvXv3xsaNG8V1tWrV+qzxfq9mzZqFyMhITJw4EePHj5da5+Pjg5CQENy6desbRfflvX79GioqKqhYsSIqVqz4TWJISEjAxYsX4e3tjd27d2PVqlWfLTHOyckRRzF9D8r7PSQi+hFwKDURIP4BcPv2bQDvemeWLl0KJycnqKqqQldXF35+fvjnn3+ktnNzc4ODgwOOHTsGFxcXqKmpITAwEADw/PlzBAcHw9LSEsrKyjAwMICXlxeuXbsmbv/27VtMmTIFdnZ2UFZWRsWKFdGzZ088fvxYqp2C4cX79u1D7dq1oaqqCjs7O7H3Ang3xLB9+/YA3iX7BUPhShomlp6ejgoVKhT5x5ic3P9fHszNzfH333/j6NGjYr0FPdLZ2dkIDg6Gk5MTtLW1oaenhwYNGuDPP/8sVGd+fj4WLVokHteCZG/Hjh3Fxgi8+1VfQUEBoaGh4rKDBw/Cw8MDWlpaUFNTQ8OGDXHo0KFC2+7evRtOTk5QVlaGhYWF2Ov6KQp6ShQVFcVljx8/Rt++fVGtWjVoaGjAwMAATZs2xfHjx6W2TU1NhUQiwcyZMzFlyhRYWFhAWVkZsbGx5YpBR0cHVatWFc/Z4jx9+hR9+/aFiYkJlJSUYGlpiTFjxuDNmzdiGYlEgqysLERFRYmfb2lDskurt2A/Dx48iKSkJLHeI0eOlGs/C/Ts2RNeXl7YvHmz1D5/OJQ6Pz8fU6ZMQdWqVcVzrEaNGliwYIFUfTdv3kSXLl1gYGAAZWVl2NvbY8mSJVJlynNub968GfXr14e2tjbU1NRgaWkpXgsKvHjxAsOGDYOFhQWUlJRgYmKCwYMHIysrq8R9z8nJwYwZM2BnZ4dx48YVWcbIyAiNGjUS30+cOBH169eHnp4etLS0ULt2baxatQqCIEgdu+K+1+WJ9/nz5wgKCoKenh40NDTg7e2Nf/75BxKJBBMmTJAqe+LECXh4eEBTUxNqampwcXHB7t27pcoUDJf+66+/EBgYiIoVK0JNTQ1v3rwpdih1Wa4Hjx8/Ru/evWFqaipebxs2bIiDBw+WePwLrFq1CgAwffp0uLi4IDo6Gq9evSpU7v79+2I7SkpKqFSpEvz8/PDvv/8CAI4cOQKJRIK1a9ciODgYJiYmUFZWFn/YWL16NWrWrAkVFRXo6emhbdu2SEpKkmrjn3/+QadOnVCpUiXxFhgPDw8kJiaKZQ4fPgw3Nzfo6+tDVVUVZmZmaNeuXZExl1Vx38MlS5agSZMmMDAwgLq6OhwdHTFz5kyp4f1ubm7YvXs3bt++LXVrSIGynLNl3a+y/Nta0vlf1usIEf1Yvo+fJom+sYI/SAp6In777TdERkZi4MCBmDFjBp4+fYpJkybBxcUFFy9ehKGhobhtWloaunXrhpCQEEybNg1ycnJ4+fIlGjVqhNTUVIwYMQL169dHZmYmjh07hrS0NNjZ2SE/Px++vr44fvw4QkJC4OLigtu3byM0NBRubm5ISEiAqqqq2M7FixcRHByMkSNHwtDQECtXrkRQUBCsra3RpEkTeHt7Y9q0aRg9ejSWLFmC2rVrAwCsrKyK3e8GDRpg5cqVGDhwILp27YratWtLJXsFtm/fDj8/P2hra2Pp0qUAIPZuvnnzBk+fPsWwYcNgYmKCt2/f4uDBg/jll18QERGBHj16iPUEBATgjz/+QFBQECZNmg
QlJSWcP3++2PsFBUHA8OHDsXDhQqxcuVJMgP744w/06NEDvr6+iIqKgqKiIpYvXw5PT0/s378fHh4eAIBDhw7B19cXDRo0QHR0NPLy8jBz5kzxD9Syys3NFf9769YtDB8+HLq6uvD29hbLPH36FAAQGhoKIyMjZGZmYvv27XBzc8OhQ4cKJZoLFy6Era0tZs+eDS0tLdjY2JQrppycHNy+fbvE3rPs7Gy4u7sjOTkZEydORI0aNXD8+HGEhYUhMTFRTEhOnTqFpk2bwt3dXUy8ShqeXJZ6jY2NcerUKfTt2xcZGRnisMxq1aqVaz/f17p1a+zZswfHjx9HlSpViiwzc+ZMTJgwAWPHjkWTJk2Qk5ODa9eu4fnz52KZq1evwsXFBWZmZpgzZw6MjIywf/9+DBw4EE+ePBF/gCnruX3q1Cl07NgRHTt2xIQJE6CiooLbt2+LQ/CBdyNJXF1dce/ePYwePRo1atTA33//jfHjx+Py5cs4ePBgsfe9JyQk4OnTp+jVq1eZ7o0H3v0w8dtvv8HMzAwAcPr0aQwYMAD3798Xe5xL+l6XNd78/Hz4+PggISEBEyZMQO3atXHq1Cm0aNGiUExHjx5F8+bNUaNGDaxatQrKyspYunQpfHx8sGHDBnTs2FGqfGBgILy9vbF27VpkZWUVeW0Cyn496N69O86fP4+pU6fC1tYWz58/x/nz55Genl7q8Xz9+jU2bNiAunXrwsHBAYGBgfj111+xefNm+Pv7i+Xu37+PunXrIicnRzxu6enp2L9/P549eyb1b8eoUaPQoEEDLFu2DHJycjAwMEBYWBhGjx6Nzp07IywsDOnp6ZgwYQIaNGiA+Ph48Trh5eUlXs/MzMzw5MkTxMXFied5amoqvL290bhxY6xevRo6Ojq4f/8+9u3bh7dv337SqKaivofJycno0qWL+CPKxYsXMXXqVFy7dk38AXfp0qXo3bs3kpOTsX379kL1luWcLct+lfXf1pLO/7JcR4joByQQyZCIiAgBgHD69GkhJydHePnypbBr1y6hYsWKgqampvDw4UPh1KlTAgBhzpw5UtvevXtXUFVVFUJCQsRlrq6uAgDh0KFDUmUnTZokABAOHDhQbCwbNmwQAAhbt26VWh4fHy8AEJYuXSouq1KliqCioiLcvn1bXPb69WtBT09P+O2338RlmzdvFgAIsbGxZToeT548ERo1aiQAEAAIioqKgouLixAWFia8fPlSqmz16tUFV1fXUuvMzc0VcnJyhKCgIKFWrVri8mPHjgkAhDFjxpS4fZUqVQRvb2/h1atXQrt27QRtbW3h4MGD4vqsrCxBT09P8PHxkdouLy9PqFmzplCvXj1xWf369YVKlSoJr1+/Fpe9ePFC0NPTE8py+fP39xePzfsvY2Nj4cSJE2U6Dh4eHkLbtm3F5SkpKQIAwcrKSnj79m2pMQjCu2Pi5eUl5OTkCDk5OUJKSooY2/Dhw8Vyrq6uUp/RsmXLBADCpk2bpOqbMWOGAED466+/xGXq6uqCv79/meIpT72urq5C9erVy1RvaWX37t0rABBmzJghLqtSpYpU3K1atRKcnJxKbMfT01OoXLmykJGRIbW8f//+goqKivD06dMityvu3J49e7YAQHj+/HmxbYaFhQlycnJCfHy81PItW7YIAIQ9e/YUu210dLQAQFi2bFmJ+1WcvLw8IScnR5g0aZKgr68v5Ofni+uK+16XNd7du3cLAITw8PBC2wMQQkNDxWU//fSTYGBgIHVtyc3NFRwcHITKlSuLcRVcp3v06FEoroJ1KSkpgiCU73qgoaEhDB48uIQjVbw1a9ZIfQYvX74UNDQ0hMaNG0uVCwwMFBQVFYWrV68WW1dsbKwAQGjSpInU8mfPngmqqqqCl5eX1PI7d+4IysrKQpcuXQRBeHfdBiDMnz+/2DYKPqfExMRy7acgfNz38H0F59uaNWsEeXl5qe+Tt7e3UKVKlVJjKO6cLct+leff1uLO/7
JcR4jox8Oh1CSTfvrpJygqKkJTUxOtWrWCkZER9u7dC0NDQ+zatQsSiQTdunVDbm6u+DIyMkLNmjULDQXV1dVF06ZNpZbt3bsXtra2aNasWbEx7Nq1Czo6OvDx8ZFqx8nJCUZGRoXacXJyEn9JBwAVFRXY2tqWOpS2JPr6+jh+/Dji4+Mxffp0+Pr64saNGxg1ahQcHR3x5MmTMtWzefNmNGzYEBoaGlBQUICioiJWrVolNfxv7969AIB+/fqVWl96ejqaNm2Ks2fPikMvC8TFxeHp06fw9/eXOm75+flo0aIF4uPjkZWVhaysLMTHx+OXX36BioqKuL2mpiZ8fHzKeoigqqqK+Ph4xMfH48yZM9i2bRtsbW3h5eWFU6dOSZVdtmwZateuDRUVFfE4HDp0qNAwSOBdr0txPWBF2bNnDxQVFaGoqAgLCwts2rQJAwYMwJQpU4rd5vDhw1BXV4efn5/U8oKe96KGnpfFl6q3NMIHQyqLUq9ePVy8eBF9+/bF/v378eLFC6n12dnZOHToENq2bQs1NTWpc8jLywvZ2dk4ffq0WL4s53bdunUBAB06dMCmTZuKnOF+165dcHBwgJOTk1Sbnp6enzTEvDiHDx9Gs2bNoK2tDXl5eSgqKmL8+PFIT0/Ho0ePSt2+rPEePXoUwLt9f1/nzp2l3mdlZeHMmTPw8/ODhoaGuFxeXh7du3fHvXv3cP36dalt2rVrV2qcZb0eAO/OjcjISEyZMgWnT58u1yzeq1atgqqqKjp16gQA0NDQQPv27XH8+HHcvHlTLLd37164u7vD3t6+1Do/3L9Tp07h9evXhSaCMjU1RdOmTcXvlZ6eHqysrDBr1izMnTsXFy5cKDRTtJOTE5SUlNC7d29ERUUVug3oUxT1Pbxw4QJat24NfX198Xzr0aMH8vLycOPGjTLVW5Zztiz7Vd5/W4tS2nWEiH5MTIxJJq1Zswbx8fG4cOECHjx4gEuXLqFhw4YAgH///ReCIMDQ0FBMRApep0+fLpQsFjU76uPHj1G5cuUSY/j333/x/PlzKCkpFWrn4cOHhdrR19cvVIeysjJev35d3t0vxNnZGSNGjMDmzZvx4MEDDBkyBKmpqWWagGvbtm3o0KEDTExM8Mcff+DUqVOIj49HYGAgsrOzxXKPHz+GvLw8jIyMSq3zxo0bOHPmDFq2bFnosSgFw6D9/PwKHbcZM2ZAEAQ8ffoUz549Q35+fpHtlSWGAnJycnB2doazszPq1auHtm3bYs+ePVBQUMDQoUPFcgWTYNWvXx9bt27F6dOnER8fjxYtWhT5GZV3Vt1GjRohPj4eCQkJuHr1Kp4/f46FCxdCSUmp2G3S09NhZGRUaPitgYEBFBQUyjSE9GvWW5qCH4EqVapUbJlRo0Zh9uzZOH36NFq2bAl9fX14eHggISFBjD03NxeLFi0qdP54eXkBgPjdK+u53aRJE8TExCA3Nxc9evRA5cqV4eDggA0bNohl/v33X1y6dKlQm5qamhAEocQfoQp+EEtJSSnTcTp79ix+/vlnAMCKFStw8uRJxMfHY8yYMQBQpmtGWeNNT0+HgoJCodnG3x8yDADPnj2DIAhFnvcFn+eH501ZviNlvR4AwMaNG+Hv74+VK1eiQYMG0NPTQ48ePfDw4cMS27h16xaOHTsGb29vCIKA58+f4/nz5+IPQ+/P9VCWa39x+1ew/8Udo4L1EokEhw4dgqenJ2bOnInatWujYsWKGDhwIF6+fAng3S00Bw8ehIGBAfr16wcrKytYWVl9lntkP/we3rlzB40bN8b9+/exYMEC8cfWgnv2y3K+lfWcLct+lfff1qKUdh0hoh8T7zEmmWRvby/OSv2hChUqQCKR4Pjx40XOEvzhsqLu+atYsSLu3btXYgwVKlSAvr4+9u3bV+R6TU3NErf/UhQVFREaGop58+aV6Vm9f/zxBywsLLBx40apY/H+5E7Au2OSl5eHhw8flvoHb4MGDdC+fXsEBQ
UBAMLDw8XJwCpUqAAAWLRoUaFZUwsYGhqKs7wW9UdvaX8Il0ZNTQ1WVla4ePGiuOyPP/6Am5sbwsPDpcoW/KH6obLeK1pAW1u72HO2OPr6+jhz5gwEQZBq79GjR8jNzRWPZXl9qXpLs2PHDkgkEjRp0qTYMgU/WAwdOhTPnz/HwYMHMXr0aHh6euLu3bvQ1dUVeymLG71gYWEBoOznNgD4+vrC19cXb968wenTpxEWFoYuXbrA3NwcDRo0QIUKFaCqqiqVRL2vpGPm7OwMPT09/PnnnwgLCyv13ImOjoaioiJ27dolNVqiPM+pLmu8+vr6yM3NxdOnT6WS4w+/Y7q6upCTk0NaWlqhuh48eCBVZ4GyfEfKej0oKDt//nzMnz8fd+7cwY4dOzBy5Eg8evSo2Osw8C7xFQQBW7ZswZYtWwqtj4qKwpQpUyAvL1+ma3+BD/ev4MfP4o7R+8enSpUq4mRgN27cwKZNmzBhwgS8ffsWy5YtAwA0btwYjRs3Rl5eHhISErBo0SIMHjwYhoaGYs/3x/jwexgTE4OsrCxs27ZN6t7/9ycCK015ztnS9utz/Nta2nXkv/LkCSIqH/YYE32gVatWEAQB9+/fF3sK3385OjqWWkfLli1x48YNqcl3imonPT0deXl5RbZTtWrVcsdekLSXtRe5qD/AAIjDRN/vmSuud1oikUBJSUnqj7yHDx8Wmrm3ZcuWAFAocSyOv78/oqOjxUmO8vLyALx7nq+Ojg6uXr1a5HFzdnaGkpIS1NXVUa9ePWzbtk2qd+/ly5fYuXNnmWIoTmZmJm7dugUDAwNxmUQiKfSjyaVLlwoNt/6aPDw8kJmZWeiPyzVr1ojrC5Rn9EF56v1cIiIisHfvXnTu3FnqloKS6OjowM/PD/369cPTp0+RmpoKNTU1uLu748KFC6hRo0aR509BglLWc/t9ysrKcHV1xYwZMwC8G2IKvPu+JycnQ19fv8g2S3ruuKKiIkaMGIFr165h8uTJRZZ59OgRTp48KcatoKAAeXl5cf3r16+xdu3aIuMt6nMva7yurq4AUOjRPdHR0VLv1dXVUb9+fWzbtk2qvfz8fPzxxx+oXLkybG1tiz0GxSnr9eBDZmZm6N+/P5o3b47z588XW39eXh6ioqJgZWWF2NjYQq/g4GCkpaWJt4q0bNkSsbGxhYaFl0WDBg2gqqqKP/74Q2r5vXv3cPjw4WK/V7a2thg7diwcHR2L3Bd5eXnUr19f7MEtaX9LU9T3sOD78f71TxAErFixotD2Jf07UtZztkBx+1Wef1vLct0r6jpCRD8m9hgTfaBhw4bo3bs3evbsiYSEBDRp0gTq6upIS0vDiRMn4OjoWOpzYwcPHoyNGzfC19cXI0eORL169fD69WscPXoUrVq1gru7Ozp16oR169bBy8sLgwYNQr169aCoqIh79+4hNjYWvr6+aNu2bbliLxh2/Pvvv0NTUxMqKiqwsLAochg2AHh6eqJy5crw8fERZ8pOTEzEnDlzoKGhgUGDBollHR0dER0djY0bN8LS0hIqKipwdHREq1atsG3bNvTt2xd+fn64e/cuJk+eDGNjY6l77xo3bozu3btjypQp+Pfff9GqVSsoKyvjwoULUFNTw4ABAwrF5+fnBzU1Nfj5+YmzwmpoaGDRokXw9/fH06dP4efnBwMDAzx+/BgXL17E48ePxeR78uTJaNGiBZo3b47g4GDk5eVhxowZUFdXF4dXliY/P1+85zQ/Px/379/HwoUL8ezZM6lH0bRq1QqTJ09GaGgoXF1dcf36dUyaNAkWFhbirNZfW48ePbBkyRL4+/sjNTUVjo6OOHHiBKZNmwYvLy+pe+AdHR1x5MgR7Ny5E8bGxtDU1Cz2x5ny1Fter1+/Fo/369ev8c8//yAmJga7du2Cq6ur2BtWHB8fH/E55RUrVsTt27cxf/58VKlSRZzRd8GCBWjUqBEaN26M//3vfzA3N8fLly9x69Yt7Ny5U/xBq6
zn9vjx43Hv3j14eHigcuXKeP78ORYsWABFRUUxcRw8eDC2bt2KJk2aYMiQIahRowby8/Nx584d/PXXXwgODi7xmbjDhw9HUlISQkNDcfbsWXTp0gWmpqbIyMjAsWPH8Pvvv2PixIlo2LAhvL29MXfuXHTp0gW9e/dGeno6Zs+eXeQImOK+12WNt0WLFmjYsCGCg4Px4sUL1KlTB6dOnRJ/JHn/sW9hYWFo3rw53N3dMWzYMCgpKWHp0qW4cuUKNmzYUO5RFADKfD3IyMiAu7s7unTpAjs7O2hqaiI+Ph779u3DL7/8Umz9e/fuxYMHDzBjxowiH2Hm4OCAxYsXY9WqVWjVqhUmTZqEvXv3okmTJhg9ejQcHR3x/Plz7Nu3D0OHDoWdnV2xbeno6GDcuHEYPXo0evTogc6dOyM9PR0TJ06EioqKOFv6pUuX0L9/f7Rv3x42NjZQUlLC4cOHcenSJYwcORLAu/kODh8+DG9vb5iZmSE7O1vs/S/L97M838PmzZtDSUkJnTt3RkhICLKzsxEeHo5nz54VqtfR0RHbtm1DeHg46tSpI96qUtZztiz7VZ5/W4s7/8tyHSGiH9C3mPGL6FspmNH0w5lWi7J69Wqhfv36grq6uqCqqipYWVkJPXr0EBISEsQyJc3e+ezZM2HQoEGCmZmZoKioKBgYGAje3t7CtWvXxDI5OTnC7NmzhZo1awoqKiqChoaGYGdnJ/z222/CzZs3xXIFMzV/6MNZiAVBEObPny9YWFgI8vLyAgAhIiKi2H3cuHGj0KVLF8HGxkbQ0NAQFBUVBTMzM6F79+6FZlVNTU0Vfv75Z0FTU1MAIDWz6PTp0wVzc3NBWVlZsLe3F1asWCGEhoYWmvk5Ly9PmDdvnuDg4CAoKSkJ2traQoMGDYSdO3eWuK+xsbGChoaG0KJFC+HVq1eCIAjC0aNHBW9vb0FPT09QVFQUTExMBG9vb2Hz5s1S2+7YsUOoUaOGoKSkJJiZmQnTp08vMraiFDUrtYGBgeDq6ips375dquybN2+EYcOGCSYmJoKKiopQu3ZtISYmRvD395c6VgWzUs+aNavU9ks6JkUp6nxIT08X+vTpIxgbGwsKCgpClSpVhFGjRgnZ2dlS5RITE4WGDRsKampqAoBSZyAva73lnZX6/WOtrq4uWFpaCn5+fsLmzZuFvLy8Qtt8OCv1nDlzBBcXF6FChQriZx4UFCSkpqZKbZeSkiIEBgYKJiYmgqKiolCxYkXBxcVFmDJlilS5spzbu3btElq2bCmYmJgISkpKgoGBgeDl5SUcP35cqq7MzExh7NixQtWqVcXz39HRURgyZIjw8OHDMh2jP//8U/D29hYqVqwoKCgoCLq6uoK7u7uwbNky4c2bN2K51atXC1WrVhWUlZUFS0tLISwsTFi1apXUjM6CUPL3uqzxPn36VOjZs6ego6MjqKmpCc2bNxdOnz4tABAWLFggFf/x48eFpk2bitfVn376Ser7LwglX6c/nJW6QGnXg+zsbKFPnz5CjRo1BC0tLUFVVVWoWrWqEBoaKmRlZRV7vNu0aSMoKSkJjx49KrZMp06dBAUFBfGY3L17VwgMDBSMjIwERUVFoVKlSkKHDh2Ef//9VxCE/5+V+sNrVYGVK1eK1yxtbW3B19dX+Pvvv8X1//77rxAQECDY2dkJ6urqgoaGhlCjRg1h3rx5Qm5uriAIgnDq1Cmhbdu2QpUqVQRlZWVBX19fcHV1FXbs2FHsfhT4mO/hzp07xX/HTExMhOHDh4uzV7//lISnT58Kfn5+go6OjiCRSKS+R2U5Z8u6X2X9t7W487+s1xEi+rFIBKEM03wSERERldH69evRtWtXnDx5Ei4uLt86HCIiolIxMSYiIqKPtmHDBty/fx+Ojo6Qk5PD6dOnMWvWLNSqVUt8nBMREdH3jvcYExER0UfT1NREdHQ0pkyZgqysLBgbGyMgIKDEZ2wTERF9b9hjTERERERERDKNj2siIi
IiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaZx86z8sPz8fDx48gKamJiQSybcOh4iIiIgIgiDg5cuXqFSpEuTk2A9H/w1MjP/DHjx4AFNT028dBhERERFRIXfv3kXlypW/dRhEZcLE+D9MU1MTwLuLjpaW1jeOhoiIiIgIePHiBUxNTcW/VYn+C5gY/4cVDJ/W0tJiYkxERERE3xXe6kf/JRz0T0RERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMU/jWAdCncwjdDzlltW8dxjeTqtLlm7XtaGH2UdttCsv9zJEU77Dbkq/WVllkP5v7rUP4LnW0GPGtQyD6Ya1UOfStQyD6LjVusvaL1JuVlf9F6iX6kthjTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk08qcGPv4+KBZs2ZFrjt16hQkEgnOnz+P1NRUSCSSIl+nT58GAERGRkotNzY2RocOHZCSklJs+xMmTIBEIkGfPn2klicmJkIikSA1NbWsu/LRYmNj4e7uDj09PaipqcHGxgb+/v7Izc0FABw5cgQSiQQODg7Iy8uT2lZHRweRkZF4+/YtKlSogClTphTZRlhYGCpUqIC3b99+8f0hIiIiIiKiciTGQUFBOHz4MG7fvl1o3erVq+Hk5ITatWuLyw4ePIi0tDSpV506dcT1WlpaSEtLw4MHD7B+/XokJiaidevWhRLK96moqGDVqlW4ceNGWcP+bP7++2+0bNkSdevWxbFjx3D58mUsWrQIioqKyM/PlyqbnJyMNWvWFFmPkpISunXrhsjISAiCUGh9REQEunfvDiUlpS+yH0RERERERCStzIlxq1atYGBggMjISKnlr169wsaNGxEUFCS1XF9fH0ZGRlIvRUVFcb1EIoGRkRGMjY3h7u6O0NBQXLlyBbdu3So2hqpVq8Ld3R1jx44tMdajR4+iXr16UFZWhrGxMUaOHCn26gKAm5sbBg4ciJCQEOjp6cHIyAgTJkwosc4DBw7A2NgYM2fOhIODA6ysrNCiRQusXLmyUBI7YMAAhIaGIjs7u8i6goKCkJycjGPHjkktP378OG7evFnoWBIREREREdGXU+bEWEFBAT169CjU07l582a8ffsWXbt2/aRAVFVVAQA5OTkllps+fTq2bt2K+Pj4Itffv38fXl5eqFu3Li5evIjw8HCsWrWq0NDlqKgoqKur48yZM5g5cyYmTZqEAwcOFNuukZER0tLSCiWzRRk8eDByc3OxePHiItc7Ojqibt26iIiIkFq+evVq1KtXDw4ODqW2QURERERERJ9HuSbfCgwMRGpqKo4cOSIuW716NX755Rfo6upKlXVxcYGGhobUq7hh0vfu3cOsWbNQuXJl2NralhhD7dq10aFDB4wcObLI9UuXLoWpqSkWL14MOzs7tGnTBhMnTsScOXOkhjzXqF
EDoaGhsLGxQY8ePeDs7IxDhw4V22779u3RuXNnuLq6wtjYGG3btsXixYvx4sWLQmXV1NQQGhqKsLAwZGRkFFlfYGAgtmzZgszMTABAZmYmNm/eXGJv8Zs3b/DixQupFxEREREREX2aciXGdnZ2cHFxwerVqwG8u5f2+PHjCAwMLFR248aNSExMlHrJy8uL6zMyMqChoQF1dXWYmpri7du32LZtW5nurZ0yZQqOHz+Ov/76q9C6pKQkNGjQABKJRFzWsGFDZGZm4t69e+KyGjVqSG1nbGyMR48eAQD69OkjldADgLy8PCIiInDv3j3MnDkTlSpVwtSpU1G9enWkpaUViiMoKAgVKlTAjBkzityHzp07Iz8/Hxs3bhSPlyAI6NSpU7H7HRYWBm1tbfFlampabFkiIiIiIiIqm3I/rikoKAhbt27FixcvEBERgSpVqsDDw6NQOVNTU1hbW0u93qepqYnExERcvnwZmZmZOHfuHOrWrVumGKysrNCrVy+MHDmy0ARWgiBIJcUFywBILX//fueCdQU9ypMmTZJK6N9nYmKC7t27Y8mSJbh69Sqys7OxbNmyQjEqKChgypQpWLBgAR48eFBovba2Nvz8/MTh1BEREfDz84OWllax+z1q1ChkZGSIr7t37xZbloiIiIiIiMqm3Ilxhw4dIC8vj/Xr1yMqKgo9e/YslIiWqWE5OVhbW8PS0hLq6url3n78+PG4ceMGoqOjpZZXq1YNcXFxUglzXFwcNDU1YWJiUqa6DQwMik3o36erqwtjY2NkZWUVub59+/aoXr06Jk6cWOT6oKAgnDx5Ert27cLJkydLnXRLWVkZWlpaUi8iIiIiIiL6NArl3UBDQwMdO3bE6NGjkZGRgYCAgCLLpaen4+HDh1LLdHR0oKKi8lGBfsjQ0BBDhw7FrFmzpJb37dsX8+fPx4ABA9C/f39cv34doaGhGDp0KOTkyv07gGj58uVITExE27ZtYWVlhezsbKxZswZ///03Fi1aVOx206dPh6enZ5HrXF1dYW1tjR49esDa2hpNmjT56PiIiIiIiIjo43xUphgUFIRnz56hWbNmMDMzK7JMs2bNYGxsLPWKiYn5lFgLGT58uHgPcAETExPs2bMHZ8+eRc2aNdGnTx8EBQWV+oin0tSrVw+ZmZno06cPqlevDldXV5w+fRoxMTFwdXUtdrumTZuiadOmUo+Lel9gYCCePXtW5H3aRERERERE9OVJhA9v0qX/jBcvXrybhGvwJsgpq33rcL6ZVJUu36xtR4uifxgqzaawon8o+RIOuy35am2VRfazud86hO9SR4sR3zoEoh/WSpXinzpBJMsaN1n7RerNysqHb+tUZGRk8NY/+s/4+LHFRERERERERD8AJsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTJMIgiB86yDo47x48QLa2trIyMiAlpbWtw6HiIiIiIh/o9J/EnuMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIi
IiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYpfOsA6NM5hO6HnLLatw6DiMogVaXLZ6/T0cLss9f5oU1huV+8DQA47Lbkq7TzMbKfzf3WIfwndbQY8cl1rFQ59BkiIaKyaNxk7SfXkZWV/xkiIfq62GNMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxh8hMjISOjo64vsJEybAycnpm8VDREREREREH++LJcY+Pj5o1qxZketOnToFiUSC8+fPIzU1FRKJpMjX6dOnAbxLRN9fbmxsjA4dOiAlJaXEGF68eIExY8bAzs4OKioqMDIyQrNmzbBt2zYIgvDZ9nXYsGE4dOjQZ6uPiIiIiIiIvh6FL1VxUFAQfvnlF9y+fRtVqlSRWrd69Wo4OTmhdu3aSE1NBQAcPHgQ1atXlyqnr68v/r+WlhauX78OQRBw7do1/Pbbb2jdujUSExMhLy9fqP3nz5+jUaNGyMjIwJQpU1C3bl0oKCjg6NGjCAkJQdOmTaV6fT+FhoYGNDQ0PktdRERERERE9HV9sR7jVq1awcDAAJGRkVLLX716hY0bNyIoKEhqub6+PoyMjKReioqK4nqJRAIjIyMYGxvD3d0doaGhuHLlCm7dulVk+6NHj0ZqairOnDkDf39/VKtWDba2tujVqxcSExPFRPbZs2fo0aMHdHV1oaamhpYtW+LmzZtSdUVGRsLMzAxqampo27Yt0tPTpdZ/OJQ6ICAAbdq0wezZs2FsbAx9fX3069cPOTk5Ypm0tDR4e3tDVVUVFhYWWL9+PczNzTF//vyyHmIiIiIiIiL6DL5YYq
ygoIAePXogMjJSatjy5s2b8fbtW3Tt2vWT6ldVVQUAqWSzQH5+PqKjo9G1a1dUqlSp0HoNDQ0oKLzrLA8ICEBCQgJ27NiBU6dOQRAEeHl5ifWeOXMGgYGB6Nu3LxITE+Hu7o4pU6aUGl9sbCySk5MRGxuLqKgoREZGSv1I0KNHDzx48ABHjhzB1q1b8fvvv+PRo0cfcyiIiIiIiIjoE3yxodQAEBgYiFmzZuHIkSNwd3cH8G4Y9S+//AJdXV2psi4uLpCTk87TMzIyihwmfe/ePcyaNQuVK1eGra1tofVPnjzBs2fPYGdnV2J8N2/exI4dO3Dy5Em4uLgAANatWwdTU1PExMSgffv2WLBgATw9PTFy5EgAgK2tLeLi4rBv374S69bV1cXixYshLy8POzs7eHt749ChQ+jVqxeuXbuGgwcPIj4+Hs7OzgCAlStXwsbGpsQ637x5gzdv3ojvX7x4UWJ5IiIiIiIiKt0XnZXazs4OLi4uWL16NQAgOTkZx48fR2BgYKGyGzduRGJiotTr/aQ4IyMDGhoaUFdXh6mpKd6+fYtt27ZBSUmpUF0FPdQSiaTE+JKSkqCgoID69euLy/T19VG1alUkJSWJZRo0aCC13Yfvi1K9enWp+I2NjcUe4evXr0NBQQG1a9cW11tbWxf6seBDYWFh0NbWFl+mpqalxkFEREREREQl+6I9xsC7Sbj69++PJUuWICIiAlWqVIGHh0ehcqamprC2ti62Hk1NTZw/fx5ycnIwNDSEurp6sWUrVqwIXV1dMbktTnEzUwuCICbVHzt79fv3RwPvkvT8/PxS2y3JqFGjMHToUPH9ixcvmBwTERERERF9oi+eGHfo0AGDBg3C+vXrERUVhV69epXak1sUOTm5EhPnD8t27NgRa9euRWhoaKH7jLOysqCsrIxq1aohNzcXZ86cEYdSp6en48aNG7C3twcAVKtWTXxsVIEP35eXnZ0dcnNzceHCBdSpUwcAcOvWLTx//rzE7ZSVlaGsrPxJbRMRERF9DHl5eaioqHzU33H03yEnV3h+nvJSUMhHlSoC3r59i+zs7M8QFdHHUVRULPLW3KJ88cRYQ0MDHTt2xOjRo5GRkYGAgIAiy6Wnp+Phw4dSy3R0dKCiovJR7U6bNg1HjhxB/fr1MXXqVDg7O0NRURHHjx9HWFgY4uPjYWNjA19fX/Tq1QvLly+HpqYmRo4cCRMTE/j6+gIABg4cCBcXF8ycORNt2rTBX3/9Ver9xaWxs7NDs2bN0Lt3b4SHh0NRURHBwcFQVVXlPzZERET03bGxsUGVKlXK/Acm/XepqDh9ch1amsCyZbn4999/8fjx408PiugT6OjowMjIqNQ864snxsC74dSrVq3Czz//DDMzsyLLNGvWrNCyDRs2oFOnTh/Vpq6uLk6fPo3p06djypQpuH37NnR1deHo6IhZs2ZBW1sbABAREYFBgwahVatWePv2LZo0aYI9e/aIQ6F/+uknrFy5EqGhoZgwYQKaNWuGsWPHYvLkyR8VV4E1a9YgKCgITZo0gZGREcLCwvD3339/9A8BRERERF+CjY0NbGxsoKenB0VFRf6I/4NTU3v+yXXk5wsQhByYm5vzxxT6ZgRBwKtXr8R5noyNjUssLxE+9iZa+qzu3bsHU1NTHDx4sMh7sIvy4sWLd5NwDd4EOWW1LxwhEX0OqSpdPnudjhZF/+D4OW0Ky/3ibQDAYbclX6Wdj5H9bO63DuE/qaPFiE+uY6XKoc8QCX0MBQUFuLm5wcjICGpq/FtDFmhopH9yHfn5Am7deotatWoxMaZvLj09HY8ePYKtrW2J5+NX6TGmwg4fPozMzEw4OjoiLS0NISEhMDc3R5MmTb51aEREREQA3s1vIi8vX2hSUSKi/4qCH/VycnKYGH+PcnJyMHr0aPzzzz/Q1NSEi4sL1q1bx394iIiI6LtRMGyaw6eJ6L+qrNcvJsbfiKenJzw9Pb91GERERERERDJP7lsHQE
RERET0X1W/fn2sWLHiW4dBRJ+IPcZEREREVC4uCy981fbiBtYq9zaDBw/G5s2bxfc6OjpwcnLCmDFjUK1atc8W2549ezgxGdEPgD3GRERERPRDcnd3x4ULF3DhwgVs3LgR8vLy8Pf3/6xt6OvrQ1VV9bPWSURfHxNjIiIiIvohKSkpwcDAAAYGBnBwcEC/fv3w4MEDpKe/eyRRWloa+vTpg2rVqqF69ero2bMn7t69K24/ePBgBAYGYtmyZahVqxaqV6+O0aNHIycnRyzz4VDqW7duoU2bNrC0tISbmxuOHTsGExMT7Nu3DwBw9+5dmJiYYM+ePfDz84OVlRWaNWuGhISEr3RUiKgoTIyJiIiI6IeXlZWFbdu2wdzcHLq6unj9+jXat28PdXV1bN26FTExMVBXV0fXrl3x9u1bcbu4uDikpqZi8+bNmD9/PjZt2oRNmzYV2UZ+fj4CAwOhqqqKnTt3YubMmZg5c2aRZWfMmIE+ffrgr7/+gqWlJfr164fc3K/zzHgiKoz3GBMRERHRD+ngwYOwsbEBALx69QqGhoaIioqCnJwc/vzzT8jJyWH27Nni41zmzp0Le3t7nDp1Cq6urgAAbW1tTJ06FfLy8rC2toaHhwdOnDiBrl27Fmrv6NGjuH37NrZs2QIDAwMAQEhICDp37lyobJ8+fdCsWTMAwLBhw+Du7o7U1FRYW1t/kWNBRCVjjzERERER/ZBcXFzw119/4a+//sKuXbvQpEkTdOvWDffu3cOlS5eQmpoKW1tb2NjYwMbGBtWrV8ebN2+Qmpoq1mFrawt5eXnxvaGhIZ48eVJke8nJyahUqZKYFANArVpFTxxmb28v/n9B+eLqJaIvjz3GRERERPRDUlNTg4WFhfi+Ro0asLOzw7p165Cfn48aNWpg0aJFhbbT19cX/19RUbHQekEQimxPEASx97k0Cgr//2d4wTb5+fll2paIPj8mxkREREQkEyQSCeTk5JCdnQ1HR0fs3LkTFSpUgKam5mep39raGvfv38fjx49RsWJFAEBiYuJnqZuIviwOpSYiIiKiH9Lbt2/x6NEjPHr0CDdv3sTYsWORlZWF5s2b45dffoGuri569uyJM2fO4M6dOzh16hTGjx+PBw8efFR7TZo0QZUqVTB48GBcvXoV8fHxmDFjBgCUuSeZiL4N9hgTERER0Q8pNjZWvMdXQ0MD1tbWWL58OVxcXAAA27Ztw9SpU/Hrr78iKysLRkZGaNSo0Uf3IMvLy2P16tUYNmwYvL29YWZmhrFjxyIgIADKysqfbb+I6PNjYkxERERE5RI3sOgJpb4n8+fPx/z580ssY2BggAULFpRYx4cmTZok9f7MmTNS762trRETEyO+j4+PBwCYm5sDAExNTXH//n2pbbS1tQstI6Kvi4kxEREREdFnsnfvXqirq8PCwgIpKSkIDQ1F3bp1xcSYiL5PTIyJiIiIiD6TzMxMTJkyBWlpadDV1UXjxo0xfvz4bx0WEZWCiTERERER0WfSvn17tG/f/luHQUTlxFmpiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiI6BPVr18fK1asEN+bmJhg37593zCiz+v48Xhoa9fA8+cvvnUoMmPChAlwcnL6rHUeOXIEEokEz58//yz1paamQiKRIDEx8bPU9y3xOcZEREREVC6Vfrf/qu096J1U7m0ePXqEhQsX4tChQ3j48CH09fVRvXp1/Prrr2jcuPEXiFLahQsXoK2t/cXb+VHcunUL/fr1w6lTp6CgoID69etjzZo1qFixYonbRUZGYvDgwUUmehKJBNu3b0ebNm2+TNBlNGHCBMTExHwXyaOLiwvS0tJ4bhaBiTERERER/VDu3r2LNm3aQEtLC2PGjIG9vT1yc3Nx5MgRjBkzBseOHfuoevPy8iCRSCAnV/qgSwMDg49q40f29u1bKCkpFb
mud+/eyMjIwNGjR6GmpoZTp05BEISvHOGPT0lJCUZGRt86jO8Sh1ITERER0Q9l9OjRAIDdu3ejVatWsLKyQtWqVfHbb79h586dYrnly5fDw8MD1tbWcHZ2xqhRo5CVlSWu37hxI+zt7XHgwAG4ubnBwsIC9+7dw5MnT+Dv7w8rKyv89NNP2LZtW6EYPhxKnZSUhPbt28PKygrVq1dHSEiI2NaRI0dgaWmJjIwMqTrGjRuHdu3aAQCePn2Kvn37ok6dOrCysoKHhwdiYmKkyu/atQseHh5iGx07dsSrV6/E9dHR0XB3d4eFhQVq1aqFMWPGAHj3Q4KJiQmuXLkiln3+/AW0tWvg+PH4Io/x06fPERgYAnv7ZjAyqocGDX7Bli17pMo0bdoU/fv3x9ChQ1GhQgU0b968yLoAQE5ODp6enqhVqxaqVq2KgICAz/7jwv3799GxY0fo6upCX18fvr6+SE1NFdcHBASgTZs2mDZtGgwNDaGjo4OJEyciNzcXw4cPh56eHipXrozVq1dL1TtixAjY2tpCTU0NlpaWGDduHHJycgC869GeOHEiLl68CIlEAolEgsjISADAnTt34OvrCw0NDWhpaaFDhw74999/i40/Pz8fkyZNQuXKlaGsrAwnJ6dCw/Xj4uLg5OQEFRUVODs7IyYmRmqoc1FDqU+ePAlXV1eoqalBV1cXnp6eePbsGQBg3759aNSoEXR0dKCvr49WrVohOTn5Iz+B7xsTYyIiIiL6YTx79gyxsbEICAiAmppaofXvDyGVk5PDpEmTcPjwYcyfPx8nT57ElClTpMq/fv0aixcvxqxZs3D48GFUqFABQ4YMwb1797Bx40b8/vvviIqKwpMnT4qN6fXr1+jWrRt0dHSwe/duLF++HMePHxcT08aNG0NLSwt79vx/YpmXl4edO3eibdu2AIA3b96gRo0aiIqKwuHDh9G1a1cMHDgQ58+fBwD8+++/6NevHzp27IgjR45gy5YtaNmypdjrGhUVhTFjxqBr1644ePAgIiIiYG5u/nEHGUB29hs4OVXDxo2LcerUNgQE+KF37zFISLgkVS4qKgoKCgo4efIkli9fXmx9vr6+WLp0qbg/n9urV6/g7u4ODQ0NHDt2DCdOnICGhgZatGiBt2/fiuUOHz6MBw8e4NixY5g7dy4mTJiAVq1aQVdXF2fOnEGfPn3Qp08f3L17V9xGU1MTkZGRuHr1KhYsWIAVK1Zg3rx5AICOHTsiODgY1atXR1paGtLS0tCxY0cIgoA2bdrg6dOnOHr0KA4cOIDk5GR07Nix2H1YsGAB5syZg9mzZ+PSpUvw9PRE69atcfPmTQDAy5cv4ePjA0dHR5w/fx6TJ0/GiBEjSjwuiYmJ8PDwQPXq1XHq1CmcOHECPj4+yMvLAwBkZWVh6NChiI+Px6FDhyAnJ4e2bdsiPz//oz+L7xWHUhMRERHRDyM1NRWCIMDa2rrUsr169RL/38zMDMOHD8eoUaMQFhYmLs/JycG0adNQvXp1AEBycjIOHz6MnTt3onbt2gCAOXPmwNXVtdh2tm3bhuzsbCxYsEBM1qdMmYKAgACMGTMGFStWROvWrbF9+3Z07twZAHDixAlkZGSgVatWAABjY2P06dNHrDMwMBCxsbHYtWsXateujUePHiE3NxdeXl6oXLkyAMDe/v/vBV+4cCF69+6NX3/9VVz2KRM7VapkiIEDA8T3v/3WBQcPnsT27X+hdm1Hcbm1tTVmzpxZYl2HDx/GyJEjMXHiRLRq1QrR0dFo0qQJAGDLli3o2bMnXr58Wez2GRkZ0NDQKLGN6OhoyMnJYeXKlZBIJACAiIgI6Ojo4MiRI/j5558BAHp6eli4cCHk5ORQtWpVzJw5E69evRJHIYwaNQrTp0/HyZMn0alTJwDA2LFjxXbMzc0RHByMjRs3IiQkBKqqqtDQ0ICCgoLUEOYDBw7g0qVLSElJgampKQBg7dq1qF69OuLj41G3bt1C+zB79m
yMGDFCbHfGjBmIjY3F/PnzsWTJEqxbtw4SiQQrVqyAiooKqlWrhvv370ud5x+aOXMmnJ2dsXTpUnFZwbkOQByxUGDVqlUwMDDA1atX4eDgUOIx/69hYvwDuDLRE1paWt86DCIqk4zSi5TT5c9eYxH8v0YjwNedzqe8mn7rAGTWBHz5iZKoaNnZ2UhJSYGBgQFUVFS+WRyVKlUqc9mCnjx9ff1St4uNjcW0adNw9epVvHjxArm5ucjOzoa2tjbU1dWhq6sLJSUlNGvWTEym4uPjoaCggJYtW0JeXl6MT0dHB9ra2lJt6unpoVKlSkhLS4OTk5NUst66dWvk5+cjIyMDNWvWRO/evdGgQQOxvn379sHLywvVqlUD8K4Hefr06di4cSPu37+PN2/e4M2bN+J+GhoawsPDA82aNYOnpyd+/vln+Pn5QVdXF48ePcLDhw/Rpk2bIo9JQY+pgYEBtLTeJbX5+c8BAOrqltDScoS6ejoAQEurOrS0dIqNR1vbCJqa1QFcAAA4OzuX+pmNHDkS/fr1w7Bhw+Dg4AAfHx+sXbsWrVu3xpUrV9CoUaMSt9fU1Cyyp9nGxkb8/3PnzuHWrVvQ1NSUKpOdnS01NLh69epS95AbGhpKJYDy8vLQ19fHo0ePxGVbtmzB/PnzcevWLWRmZiI3N7fUv82TkpJgamoqJsUAUK1aNejo6CApKalQYvzixQs8ePAADRs2lFresGFDXLx4EQBw/fp11KhRQ+q7Wq9evRLjSExMRPv27Ytdn5ycjHHjxuH06dN48uSJ2FN8586dHy4x5lBqIiIiIvph2NjYQCKRICmp5Jmsb9++DS8vLzg4OGDr1q04d+4clixZAgDi/aEAoKqqKibFAMShye8vK40gCMWWL1her149WFlZITo6Gq9fv8b27dvRrVs3sdycOXMwb948hISE4PDhw0hMTISnp6eY1MrLy+PAgQPYu3cvqlWrhkWLFqFq1apISUmBqqpqifEVJILvT3b1/jEoSmnxFFBXVy+xHgC4dOkSatWqBQBo0aIFVq9ejQ4dOmDlypWIiIhAz549S43f2tq60Ot9+fn5qFOnDhITE6VeN27cQJcuXcRyioqKUttJJJIilxUkiKdPn0anTp3QsmVL7Nq1CxcuXMCYMWMKHYcPFXdOlHSuFLRdXPmiti1tArPSzg0fHx+kp6djxYoVOHPmDM6cOQMApe7ffxETYyIiIiL6Yejp6cHT0xNLliyRmkirQMGkQwkJCcjNzcWcOXPw008/wdbWFg8ePCi1/oIZrhMSEsRl169fL/G5sNWqVUNiYqJUPCdPnoScnBxsbW3FZV26dMG6deuwc+dOyMnJwdvbW1x3/Phx+Pr6olu3bqhZsyYsLS3Fe0sLSCQSNGzYEBMnTsSFCxegpKSE7du3Q1NTE+bm5jh06FCR8RU8EiktLU1cVtqjhcoST1mZmJhIzRTerl07LF++HL1794aurm6JPZplVbt2bdy8eRMGBgaFEuhPeXTRyZMnUaVKFYwZMwbOzs6wsbHB7du3pcooKSmJ9+wWqFatGu7cuSN1r/LVq1eRkZEhNQS+gJaWFipVqoQTJ05ILY+LixPL29nZ4dKlS3jz5o24/v3ztCg1atQo9rxIT09HUlISxo4dCw8PD9jb24uTcv2ImBgTERER0Q9l6dKlyMvLQ7169bB161bcvHkTSUlJWLhwoThc2crKCrm5uVi0aBH++ecfrF27FsuWLSu17qpVq6JFixbo1asXzpw5g3PnzuHXX38tseeta9euUFFRgb+/P65cuYLY2FgMGDAA3bt3h6GhoVS58+fPY+rUqfDz85MaEmttbY0DBw4gLi4OSUlJ+O233/Dw4UNx/ZkzZzBt2jQkJCTgzp072LZtGx4/fiwmTRMmTMCcOXOwcOFC3Lx5E+fPn8eiRYsAvOs1/OmnnzB9+nRcvXoVx44dk7pvtiilxVMeISEh+P333zFx4kRcv34dZ8+exaFDh6Cmpo
Zr164VSgY/RteuXVGhQgX4+vri+PHjSElJwdGjRzFo0CDcu3fvo+u1trbGnTt3EB0djeTkZCxcuBDbt2+XKmNubo6UlBQkJibiyZMnePPmDZo1a4YaNWqIn/nZs2fRo0cPuLq6Fjv8fPjw4ZgxYwY2btyI69evY+TIkUhMTMSgQYMAvPthJT8/H71790ZSUhL279+P2bNnAyh+hMOoUaMQHx+Pvn374tKlS7h27RrCw8Px5MkTcfbu33//Hbdu3cLhw4cxdOjQjz5W3zsmxkRERET0Q7GwsMD58+fh7u6O4OBgODg4oHnz5jh06BDCw8MBvJt4au7cuZgxYwYcHBywbt06qUm3ShIREQFTU1O4urril19+Qe/evUt8tJCamhr279+Pp0+fom7duvDz84OHhwcWL14sVc7GxgZ169bFpUuX0LVrV6l148aNQ+3ateHp6Qk3NzcYGRmhTZs24notLS0cO3YMXl5esLW1xdixYzFnzhy0bNkSAODv74/58+dj6dKlqF69Olq1aiXVw7t69Wrk5OTA2dkZgwYNKjQ794dKi6c8fvvtN2zcuFGc0MzHxwcKCgq4du0aunfvjrZt2350b3QBNTU1HDt2DGZmZvjll19gb2+PwMBAvH79+pPm6vH19cWQIUPQv39/ODk5IS4uDuPGjZMq065dO7Ro0QLu7u6oWLEiNmzYAIlEgpiYGOjq6qJJkyZo1qwZLC0tsXHjxmLbGjhwIIKDgxEcHAxHR0fs27cPO3bsEO+l1tLSws6dO5GYmAgnJyeMGTMG48ePB4Bi5wiwtbXFX3/9hYsXL6JevXpo0KAB/vzzTygoKEBOTg7R0dE4d+4cHBwcMGTIEMyaNeujj9X3TiLwydn/WS9evIC2tjYyMjI4+RYRERF9dgWTb1lYWHzTybfovyUvLw8XLlxArVq1xAnK6NtYt24devbsiYyMjFLvJ/5RlfU6xlmpiYiIiIiIfgBr1qyBpaUlTExMcPHiRYwYMQIdOnSQ2aS4PJgYExERERER/QAePnyI8ePH4+HDhzA2Nkb79u0xderUbx3WfwITYyIiIiIioh9ASEgIQkJCvnUY/0mcfIuIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIj+Y9zc3DB48ODPWueECRPg5OT02eqLioqCjo7OZ6uPyicgIABt2rT55Hq+xLn2PeJzjImIiIioXByjHL9qe5f9L5erfEBAAJ4/f46YmJgvE9APatiwYRgwYMBXb3fOnDlYtGgR/v33X5iZmSE4OBi9e/cudTtzc3MMHjy4UNI2YcIExMTEIDEx8csEXA4SiQTbt2//LAlqeS1YsACCIJS5/JEjR+Du7o5nz55J/aCxbds2KCoqfoEIvy9MjImIiIiICBoaGtDQ0PiqbR47dgzDhg3DwoUL4ePjg7t37+LJkydfNYYflba29mepR09P77PU873jUGoiIiIi+qG5ublhwIABGDx4MHR1dWFoaIjff/8dWVlZ6NmzJzQ1NWFlZYW9e/eK2xw5cgQSiQT79+9HrVq1oKqqiqZNm+LRo0fYu3cv7O3toaWlhc6dO+PVq1fidubm5pg/f75U+05OTpgwYYL4XiKRYOXKlWjbti3U1NRgY2ODHTt2SG1z9OhR1KtXD8rKyjA2NsbIkSORm5tb7D4+e/YMPXr0gK6uLtTU1NCyZUvcvHlTqsyKFStgamoKNTU1tG3bFnPnzpXqGSxqKPXq1atRvXp1MY7+/fuL6+bOnQtHR0eoq6vD1NQUffv2RWZmZrExFkVOTg7y8vIICgqCubk5GjdujLZt25arjrKIiIiAvb09VFRUYGdnh6VLl4rrUlNTIZFIsGnTJjRu3BiqqqqoW7cubty4gfj4eDg7O0NDQwMtWrTA48ePxe3i4+PRvHlzVKhQAdra2nB1dcX58+fF9ebm5gCAtm3bQiKRiO
8BIDw8HFZWVlBSUkLVqlWxdu1aqXglEgnCw8PRsmVLqKqqwsLCAps3b5Yqc/nyZTRt2hSqqqrQ19dH7969pY7/h0OpBUHAzJkzYWlpCVVVVdSsWRNbtmwRj4G7uzsAQFdXFxKJBAEBAQAKD6V+8+YNQkJCYGpqCmVlZdjY2GDVqlVl/zC+U0yMiYiIiOiHFxUVhQoVKuDs2bMYMGAA/ve//6F9+/ZwcXHB+fPn4enpie7du0slucC7ZHHx4sWIi4vD3bt30aFDB8yfPx/r16/H7t27ceDAASxatKjc8UycOBEdOnTApUuX4OXlha5du+Lp06cAgPv378PLywt169bFxYsXER4ejlWrVmHKlCnF1hcQEICEhATs2LEDp06dgiAI8PLyQk5ODgDg5MmT6NOnDwYNGoTExEQ0b94cU6dOLTHG8PBw9OvXD71798bly5exY8cOWFtbi+vl5OSwcOFCXLlyBVFRUTh8+DBCQkLKdRxq1aoFExMT9O3bF/n5+eXatqxWrFiBMWPGYOrUqUhKSsK0adMwbtw4REVFSZULDQ3F2LFjcf78eSgoKKBz584ICQnBggULcPz4cSQnJ2P8+PFi+ZcvX8Lf3x/Hjx/H6dOnYWNjAy8vL7x8+RLAu8QZeJeUp6Wlie+3b9+OQYMGITg4GFeuXMFvv/2Gnj17IjY2ViqecePGoV27drh48SK6deuGzp07IykpCQDw6tUrtGjRArq6uoiPj8fmzZtx8OBBqR8uPjR27FhEREQgPDwcf//9N4YMGYJu3brh6NGjMDU1xdatWwEA169fR1paGhYsWFBkPT169EB0dDQWLlyIpKQkLFu27KuPNPgiBPrPysjIEAAIGRkZ3zoUIiIi+gG9fv1auHr1qvD69Wup5Q6RDl/1VV7+/v6Cr6+v+N7V1VVo1KiR+D43N1dQV1cXunfvLi5LS0sTAAinTp0SBEEQYmNjBQDCwYMHxTJhYWECACE5OVlc9ttvvwmenp7i+ypVqgjz5s2TiqdmzZpCaGio+B6AMHbsWPF9ZmamIJFIhL179wqCIAijR48WqlatKuTn54tllixZImhoaAh5eXniPg0aNEgQBEG4ceOGAEA4efKkWP7JkyeCqqqqsGnTJkEQBKFjx46Ct7e3VFxdu3YVtLW1xfehoaFCzZo1xfeVKlUSxowZI5TVpk2bBH19fSE3N1eIj48XVq1aJVX/h/Ly8gQPDw/Bx8dH8PX1FTp27Ci8efNGXF+9enVh9uzZxW5fpUoVQUlJSVBXV5d6KSoqSu2HqampsH79eqltJ0+eLDRo0EAQBEFISUkRAAgrV64U12/YsEEAIBw6dEhcFhYWJlStWrXYeHJzcwVNTU1h586d4jIAwvbt26XKubi4CL169ZJa1r59e8HLy0tquz59+kiVqV+/vvC///1PEARB+P333wVdXV0hMzNTXL97925BTk5OePjwoSAI0t+DzMxMQUVFRYiLi5OqMygoSOjcubMgCP9/zj979kyqzPvn2vXr1wUAwoEDB4o9Dt+b4q5jH2KPMRERERH98GrUqCH+v7y8PPT19eHo+P+TiBkaGgIAHj16VOx2hoaGUFNTg6WlpdSyD7cpbzzq6urQ1NQU60lKSkKDBg0gkUjEMg0bNkRmZibu3btXqK6kpCQoKCigfv364jJ9fX1UrVpV7GG8fv066tWrJ7Xdh+/f9+jRIzx48AAeHh7FlomNjUXz5s1hYmICTU1N9OjRA+np6cjKyipl79/Zt28fTp48icjISGzcuBHp6enw8fFBVlYWsrOzkZycjEaNGpVYx/Dhw5GYmCj16tOnj7j+8ePHuHv3LoKCgsR7qDU0NDBlyhQkJydL1fXhZw2g0Dny/mf96NEj9OnTB7a2ttDW1oa2tjYyMzNx586dEmNOSkpCw4YNpZY1bNhQ/KwKNGjQoND7gjJJSUmoWbMm1NXVperIz8/H9evXC7V59epVZGdno3nz5lLHYc2aNYWOQ0kSEx
MhLy8PV1fXMm/zX8HJt4iIiIjoh/fhrLoSiURqWUES+uFw3g/LFFXP+9vIyckVmgm4YDhzafEU1CMIglRSXLDs/TiLWlfU8oLyJdVZFFVV1WLXAcDt27fh5eWFPn36YPLkydDT08OJEycQFBRU5P4W5dKlSzAzMxMnd4qJicHPP/8MDw8PtGnTBpaWliUm7wBQoUIFqeHdgPRkUQXHdMWKFVI/HADvfiB5X1Hnw4fL3v+sAwIC8PjxY8yfPx9VqlSBsrIyGjRogLdv35a670V9FkV9tsVtV1L5opYXxL17926YmJhIrVNWVi613QKlnRf/ZewxJiIiIiL6TCpWrIi0tDTx/YsXL5CSklKuOqpVq4a4uDipxDUuLg6ampqFkpqC8rm5uThz5oy4LD09HTdu3IC9vT0AwM7ODmfPnpXaLiEhodgYNDU1YW5ujkOHDhW5PiEhAbm5uZgzZw5++ukn2Nra4sGDB+XaTxMTE6SkpIi94Orq6tizZw/evn2LUaNGYcqUKWVKFktiaGgIExMT/PPPP7C2tpZ6WVhYfFLdx48fx8CBA+Hl5SVOUPbhjNqKiorIy8uTWmZvb48TJ05ILYuLixM/qwKnT58u9N7Ozg7Au888MTFRqnf+5MmTkJOTg62tbaFYq1WrBmVlZdy5c6fQcTA1NQUAKCkpAUCheN/n6OiI/Px8HD16tNgy/1VMjImIiIiIPpOmTZti7dq1OH78OK5cuQJ/f/9CPZOl6du3L+7evYsBAwbg2rVr+PPPPxEaGoqhQ4dCTq7wn+82Njbw9fVFr169cOLECXGyJhMTE/j6+gIABgwYgD179mDu3Lm4efMmli9fjr1795aYeE6YMAFz5szBwoULcfPmTZw/f16caMzKygq5ublYtGgR/vnnH6xduxbLli0r1362a9cOZmZm8Pb2xsGDB3Hr1i3s3LkTaWlpUFdXx+rVqz/LhFwTJkxAWFgYFixYgBs3buDy5cuIiIjA3LlzP6lea2trrF27FklJSThz5gy6du1aqEe14MeFhw8f4tmzZwDeDf+OjIzEsmXLcPPmTcydOxfbtm3DsGHDpLbdvHkzVq9ejRs3biA0NBRnz54VJ9fq2rUrVFRU4O/vjytXriA2NhYDBgxA9+7dxWHg79PU1MSwYcMwZMgQREVFITk5GRcuXMCSJUvESciqVKkCiUSCXbt24fHjx0XOMG5ubg5/f38EBgYiJiYGKSkpOHLkCDZt2vRJx/J7wMSYiIiIiOgzGTVqFJo0aYJWrVrBy8sLbdq0gZWVVbnqMDExwZ49e3D27FnUrFkTffr0QVBQEMaOHVvsNhEREahTpw5atWqFBg0aQBAE7NmzRxwK3LBhQyxbtgxz585FzZo1sW/fPgwZMgQqKirF1unv74/58+dj6dKlqF69Olq1aiU+AsrJyQlz587FjBkz4ODggHXr1iEsLKxc+6mmpoa4uDg4OzujZ8+ecHBwwLx58zBz5kzEx8fj6NGjUo8J+li//vorVq5cicjISDg6OsLV1RWRkZGf3GO8evVqPHv2DLVq1UL37t0xcOBAGBgYSJWZM2cODhw4AFNTU9SqVQsA0KZNGyxYsACzZs1C9erVsXz5ckRERMDNzU1q24kTJyI6Oho1atRAVFQU1q1bh2rVqgF4d+z279+Pp0+fom7duvDz84OHhwcWL15cbLyTJ0/G+PHjERYWBnt7e3h6emLnzp3icTAxMcHEiRMxcuRIGBoaFjvDdXh4OPz8/NC3b1/Y2dmhV69eZb6v/HsmEUq6uYC+ay9evIC2tjYyMjKgpaX1rcMhIiKiH0x2djZSUlJgYWFRYgJF/029evXCtWvXcPz48c9ab15eHi5cuIBatWqVu7ec3pFIJNi+fbvUc4jp45T1OsbJt4iIiIiIZMDs2bPRvHlzqKurY+/evYiKisLSpUu/dVhE3wUmxkREREREMuDs2bOYOXMmXr58CUtLSyxcuBC//vrrtw6L6LvAxJiIiI
iISAb8CBMkyQre7fr1cfItIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIi+gJSU1MhkUiQmJj4SfW4ublh8ODB4ntzc3PMnz//k+r83CQSCWJiYr5oGxMmTICTk9MXbeNT2y7vZxMZGQkdHZ2Pjos+Hz7HmIiIiIjKJcnO/qu2Z38tqVzlAwICEBUVhd9++w3Lli2TWte3b1+Eh4fD398fkZGRZarvyJEjcHd3x7Nnz76LJCY+Ph7q6uqfXI+FhQXCw8PRokWLQuvMzc1x+/ZtbNiwAZ06dZJaV716dVy9ehUREREICAgAAKSlpUFXVxfAux8E6tati4SEBNSpU+eT4/wv+VyfDX197DEmIiIioh+OqakpoqOj8fr1a3FZdnY2NmzYADMzs28Y2aerWLEi1NTUPqmOS5cuIT09He7u7sWWMTU1RUREhNSy06dP4+HDh4WSPyMjIygrK39STP9lb9++BfB5Phv6NpgYExEREdEPp3bt2jAzM8O2bdvEZdu2bYOpqSlq1aolVVYQBMycOROWlpZQVVVFzZo1sWXLFgDvej8LkkddXV1IJBKxl3Tfvn1o1KgRdHR0oK+vj1atWiE5OblQLNeuXYOLiwtUVFRQvXp1HDlyRGr90aNHUa9ePSgrK8PY2BgjR45Ebm5usfv24XDd58+fo3fv3jA0NISKigocHBywa9euEo/Pn3/+CU9PzxKT2a5du+Lo0aO4e/euuGz16tXo2rUrFBSkB56+P5Ta2toaAODs7AyJRAI3NzcAQH5+PiZNmoTKlStDWVkZTk5O2Ldvn1Q99+7dQ6dOnaCnpwd1dXU4OzvjzJkzUmXWrl0Lc3NzaGtro1OnTnj58qW47s2bNxg4cCAMDAygoqKCRo0aIT4+Xlx/5MgRSCQSHDp0CM7OzlBTU4OLiwuuX78u1cb06dNhaGgITU1NBAUFITs7W2p9QEAA2rRpg7CwMFSqVAm2trYACn82c+fOhaOjI9TV1WFqaoq+ffsiMzOz2GNO3w4TYyIiIiL6IfXs2VOqx3P16tUIDAwsVG7s2LGIiIhAeHg4/v77bwwZMgTdunXD0aNHYWpqiq1btwIArl+/jrS0NCxYsAAAkJWVhaFDhyI+Ph6HDh2CnJwc2rZti/z8fKn6hw8fjuDgYFy4cAEuLi5o3bo10tPTAQD379+Hl5cX6tati4sXLyI8PByrVq3ClClTyrSP+fn5aNmyJeLi4vDHH3/g6tWrmD59OuTl5UvcbseOHfD19S2xjKGhITw9PREVFQUAePXqFTZu3FjkMXzfqVOnAAD79+9HWlqa+OPEggULMGfOHMyePRuXLl2Cp6cnWrdujZs3bwIAMjMz4erqigcPHmDHjh24ePEiQkJCpI5ncnIyYmJisGvXLuzatQtHjx7F9OnTxfUhISHYunUroqKicP78eVhbW8PT0xNPnz6VinHMmDGYM2cOEhISoKCgILVPmzZtQmhoKKZOnYqEhAQYGxtj6dKlhfbz0KFDSEpKwoEDB4r9IUJOTg4LFy7ElStXEBUVhcOHDyMkJKTE40ffBu8xJiIiIqIfUvfu3TFq1ChxEqyTJ08iOjpaqsc2KysLc+fOxeHDh9GgQQMAgKWlJU6cOIHly5fD1dUVenp6AAADAwOpe4zbtWsn1d6qVatgYGCAq1evwsHBQVzev39/sWx4eDj27duHVatWISQkBEuXLoWpqSkWL14MiUQCOzs7PHjwACNGjMD48eMhJ1dyP9bBgwdx9uxZJCUlib2WlpaWJW5z//59XLx4EV5eXiUfQACBgYEIDg7GmDFjsGXLFlhZWZU6CVXFihUBAPr6+jAyMhKXz549GyNGjBDvWZ4xYwZiY2Mxf/58LFmyBOvXr8fjx48RHx8vHvOC3ucC+fn5iIyMhKamJoB3n/GhQ4cwdepUZGVlITw8HJGRkWjZsiUAYMWKFT
hw4ABWrVqF4cOHi/VMnToVrq6uAICRI0fC29sb2dnZUFFRwfz58xEYGIhff/0VADBlyhQcPHiwUK+xuro6Vq5cCSUlpWKPxfuTpllYWGDy5Mn43//+V2SiTd8We4yJiIiI6IdUoUIFeHt7IyoqChEREfD29kaFChWkyly9ehXZ2dlo3rw5NDQ0xNeaNWuKHBb9vuTkZHTp0gWWlpbQ0tKChYUFAODOnTtS5QoSbgBQUFCAs7MzkpLeTSiWlJSEBg0aQCKRiGUaNmyIzMxM3Lt3r9R9TExMROXKlcWkuCx27NiBhg0bislnSby9vZGZmYljx44V2+NeFi9evMCDBw/QsGFDqeUNGzYUj0ViYiJq1apVYlzm5uZiUgwAxsbGePToEYB3n0dOTo5UG4qKiqhXr57YRoEaNWpI1QFArKfgM3nfh+8BwNHRscSkGABiY2PRvHlzmJiYQFNTEz169EB6ejqysrJK3I6+PvYYExEREdEPKzAwEP379wcALFmypND6gmG6u3fvhomJidS60iaT8vHxgampKVasWIFKlSohPz8fDg4O4kRMJSlIhAVBkEqKC5a9X6YkqqqqpZb5UFmGURdQUFBA9+7dERoaijNnzmD79u3lbu99Re1rwbKy7IuiomKh+go+w+KOW1HH+P16CtZ9OAS+NKXNPn379m14eXmhT58+mDx5MvT09HDixAkEBQUhJyenXG3Rl8ceYyIiIiL6YbVo0QJv377F27dv4enpWWh9tWrVoKysjDt37sDa2lrqZWpqCgBir2BeXp64XXp6OpKSkjB27Fh4eHjA3t4ez549KzKG06dPi/+fm5uLc+fOwc7OTmw/Li5OTOoAIC4uDpqamoUS9aLUqFED9+7dw40bN8pwNN7dxxsbG4vWrVuXqTzw7seFo0ePwtfXV3wkU0mKOl5aWlqoVKkSTpw4IVU2Li4O9vbvHv9Vo0YNJCYmFrofuKysra2hpKQk1UZOTg4SEhLENsrC3t5e6jMDUOh9WSQkJCA3Nxdz5szBTz/9BFtbWzx48KDc9dDXwR5jIiIiIvphycvLi8Noi5qQSlNTE8OGDcOQIUOQn5+PRo0a4cWLF4iLi4OGhgb8/f1RpUoVSCQS7Nq1C15eXlBVVYWuri709fXx+++/w9jYGHfu3MHIkSOLjGHJkiWwsbGBvb095s2bh2fPnolDkvv27Yv58+djwIAB6N+/P65fv47Q0FAMHTq01PuLAcDV1RVNmjRBu3btMHfuXFhbW+PatWuQSCRFPp943759sLGxKfU+5PfZ29vjyZMnZX4MkYGBAZSVlbF//35UqVIFKioq0NbWxvDhwxEaGirepxwREYHExESsW7cOANC5c2dMmzZNnO3Z2NgYFy5cQKVKlYocyvwhdXV1/O9//8Pw4cOhp6cHMzMzzJw5E69evUJQUFCZ93fQoEHw9/eHs7MzGjVqhHXr1uHvv/8u1zEDACsrK+Tm5mLRokXw8fHByZMnCz1Xm74f7DEmIiIioh+alpYWtLS0il0/efJkjB8/HmFhYbC3t4enpyd27twp3jNsYmKCiRMnYuTIkTA0NET//v0hJyeH6OhonDt3Dg4ODhgyZAhmzZpVZP3Tp0/HjBkzULNmTRw/fhx//vmneK+ziYkJ9uzZg7Nnz6JmzZro06cPgoKCMHbs2DLv39atW1G3bl107twZ1apVQ0hIiFRv7fv+/PPPMg+jfp++vn6Zh20rKChg2LBh4hDzgvYGDhyI4OBgBAcHw9HREfv27cOOHTtgY2MD4F1P819//QUDAwN4eXnB0dGxTDNsv2/69Olo164dunfvjtq1a+PWrVvYv39/mXq6C3Ts2BHjx4/HiBEjUKdOHdy+fRv/+9//yrx9AScnJ8ydOxczZsyAg4MD1q1bh7CwsHLXQ1+HRHh/3Ab9p7x48QLa2trIyMgo8WJPRERE9DGys7ORkpICCwsLqKiofOtw6BPl5eXBwMAAe/fuRb169b5oOxcuXE
CtWrXKldQSfQllvY6xx5iIiIiISAakp6djyJAhqFu37rcOhei7w3uMiYiIiIhkgIGBQbmGaBPJEvYYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERER0Tc0YcIEODk5fbb6JBIJYmJiPnp7c3NzzJ8//7PFU5zIyEjo6Oh88XZ+ZAEBAWjTps03jeFH+RyZGBMRERHRDyUgIAASiQR9+vQptK5v376QSCQICAj4+oH9oD53Yv+hOXPmwNzcHKqqqqhatSp+//33Mm974cIFtG/fHoaGhlBRUYGtrS169eqFGzdufLF4v4TU1FRIJBIkJiZKLV+wYAEiIyO/WLvm5uaQSCTFvtzc3NCxY8f/3PEsisK3DoCIiIiI/luW9Dn8Vdvrt6xpubcxNTVFdHQ05s2bB1VVVQBAdnY2NmzYADMzs88dIn0hx44dw7Bhw7Bw4UL4+Pjg7t27ePLkSZm23bVrF9q1awdPT0+sW7cOVlZWePToETZv3oxx48Zh48aNXzj6L09bW/uL1h8fH4+8vDwAQFxcHNq1a4fr169DS0sLAKCkpARVVVXxO/Zfxh5jIiIiIvrh1K5dG2ZmZti2bZu4bNu2bTA1NUWtWrWkyhY1dNjJyQkTJkwQ30+YMAFmZmZQVlZGpUqVMHDgQHHdmzdvEBISAlNTUygrK8PGxgarVq0CUPQw05iYGEgkkmJjz8/Px6RJk1C5cmUoKyvDyckJ+/btE9e/ffsW/fv3h7GxMVRUVGBubo6wsLBi65s0aRIMDQ3F3sa4uDg0adIEqqqqMDU1xcCBA5GVlVXs9hkZGejduzcMDAygpaWFpk2b4uLFi+L+TZw4ERcvXhR7EaOiogAA8+bNg6OjI9TV1WFqaoq+ffsiMzOz2HaKIicnB3l5eQQFBcHc3ByNGzdG27ZtS93u1atX6NmzJ7y8vLBjxw40a9YMFhYWqF+/PmbPno3ly5eLZY8ePYp69epBWVkZxsbGGDlyJHJzc8X1bm5uGDhwIEJCQqCnpwcjIyOpcwN4N3x95cqVaNu2LdTU1GBjY4MdO3ZIlbl69Sq8vLygoaEBQ0NDdO/eXSrJz8/Px4wZM2BtbQ1lZWWYmZlh6tSpAAALCwsAQK1atcSeWkB6KPXy5cthYmKC/Px8qXZbt24Nf39/8f3OnTtRp04dqKiowNLSEhMnTpTa3/dVrFgRRkZGMDIygp6eHgDAwMBAatmH5/jFixfh7u4OTU1NaGlpoU6dOkhISAAA3L59Gz4+PtDV1YW6ujqqV6+OPXv2ACj7d6U88ZcHE2MiIiIi+iH17NkTERER4vvVq1cjMDCw3PVs2bIF8+bNw/Lly3Hz5k3ExMTA0dFRXN+jRw9ER0dj4cKFSEpKwrJly6ChofHRcS9YsABz5szB7NmzcenSJXh6eqJ169a4efMmAGDhwoXYsWMHNm3ahOvXr+OPP/6Aubl5oXoEQcCgQYOwatUqnDhxAk5OTrh8+TI8PT3xyy+/4NKlS9i4cSNOnDiB/v37FxmLIAjw9vbGw4cPsWfPHpw7dw61a9eGh4cHnj59io4dOyI4OBjVq1dHWloa0tLS0KFDBwDvktqFCxfiypUriIqKwuHDhxESElKuY1GrVi2YmJigb9++hRK+kuzfvx9Pnjwptr2CBOz+/fvw8vJC3bp1cfHiRYSHh2PVqlWYMmWKVPmoqCioq6vjzJkzmDlzJiZNmoQDBw5IlZk4cSI6dOiAS5cuwcvLC127dsXTp08BAGlpaXB1dYWTkxMSEhKwb98+/Pvvv+KxAoBRo0ZhxowZGDduHK5evYr169fD0NAQAHD27FkAwMGDB5GWlib1g0+B9u3b48mTJ4iNjRWXPXv2DPv370fXrl3F49KtWzcMHDgQV69exfLlyxEZGSkm4J9D165dUblyZcTHx+PcuXMYOXIkFBUVAQD9+vXDmzdvcOzYMV
y+fBkzZswo13flS8bPodRERERE9EPq3r07Ro0aJd6fefLkSURHR+PIkSPlqufOnTswMjJCs2bNoKioCDMzM9SrVw8AcOPGDWzatAkHDhxAs2bNAACWlpafFPfs2bMxYsQIdOrUCQAwY8YMxMbGYv78+ViyZAnu3LkDGxsbNGrUCBKJBFWqVClUR25uLnr06IGEhAScPHkSlStXBgDMmjULXbp0weDBgwEANjY2WLhwIVxdXREeHg4VFRWpemJjY3H58mU8evQIysrKYnwxMTHYsmULevfuDQ0NDSgoKMDIyAgAxKG3gwYNgry8PIB3PZ6TJ0/G//73PyxdurRMxyE/Px++vr6oWbMmnj9/ji5dumDNmjVQUlICADg4OKBnz54IDg4utG3Bjwh2dnYltrF06VKYmppi8eLFkEgksLOzw4MHDzBixAiMHz8ecnLv+hFr1KiB0NBQ8ZgtXrwYhw4dQvPmzcW6AgIC0LlzZwDAtGnTsGjRIpw9exYtWrRAeHg4ateujWnTponlV69eDVNTU9y4cQPGxsZYsGABFi9eLPbuWllZoVGjRgDe9dwCgL6+vnicP6Snp4cWLVpg/fr18PDwAABs3rwZenp64vupU6di5MiRYhuWlpaYPHkyQkJCxP37VHfu3MHw4cPFY29jYyO1rl27duIPS+X9rnzJ+JkYExEREdEPqUKFCvD29kZUVJTY81mhQoVy19O+fXvMnz8flpaWaNGiBby8vODj4wMFBQUkJiZCXl4erq6unyXmFy9e4MGDB2jYsKHU8oYNG4rDlwMCAtC8eXNUrVoVLVq0QKtWrfDzzz9LlR8yZAiUlZVx+vRpqX0+d+4cbt26hXXr1onLBEFAfn4+UlJSYG9vL1XPuXPnkJmZCX19fanlr1+/RnJycon7EhsbixkzZuDq1at48eIFcnNzkZ2djaysLKirq5d6LPbt24eTJ0/i/v37UFdXR6tWreDj44Nt27ZBXl4eycnJYuL4IUEQSq0fAJKSktCgQQOp4boNGzZEZmYm7t27J96PXqNGDantjI2N8ejRI6ll75dRV1eHpqamWObcuXOIjY0tsnc0OTkZz58/x5s3b8QE9mN17doVvXv3xtKlS6GsrIx169ahU6dO4g8U586dQ3x8vFQPa15eHrKzs/Hq1Suoqal9UvsAMHToUPz6669Yu3YtmjVrhvbt28PKygoAMHDgQPzvf//DX3/9hWbNmqFdu3aFjm1JvmT8HEpNRERERD+swMBAREZGIioqqthh1HJycoUSqZycHPH/TU1Ncf36dSxZsgSqqqro27cvmjRpgpycnFInHSqt7uJ8eF+lIAjistq1ayMlJQWTJ0/G69ev0aFDB/j5+UmVb968Oe7fv4/9+/dLLc/Pz8dvv/2GxMRE8XXx4kXcvHlTTF4+LG9sbCxVPjExEdevX8fw4cOLjT8tLQ0+Pj5wcHDA1q1bce7cOSxZsqTM+w8Aly5dgpmZGfT09KCsrIyYmBhkZmbCw8ND/KGioOf+Q7a2tgCAa9euldjG+8f1/WWA9GdQMBS4gEQiKTS0u6Qy+fn58PHxKXQcb968Kd7v/Tn4+PggPz8fu3fvxt27d3H8+HF069ZNXJ+fn4+JEydKxXD58mXcvHmz0GiBjzVhwgT8/fff8Pb2xuHDh1GtWjVs374dAPDrr7/in3/+Qffu3XH58mU4Oztj0aJFAMr2XfmS8bPHmIiIiIh+WC1atMDbt28BAJ6enkWWqVixItLS0sT3L168QEpKilQZVVVVtG7dGq1bt0a/fv1gZ2eHy5cvw9HREfn5+Th69Kg4lPrDul++fCnVS/rhI3fep6WlhUqVKuHEiRNo0qSJuDwuLk4qCdTS0kLHjh3RsWNH+Pn5oUWLFnj69Kk4QVLr1q3h4+ODLl26QF5eXhyWXbt2bfz999+wtrYu6bCJateujYcPH0JBQaHI+5iBdzMTFwyfLpCUlITc3FzMmTNHHI68adOmMrVZwMTEBC
kpKbh37x4qV64MdXV17NmzB+7u7hg1ahS2bdtW7CRmP//8MypUqICZM2eKSdn7nj9/Dh0dHVSrVg1bt26VSpDj4uKgqakJExOTcsVbktq1a2Pr1q0wNzeHgkLhFMzGxgaqqqo4dOgQfv3110LrC4aPf3icP6SqqopffvkF69atw61bt2Bra4s6depIxXH9+vUyf/4fy9bWFra2thgyZAg6d+6MiIgIcdI0U1NT9OnTB3369MGoUaOwYsUKDBgwoEzflS8ZP3uMiYiIiOiHJS8vj6SkJCQlJYnDST/UtGlTrF27FsePH8eVK1fg7+8vVTYyMhKrVq3ClStX8M8//2Dt2rVQVVVFlSpVYG5uDn9/fwQGBiImJgYpKSk4cuSImATWr18fampqGD16NG7duoX169eX+tzZ4cOHY8aMGdi4cSOuX7+OkSNHIjExEYMGDQLwbrbn6OhoXLt2DTdu3MDmzZthZGRUaEbftm3bYu3atejZsye2bNkCABgxYgROnTqFfv36iT2WO3bswIABA4qMpVmzZmjQoAHatGmD/fv3IzU1FXFxcRg7dqw407C5uTlSUlKQmJiIJ0+e4M2bNzAxMUFubi4WLVokHrNly5aV+nm9r127djAzM4O3tzcOHjyIW7duYefOnUhLS4O6ujpWr15d7IRc6urqWLlyJXbv3o3WrVvj4MGDSE1NRUJCAkJCQsRnXPft2xd3797FgAEDcO3aNfz5558IDQ3F0KFDxYT+c+jXrx+ePn2Kzp074+zZs/jnn3/w119/ITAwEHl5eVBRUcGIESMQEhKCNWvWIDk5GadPnxZnNzcwMICqqqo4aVdGRkaxbXXt2hW7d+/G6tWrpXqLAWD8+PFYs2aN2KublJSEjRs3YuzYsZ9lP1+/fo3+/fvjyJEjuH37Nk6ePIn4+HhxiP7gwYOxf/9+pKSk4Pz58zh8+LC4rizflS8ZPxNjIiIiIvqhaWlpic9dLcqoUaPQpEkTtGrVCl5eXmjTpo3UsGIdHR2sWLECDRs2RI0aNXDo0CHs3LlTvO82PDwcfn5+6Nu3L+zs7NCrVy/x8Ud6enr4448/sGfPHjg6OmLDhg2FHvXzoYEDByI4OBjBwcFwdHTEvn37sGPHDnESIw0NDcyYMQPOzs6oW7cuUlNTsWfPniITOT8/P0RFRaF79+7Ytm0batSogaNHj+LmzZto3LgxatWqhXHjxsHY2LjIWCQSCfbs2YMmTZogMDAQtra26NSpE1JTU8UZk9u1a4cWLVrA3d0dFStWRHR0NKpWrYrZs2djxowZcHBwwLp160p8pFRR1NTUEBcXB2dnZ/Ts2RMODg6YN28eZs6cifj4eBw9elScRKwovr6+iIuLg6KiIrp06QI7Ozt07twZGRkZ4qzTJiYm2LNnD86ePYuaNWuiT58+CAoK+myJYoFKlSrh5MmTyMvLg6enJxwcHDBo0CBoa2uLn9u4ceMQHByM8ePHw97eHh07dhTvUVZQUMDChQuxfPlyVKpUCb6+vsW21bRpU+jp6eH69evo0qWL1DpPT0/s2rULBw4cQN26dfHTTz9h7ty5RU7g9jHk5eWRnp6OHj16wNbWFh06dEDLli0xceJEAO96vPv16wd7e3u0aNECVatWFSdjK8t35UvGLxHKemc6fXdevHgBbW1tZGRklHixJyIiIvoY2dnZSElJgYWFxWe7/5B+fHl5ebhw4QJq1apVbC890ddS1usYe4yJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimabwrQOgT/fT+p8gr1r0VPibwnK/cjRf356aVqUXIiKir6qjxYhvHQJ9BrmaEuS6q+OtahbkFHK+aSxP5F5+0/ap7PLz8791CETlxh5jIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIvrK7t69CxMTE1y5cuWz1lu/fn2sWLHis9Xn5+eH8ePHf7b6fh
Rubm4YPHjwV20zICAAbdq0+aptfo8mTJgAJyenz14vE2MiIiIi+qEMHjwYJiYmWLx4sdTyffv2wcTE5BtF9XXs2bMH3bp1+6ptpqeno1evXqhWrRrs7OzQvn17pKamlrrdkSNHIJFI4ODggLy8PKl1Ojo6iIyM/DIB03fhe0v0OfkWEREREZXLouDOX7W9AXM2lHsbFRUVLF26FN26dYOOjs7nD6oEOTk5UFRU/KptFtDX1//qbU6dOhWXLl1CVFQUKlSogEuXLpVr++TkZKxZswY9e/b8QhEW7e3bt1BSUvqqbdL3iz3GRERERPTDadSoESpWrFio1/hD8fHx+OWXX2BlZQVnZ2eMGzcOr169EtebmJhg3759UtvY29tj48aNAP5/SPSOHTvg5+cHS0tLbNu2Dfn5+Zg3bx7q1KkDCwsLNG/eHLGxsSXGcurUKXh7e8PCwgK1atXCtGnTkJv7/08YyczMRP/+/WFtbY1atWrh999/LzTU+cOh1BkZGQgJCUHNmjVhaWmJpk2b4sCBAwCAp0+fom/fvqhTpw6srKzg4eGBmJiYkg9sEeTk5ODs7Iy6devCwsICPj4+MDc3L/P2AwYMQGhoKLKzs4stk5GRgd69e8PAwABaWlpo2rQpLl68KK4vqvdx8ODBcHNzE9+7ubmhf//+GDp0KCpUqIDmzZsDAI4ePYp69epBWVkZxsbGGDlypNRxz8rKQo8ePaChoQFjY2PMmTOnUHxv375FSEgITExMoK6ujvr16+PIkSNi7KqqqoXOo23btkFdXR2ZmZkAgPv376Njx47Q1dWFvr4+fH19S+x5f/PmDQYOHAgDAwOoqKigUaNGiI+PF9cX9Mjv3r0bNWvWhIqKCurXr4/Lly+LZSIjI6Gjo4Ndu3ahatWqUFNTg5+fH7KyshAVFQVzc3Po6upiwIABUr36Je3v+/Xu378f9vb20NDQQIsWLZCWlgbg3XDoqKgo/Pnnn5BIJJBIJOL2I0aMgK2tLdTU1GBpaYlx48YhJ+fLz4rPxJiIiIiIfjjy8vIYOXIkIiIi8ODBgyLLJCUloWvXrmjZsiUOHDiA8PBwnD17FmPGjCl3e9OmTUNgYCCOHDkCV1dXrFy5EsuXL8f48eNx4MABuLm5oWfPnvjnn3+K3D4tLQ3du3dHzZo1ceDAAYSFhWHDhg1YsGCBWGbixImIj49HREQENmzYgLNnz0olOR/Kz89Ht27dkJCQgEWLFiE2NhajRo2CvPy7x3y+efMGNWrUQFRUFA4fPoyuXbti4MCBOH/+fLn2/eeff8bu3btLTfyLM3jwYOTm5hb7I4YgCPD29sbDhw+xZ88enDt3DrVr14aHhweePn1arraioqKgoKCAkydPYvny5bh//z68vLxQt25dXLx4EeHh4Vi1ahWmTJkibjN8+HDExsZi+/bt+Ouvv3DkyBGcO3dOqt6ePXvi5MmTiI6OxqVLl9C+fXu0aNECN2/ehLa2Nry9vbFu3TqpbdavXw9fX19oaGjg1atXcHd3h4aGBo4dO4YTJ06IyeTbt2+L3JeQkBBs3boVUVFROH/+PKytreHp6VnomAwfPhyzZ89GfHw8DAwM0Lp1a6lE89WrV1i4cCGio6Oxb98+HDlyBL/88gv27NmDPXv2YO3atfj999+xZcuWMu3v+/XOnj0ba9euxbFjx3Dnzh0MGzYMADBs2DB06NBBTJbT0tLg4uICANDU1ERkZCSuXr2KBQsWYMWKFZg3b155PuaPwsSYiIiIiH5ILVu2RLVq1Yrs4QOA8PBwtGnTBr169YKlpSXq1q2LyZMnY8uWLSX2Xhbl119/hZeXF8zMzGBkZITly5ejb9++8PX1hbW1NcaMGYPq1atj5cqVRW4fFRWFSpUqYerUqbC2tkaLFi0QHByM5cuXIz8/H5mZmdi8eTPGjRuHxo0bw87ODnPnzi10b+77jh8/jsTERKxYsQJNmjRBlSpV0Lx5czRt2hQAYGxsjD59+s
DBwQFVqlRBYGAgXF1dsWvXrjLv940bN9C/f38MGzYMw4YNw86dO8V1CQkJkEgkSE9PL7EONTU1hIaGIiwsDBkZGYXWx8bG4vLly9i8eTOcnZ1hY2OD2bNnQ0dHRypZKwtra2vMnDkTVatWhZ2dHZYuXQpTU1MsXrwYdnZ2aNOmDSZOnIg5c+aIx33VqlWYPXs2mjdvDkdHR0RFRUkd9+TkZGzYsAGbN29G48aNYWVlhWHDhqFRo0aIiIgAAHTt2hUxMTHiaIQXL15g9+7d4v3g0dHRkJOTw8qVK+Ho6Ah7e3tERETgzp07Uj2xBbKyshAeHo5Zs2aJ5/mKFSugqqqKVatWSZUNDQ2Viv3ff//F9u3bxfU5OTkIDw9HrVq10KRJE/j5+eHEiRNYtWoVqlWrhlatWsHd3V384aMs+1tQ77Jly+Ds7IzatWujf//+OHToEABAQ0MDqqqqUFZWhpGREYyMjMRh7WPHjoWLiwvMzc3h4+OD4OBgbNq0qVyf88fgPcZERERE9MMaM2YMOnTogN9++63QusuXLyM1NVUqSRAEAfn5+bh79y5sbGzK3E7Nmv/X3n2H93T+jx9/vrOnhBAZZElE7EiMGBnEqJXUCqJE1KpNUVSoGo3WqNq04dOaVa1RsyQxUkUqZqwI2lQaFLFCxvn94Zfz9ZYhOgR5Pa4r19X3ue9zn9c5ua/U633f575rqf999+5dUlNTqVu3rlYdb29vzpw5k+/5Fy9exMvLC41Gox6rW7cu9+/f59q1a9y+fZvMzEw8PT3V8lKlSlGpUqUCYzp9+jS2trYF1snOzmb+/Pls2bKFa9eu8fjxYx4/foyJiUmR7hlg1qxZBAQEMHjwYPz9/QkJCeGvv/6iefPmnD59mipVqhTpvec+ffowe/ZsIiMjmT59ulZZfHw89+7dy9POw4cPSUpKKnKs8OR38LTExER8fHy0nnujRo24d+8ev//+O7du3eLx48f4+Pio5WXKlMHd3V39/Ouvv6IoCpUrV9Zq+9GjR2rMbdq0QU9Pj82bN9O1a1e+++47zM3NadGihXqPFy9exNzcXKuNjIyMfO8xKSmJzMxMGjVqpB7T19enXr16JCYmatXNL/an65iYmGj1kfLly+Pk5ISZmZnWsbS0tCLfb37t2traqm0UZsOGDcydO5eLFy9y7949srKyKFWq1HPP+6ckMRZCCCGEEG+sBg0a4OfnxyeffEKXLl20ynKnGoeHh+c5L3f1ao1Gg6IoWmX5ve9obGyc59jTyRY8SbqfPVZY2dPXfTaG5x2HJwuQFWbJkiUsW7aMjz76iCpVqqgjty/yPmdiYiKdO3cGoHr16qxYsYLu3btz9epVjh8/XuQFtfT09Jg6dSphYWEMHjxYqywnJwdbW9t8R05zF1bT0dEp0u/J1NRU63Nhzz2/331+cnJy0NXVJT4+Xp2mnis3uTQwMKBTp06sXr2arl27snr1akJCQtDT01Pb8PLyyjPdGqBcuXJ5jj0d4/PuJz9P13l2oTiNRpPvsZycnCLfb0HtPu95Hjp0iK5du/LRRx/RsmVLLCwsWLt2bYGzPv5NkhgLIYQQQog32vjx42nRogUuLi5ax2vUqMG5c+dwdnYu8FwrKyv+/PNP9fOlS5d4+PBhodczNzfHxsaGw4cP06BBA/V4fHx8gfuvurm5sW3bNq3E5ujRo+qCTxYWFujr65OQkKAm7Xfv3iU5OVnrGk/z8PDg2rVrJCUl5Ttq/Msvv9CyZUs6duwIPEl4kpOTX2ikPPc+c9WtW5dly5YRFhaGlZVVniS3MJ07d+bTTz/lo48+0jpep04dUlNT0dPTK3BRr3LlyuXZEzohIeG5q4NXrVqV7777Tuu5x8XFYW5ujr29PaVLl0ZfX59Dhw7h4OAAwK1btzh//jx+fn4AeHp6kp2dTVpaGk2aNCnwWqGhobRo0YLTp08THR3Nxx9/rHWP69atUxcXex5XV1cMDAw4cOAA3bt3B558EX
D06NE8+yvnF3uVKlWee42CFPV+n8fAwCDPqwAHDx7E0dFR6z3/K1eu/O1rvAh5x1gIIYQQQrzRPDw8ePvtt7XefwR47733iI+PZ/z48Zw6dYpLly6xa9cuPvzwQ7VOo0aNWLFiBSdPnuT48eN88MEHRdqKacCAASxcuJBNmzZx8eJFpk+fzunTp+nTp0++9Xv16sUff/zBhx9+yMWLF9m5cyezZs2iX79+6OjoYGZmRufOnZk6dSoHDx7k3LlzjBw5Eh0dnQJHCH18fKhfvz79+vVTFz/au3ev+q6ok5MT+/bt48iRI1y4cIGxY8dy/fr1oj5WAAYOHEh0dDTjx4/n7NmznDp1ipiYGPT09Lh+/brWO8dF8cknn/DVV19x//599VhgYCA+Pj4EBwezc+dOLl++TFxcHB9++CFHjx4FoGnTphw9epT//e9/XLhwgUmTJuVJlPPz3nvv8dtvvzFkyBDOnj3Lpk2bmDRpkvpszczM6NOnD6NHj2bPnj2cOnWKsLAwdHT+L42qXLkyoaGh9OzZk40bN5KcnMyRI0eIjIxk27Ztaj0/Pz/Kly9PaGgoTk5OWl9ohIaGUrZsWYKCgti/fz/JycnExsYybNgwfv/99zxxm5qaMnDgQEaPHs2OHTs4c+YMffv25cGDB3n62JQpU7RiL1u27D/aP7io9/s8Tk5OnDhxgnPnznHjxg0yMzNxdXXl6tWrrF27lqSkJObNm6f1qsN/SRJjIYQQQgjxxhszZkyeaZy5o4XJycl06NCBli1bMnPmTKytrdU6ERER2NnZ0aFDBwYNGsSAAQPynTb9rD59+tC/f3+mTJlCYGAg0dHRREVF5Rm1zmVra8vXX39NQkICzZs354MPPqBbt24MGzZMrTNp0iS8vLzo1asXXbt2pW7duri5uRU6ZXrZsmXUqlWL9957j4CAAKZNm6aO0g0fPpwaNWoQGhpKp06dKFeuHC1btnzuvT0tICCAdevWkZiYSFBQEF26dOGPP/5g5cqVTJo0ibCwMOLi4orcXtOmTWnatKnWdkkajYZt27bh6+tLeHg4lStXpmvXrly+fJny5csD0LJlSyZOnMiYMWOoW7cud+/epWfPns+9nr29Pdu2bePw4cPUqlWLAQMG0KdPH60vRz799FN8fX1p3749gYGBNG7cGC8vL612oqKi6NmzJ6NGjcLd3Z327dvzyy+/ULFiRa376NatG8ePHyc0NFTrfBMTE/bt24eDgwMdOnTAw8OD8PBwHj58WOAI8ieffELHjh155513qFOnjvqFSunSpfPUGzZsGF5eXly7do3Nmzf/4/2bi3K/z9O3b1/c3d3x9vamXLlyHDx4kKCgIEaMGMHgwYOpXbs2cXFxTJw48R/FWlQapSgT58UrKT09HQsLCzwWeaBrrJtvnfUzsvI9/ibZVqvgRSeEEEIUjxDnscUdgvgXZJlruBtgiqOdA0Z6/+wf0v/UDZ27xXr9V9GDBw/w8vIiIiKCbt26FXc4qpycHFJTU/H09MzzDqp4eWJiYggICODWrVvqu9glUUZGBsnJyTg7Oxf6JZK8YyyEEEIIIcRr4NSpU1y8eJHatWtz9+5ddW/XFx3lFULkJYmxEEIIIYQQr4nFixeTlJSEgYEBNWrUYOPGjZQpU6a4wxLitSeJsRBCCCGEEK+B6tWrs2PHjuIOQ7wm/P39i7TdlHhCFt8SQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJomxEEIIIYQQQogSTRJjIYQQQgghhBAlmiTGQgghhBBCCCFKNEmMhRBCCCGE+A/Z29v/69ssderUiYiIiH+tveHDhxMeHv6vtSfE60YSYyGEEEII8cZJSUlh1KhR1KlTBycnJ+rVq0dERAR//fVXcYf2r1i2bBljxox56deNi4vD09OzwP1xFUVh1apVhIWFYWFhgaWlJd7e3sydO5cHDx685GiFKDq94g5ACCGEEEK8XtLmJ7zU61kPrv
1C9a9cuUL79u1xcXFhwYIFODg4cO7cOaZOncrevXvZsmULpUuX/m+CfUmKK/5du3bRvHlzNBpNvuVDhw5l27ZthIeHs3z5cmxsbDh+/Dhz587FycmJ4ODglxvwC8jMzERfX7+4wxDFREaMhRBCCCHEG2XChAno6+uzevVqfHx8sLe3p2nTpqxdu5bU1FQiIyPVuvXr12fevHmMHDmSypUrU7duXb755hut9v744w8GDhxItWrVcHV15a233uLXX39Vy1euXEnDhg1xcnKiSZMmbNiwodD4EhMT6dy5M5UqVaJatWqMGTOG+/fvq+VZWVlMnDgRDw8PqlWrxrRp0xg2bJjWVOdnp1I/evSIqVOn4u3tjbOzM40aNWLNmjUAZGdnM2rUKBo0aEClSpVo0qQJy5cv/1vPdvfu3bRo0SLfss2bN7Nx40bmz59P7969qVu3Lk5OTgQFBbF3714CAgIAOHLkCM2bN6ds2bJYWFjg5+en9TwBNBoNy5cv5+2338bExAQ3Nzc2b96sVef06dO0adOGUqVKYW5uTpMmTUhKSlLLo6Ki8PDwwMjIiCpVqrBw4UK17PLly2g0GtavX4+/vz9GRkZ5fu+iZJHEWAghhBBCvDFu3bpFTEwMvXr1wtjYWKvM2tqaDh06sGXLFq2pwEuWLKFmzZrs3LmTXr16MW7cOC5evAjA/fv36dSpE3/++SdRUVHs3r2bgQMHkpOTA8D27duZNGkS/fr1Y8+ePfTo0YORI0dy8ODBfON7+PAhPXr0wNLSkh9//JElS5awf/9+JkyYoNZZsGABGzduZPbs2WzatIm7d++yc+fOQu972LBhbNq0iY8//piYmBg++eQTTExMAMjJycHW1pbFixcTHR3NiBEj+OSTT/Ikms9z7tw50tLSaNy4cb7l33//PZUqVaJly5Z5yjQaDRYWFgDcvXuXXr16sX//fg4dOoSbmxutW7fm7t27Wud89NFHdOnShRMnTtC6dWtCQ0PVqfApKSn4+vpiZGTE3r17iY+PJzw8nKysLODJVPMJEyYwbdo0EhMTmT59OhMnTmTlypVa1xg7dixDhw4lMTEx37hFySFTqYUQQgghxBsjOTkZRVFwc3PLt9zV1ZXbt29z8+ZNypYtC0DTpk0JCwsDYNCgQSxbtoy4uDhcXV35/vvvuXnzJj/++KM6fdnZ2Vltb/HixXTp0kU9v1KlSvz6668sXryYRo0a5bn+xo0bycjI4PPPP1cT16lTpxIWFsaECRMoV64cUVFRDBkyhLfeeguAadOmsXfv3gLvOSkpiS1btrBmzRp8fX0BcHR0VMv19fV5//331c8ODg4cPXqULVu20L59+0Kf59N27typjq7mJzk5mUqVKj23naZNm2p9XrJkCaVLlyY2Npa2bduqx8PCwujWrRsA06dP54svvuDw4cO0atWKBQsWYGFhwdq1a9Xpz5UrV1bP/fjjj5k1axYdOnQAnvzOzpw5w5IlS+jVq5dab/jw4WodUbJJYiyEEEIIIUqM3JHip9+RrVq1qvrfGo2GcuXKcfPmTeDJdN3q1asX+E7vxYsXCQ0N1TpWt25dvvzyy3zrX7hwAQ8PDzUpzq2fk5NDUlIShoaGXL9+ndq1a6vlurq61KxZUx2lftbp06fR1dXFx8enwPv+3//+x5o1a/j999/JyMggMzOTatWqFVg/P7t27aJnz54FliuKUuC7x09LS0sjIiKCvXv38ueff5Kdnc2DBw+4evWqVr2aNWuq/21qaoq5uTlpaWkAJCQk0KRJk3zfCb5+/Tq//fYbffr0oW/fvurxrKwsddQ6l7e393PjFSWDJMZCCCGEEOKN4eTkhEaj4fz587Rq1SpPeVJSEpaWlpQpU0Y9pqen/U9ijUajJqEFjY4+W/9phSWIhZU9fTy/NgvyvBg3b97MRx99xMSJE/H29sbU1JRFixZx7NixQs97WlpaGidPniQwMLDAOi4uLly4cOG5bYWFhXH9+nXmzp
2Lo6MjhoaG+Pj48PjxY616zya9T/9enp0m/7TcOsuWLaN+/fpaZbq6ulqfTU1NnxuvKBnkHWMhhBBCCPHGKFOmDL6+vqxcuZKHDx9qlaWlpbFx40batWtXpJFNAA8PD06fPs2tW7fyLXd1deXIkSNax44ePYqrq2u+9StXrsyZM2e0ti46cuQIOjo6uLi4UKpUKcqVK6eVtGZnZ3Pq1KlCY8zJyeHnn3/Ot/zw4cN4eXkRFhZG9erVcXZ25sqVKwW2l59du3bh5eWl9YXCs4KDg7l06VK+70MrisKdO3cA2L9/P0OHDqV169ZUq1YNQ0NDbty48ULx1KxZk/3795OZmZmnrHz58tjb23Pp0iVcXV21fp6eBi/E00p8YhwWFpZn2fgNGzZgZGTEzJkzmTx5MhqNJs9PlSpVtM65ePEi4eHhODg4YGhoiL29Pc2aNWPVqlXqIgC5oqOjadu2LeXKlcPIyIhKlSoREhLCvn37/uvbFUIIIYR4402dOpXHjx8TGhrKoUOHSElJITo6mm7dumFjY8PYsWOL3FZwcDDlypWjT58+HDlyhCtXrvDjjz9y9OhRAAYOHMj69ev53//+x6VLl1iyZAnbt29nwIAB+bbXoUMHDA0NGTZsGGfPnuXgwYNMnDiRjh07Uq5cOQB69+7N/Pnz2blzJxcvXiQiIoI7d+4UmMxXrFiRzp07M2rUKHbs2MHVq1eJi4tTF9dycnLixIkTxMTEkJSUxMyZMzl+/PiLPFJ27dpV4GrUudq3b0/79u0ZPHgwUVFRHD16lCtXrrB161YCAwOJjo4GnnyZ8PXXX5OYmMgvv/xCaGhooSPA+Rk8eDDp6el07dqVo0ePcuHCBb7++mvOnTsHwOTJk5kxYwaff/4558+f5+TJk0RFRTF79uwXuo4oOUp8Yvys5cuXExoayvz589VN06tVq8a1a9e0fg4cOKCec/jwYerUqUNiYiILFizg1KlTbN26lfDwcBYvXszp06fVugsXLqRZs2ZYWVmxbt06EhMT+frrr2nYsCEjRox46fcrhBBCCPGmcXFxYfv27Tg6OjJw4EAaNWrEmDFjaNiwIZs3b36hPYANDAxYs2YNVlZWvPPOOzRr1owFCxaoU3JbtWrFRx99xOLFi2natCnffPMNs2fPpmHDhvm2Z2xszKpVq7h9+zZt2rShX79+NG7cmGnTpql1Bg0aRHBwMMOGDSMoKAhTU1P8/PwwNDQsMM4ZM2bQpk0bxo8fj5+fH6NHj1ZHzN955x3eeustBg4cSLt27bh165bWAlTP8+DBAw4ePPjcxFij0bBgwQIiIiKIiYmhadOm1KxZk8mTJxMUFKSu+vzVV19x69YtPD09eeeddxg6dCjW1tZFjgfAysqKvXv3cu/ePfz8/PDy8mLZsmXq9Ot3332X5cuXs2LFCmrUqIGfnx8rVqyQEWNRII1S2AsLJUBYWBi3b9/mhx9+YObMmURERLBq1So6duwIPPm26YcffiAhISHf8xVFoVq1apiYmHD48GF0dPJ+15D7LsnVq1dxdXVl8ODB+X5bVdQFC3Klp6djYWGBxyIPdI11862zfkZWvsffJNtqPX/1QyGEEC9XiHPRR+TEqyvLXMPdAFMc7Rww0jMo1lhu6Nx9fqU3VE5ODn5+frRr104duHmZtm3bxsyZM4mJiSlS/ZycHFJTU/H09MzzTq8QL1tGRgbJyck4OzsX+j6+LL71/33wwQcsWLBAnepRVAkJCSQmJrJmzZp8k2L4v8UTvvvuOzIzMwv8g/a8pPjRo0c8evRI/Zyenl7kOIUQQgghxOvh999/JzY2lgYNGvD48WOioqL47bffePvtt4slHlNTU8aPH18s1xbiZZGp1DzZmD0yMpJNmzblmxSfPHkSMzMzrZ93330XgPPnzwPg7u6u1k9LS9Oqu3DhQrVuqVKlsLGxUet+9913WnVPnjxZYJwzZszAwsJC/alYseK/cv9CCCGEEOLVodFoWL9+PW
3atCE4OJizZ8+ydu3aAvdm/q/5+fk9dxq1EK87GTHmyap2N27cICIigrp162Jubq5V7u7uri5ekOvZOk+P9lpZWalTr/39/bWWnn92VLhly5YkJCSQkpKCv78/2dnZBcY5btw4Ro4cqX5OT0+X5FgIIYQQ4g1jb2/Ppk2bijsMIUoUSYx58sfnu+++IyAggFatWrFjxw6txNfAwKDAJfdzv7k7e/asuhG7rq6uWv/pffHc3Ny4c+cOqamp6qixmZkZrq6uefbPy4+hoWGhiy4IIYQQQgghhHhxMpX6/3NwcCA2Npa0tDRatGhR5Pd3PT09qVKlCp999pm6mXhBOnXqhL6+PpGRkf9GyEIIIYQQQggh/gUyYvyUChUqEBMTQ0BAAC1atFA3J8/KyiI1NVWrrkajoXz58mg0GqKiomjevDmNGjVi3LhxeHh4kJmZyb59+7h+/bq6Gp+DgwOzZs1i2LBh/PXXX4SFheHs7Mxff/3FN998AyAr9wkhhBBCCCHESyaJ8TPs7e2JjY0lICCA5s2b07BhQ06fPo2tra1WPUNDQzIyMgBo0KAB8fHxTJ8+nUGDBpGamoqpqSm1atVizpw5hIeHq+cNGTIEDw8PZs+eTadOnUhPT8fKygofHx927NhBjRo1Xur9CiGEEEIIIURJV+L3MX6dyT7GT8g+xkII8eqRfYzfDLKPsfg7ZB9j8Sop6j7G8o6xEEIIIYQQQogSTRJjIYQQQgghnrJu3To8PDyKO4z/VFxcHPb29ty5c6fQevXr12fZsmUvKSpRECcnJ+bOnVss1548ebK6+86bTBJjIYQQQgjxxklJSWHUqFHUqVMHJycn6tWrR0REBH/99ZdWvVcp8WvQoAHR0dH5ltWvX7/A/Y0DAgKwt7dn3bp1f/vaBX0ZsG3bNnr06PG32y2qVyX5mjx5Ml27ds237P79+4wdOxYXFxeMjIwoV64c/v7+bN269SVH+XK9//777Nmzp7jD+M/J4ltCCCGEEOKFTF8+66Veb/y7o16o/pUrV2jfvj0uLi4sWLAABwcHzp07x9SpU9m7dy9btmyhdOnS/1G0BcvMzERfXz/fsjNnznDr1i0aNmxY4Pl2dnasW7eOoKAg9Vh8fDxpaWmYmJj86/ECWFlZ/Sftvqo2b97M6NGj8y0bMGAAhw8fZv78+VStWpWbN28SFxfHzZs3X3KUeRXWt/4pMzMzzMzM/pO2XyUyYiyEEEIIId4oEyZMQF9fn9WrV+Pj44O9vT1NmzZl7dq1pKamEhkZCUCnTp34/fffmTx5Mvb29tjb22u1ExMTg5+fH25uboSGhvLnn39qla9btw4/Pz9cXFzw9fVlxYoVatlvv/2Gvb09mzdvplOnTri4uLBx48YCY965cyd+fn4YGhoWWKdDhw4cOnSIlJQUrRg6dOiAnt7/jXflXvvUqVPqsTt37mBvb09cXFyeduPi4hg5ciTp6enqc5g168mXH8+OqM+aNYu6devi7OxMnTp1mDhxIgBz5syhWbNmedr28vIiIiICePI869Wrh6mpKZaWljRq1IgrV66wYsUKPvroI44fP45Go0Gj0ajP8s6dO/Tr1w9ra2tKlSpF06ZNOX78uNp+7kjzV199hYODA2ZmZgwcOJDs7GxmzpyJjY0N1tbWTJs2rcDn+vRzO3XqFG+99Va+5Vu2bGH8+PG0bt0aJycnvLy8GDJkCL169QJgypQp+e4w8/QzCAsLIzg4mM8++wxbW1usrKwYNGgQmZmZav20tDTatWuHsbExzs7OrFq1Kk+bGo2GxYsXExQUhKmpKVOnTgVg0aJFVKpUCQMDA9zd3fn666/znLdkyRLatm2LiYkJHh4e/Pzzz1y8eBF/f39MTU3x8fEhKSkpzzN+2ldffUW1atUwNDTE1taWwYMHP/f5vuokMRZCCCGEEG+MW7duERMTQ69evTA2NtYqs7a2pkOHDmzZsgVFUVi2bBm2tra8//77HDt2jGPHjql1Hz
58yOLFi5k3bx4bN24kJSWFjz/+WC1ftWoVkZGRjB07lpiYGD744AM+/fRT1q9fr3XN6dOnEx4eribZBdm9ezctW7Ys9N7Kli2Ln58f3377rRrj5s2bCQkJKfLzyY+3tzcfffQR5ubm6nMYMGBAnnpbt25l2bJlREZGcuDAAb788kuqVKkCQEhICOfPnychIUGtf+LECY4dO0ZYWBhZWVkEBwfj5+fHiRMn+Pnnn+nXrx8ajYaQkBBGjRpFtWrVuHbtGteuXSMkJARFUWjTpg2pqals27aN+Ph46tSpQ7NmzbSmxCclJbF9+3Z27NjBmjVr+Oqrr2jTpg2///47sbGxREZG8uGHH3Lo0KFCn8PmzZvx9fXF0tIy33IbGxu2bdvG3bv5r5AeHh7OmTNnOHLkSL7PIFd0dDRJSUlER0ezcuVKVqxYofWlSlhYGJcvX2bv3r1s2LCBhQsXkpaWlud6kyZNIigoiJMnTxIeHs7333/PsGHDGDVqFKdOnaJ///707t07z/T8jz/+mJ49e5KQkECVKlXo3r07/fv3Z9y4cRw9ehSg0ER30aJFDBo0iH79+nHy5Ek2b96Mq6trgfVfFzKVWgghhBBCvDGSk5NRFAU3N7d8y11dXbl9+zY3b96kbNmy6OrqYmZmhrW1tVa9zMxMPvnkE5ycnIAnycrTix/NnTuXiIgIWrduDYCDgwPnz5/nm2++oUuXLmq9d999V61TkGvXrnHmzBmaNm363Pvr2rUrU6ZMYdiwYWzduhVHR0eqV6/+3PMKY2BggLm5ORqNJs9zeFpKSgrlypWjSZMm6OvrY29vj6enJ/Bkmre/vz/r1q2jZs2aAKxYsUIdUf/rr7+4c+cObdu2pVKlJ1ttPv1Os5mZGXp6etjY2KjH9u7dy8mTJ0lLS1NH0j/77DN++OEHNmzYQL9+/YAn20N99dVXmJubU7VqVQICAjh37hzbtm1DR0cHd3d3IiMjiYmJoUGDBgXe36ZNm7SmqT9r6dKlhIaGYmVlRa1atWjcuDGdOnWiUaNGAFSoUIGWLVsSFRVF3bp1AYiKilKfQa7SpUszf/58dHV1qVKlCm3atGHPnj307duX8+fPs337dg4dOkT9+vUB+PLLL/N9/7t79+6Eh4drfQ4LC+O9994DYOTIkRw6dIjPPvuMgIAAtV7v3r3VPjp27Fh8fHyYOHGi+sXMsGHD6N27d4HPYerUqYwaNYphw4apx3Lv93UmI8ZCCCGEEKLEUBQFeDKltDDGxsZqUgxQvnx5bty4AcDNmzf5448/GDVqFG5uburPvHnzuHLlilY7tWrVem5Mu3btwtvbu0jvPTdr1oz79+9z6NAh1q1bV+BCUf+Ftm3bkpGRgY+PD6NHj2b79u1kZWWp5d27d2fTpk1kZGSQlZXFmjVr1MStTJkyhIWF0bJlS9q1a8fnn3/OtWvXCr1efHw89+7dw8rKSn3P1czMjOTkZK2pvk5OTpibm6ufy5cvT9WqVdHR0dE6lt+oa6709HRiY2Np3759gXV8fX25dOkSe/bsoWPHjpw+fZomTZpozSTo27cva9asISMjg8zMTFatWqWVvAJUq1ZNa39nW1tbNbbExET09PTw9vZWy6tUqZLvKPbTdXLPzU3SczVq1IjExEStY7lfXMCT5wJoTQEvX748GRkZpKen57lmWloaf/zxR77T5l93MmIshBBCCCHeGE5OTmg0Gs6fP0+rVq3ylCclJWFpaUmZMmUKbefZhYw0Go2aVOfk5ADw6aefqiOmuZ5OeIA807nzU5Rp1Ln09PTo2LEjs2bN4tixYyxfvjxPnacTwlxPJ7B/l729Pfv27WP//v3s37+f8ePHs2jRIr777jv09fVp3rw5BgYG7NixgwcPHvDo0SM6duyonh8VFcXQoUPZsWMH69at48MPP2T37t0FjuLm5ORga2tLTExMnrKnE8X8flf5Hcv9veVn+/bteHh44OjoWOgz0NfXp0mTJjRp0oQPPv
iAqVOnMmXKFMaOHYuBgQHt2rXD0NCQ77//HkNDwzzPoKB4c2Mr6hc3AKampnmOPXueoih5jj19/dyy/I7l97yK0p9fVzJiLIQQQggh3hhlypTB19eXlStX8vDhQ62ytLQ0Nm7cSLt27bQSguzs7Be6Rrly5bCxseHKlSs4Oztr/Tg4OLxQW/fv3ycuLo4WLVoU+ZyuXbvy888/06JFi3xHEnOT/qcXCzt9+nShbRoYGBTpORgbG9OiRQs+/vhjvv32W+Lj4zl79izwJGnv3Lkz69evZ8uWLYSEhORZLdvT05Nx48YRFxdH9erVWb16dYHXr1OnDqmpqejp6eHq6qr1U7Zs2efG+iI2bdpU6GhxQapWrUpWVhYZGRnAk2fQq1cvoqKiiIqKomvXri+0YriHhwdZWVnqu74A586d4/bt20U698CBA1rH4uLi/tU9uc3NzXFycnojt2+SEWMhhBBCCPFGmTp1KkFBQYSGhjJmzBgqVqzI+fPnmTp1KjY2NowdO1atW7FiRX755ReCgoIwNDR87khyrlGjRjFx4kTMzc0JCAjg8ePHnDhxgtu3b9O/f/8ixxodHY2zs/NzRyqf5ubmxsmTJwscvTM2NqZOnTosWLCAihUr8tdffzFz5sxC26xQoQL3799n//79VKtWDWNj4zztr1u3jpycHDw9PTE2Nua7777DyMhIazXvbt26sWTJEgB19W948u730qVLad++PXZ2dpw7d47z58/Ts2dP4MlIf3JyMgkJCVSoUAFzc3MCAwPx8fEhODiYyMhI3N3d+eOPP9i2bRvBwcF5phL/XVlZWWzfvp2ffvqp0Hr+/v5069YNb29vrKysOHPmDOPHjycgIIBSpUqp9d599101GT148OALxeLu7k6rVq3o27cvS5cuRU9Pj+HDhxdppHb06NF06dJFXaBsy5YtbNy48bn39aImT57MgAEDsLa25q233uLu3bscPHiQIUOG/KvXedlkxFgIIYQQQrxRXFxc2L59O46OjgwcOJBGjRoxZswYGjZsyObNm7Xe5X3//ff57bffaNSoUb5b7RSke/fufPbZZ6xfv57AwEA6derE+vXrX3jEeOfOnS80WpyrTJkyhSZLs2fPJjMzk7feeouIiAjGjBlTaHt169blnXfeYeDAgdSoUYOFCxfmqWNhYcGqVasIDg4mMDCQAwcOsGLFCq0vE1xcXPDy8sLBwUFdPArAxMSEs2fP0rFjRypXrky/fv0YPHiw+iVCx44dadWqFQEBAZQrV441a9ag0WjYtm0bvr6+hIeHU7lyZbp27crly5fVd2P/DbGxsZiZmeHl5VVovZYtW7Jy5UpatGiBh4cHQ4YMoWXLlnlWIndzc6Nhw4a4u7trPYOiioqKomLFivj5+dGhQwd1u6rnCQ4O5vPPP+fTTz+lWrVqLFmyhKioKPz9/V84hsL06tWLuXPnsnDhQqpVq0bbtm25cOHCv3qN4qBRcieyi9dOeno6FhYWeCzyQNdYN98662f88/dJXnXbalUq7hCEEEI8I8R57PMriVdelrmGuwGmONo5YKRnUKyx3NDJf4uc11l2djY1a9bkm2++yfOu8utKURR8fX1p3749n332WZ53rl9FQ4cOJSsrK98vA/4ORVGoUqUK/fv3Z+TIkf9Km+Lvy8jIIDk5GWdnZ4yMjAqsJ1OphRBCCCGEKAa3bt2ib9++1K5du7hD+VfcuHGDDRs2kJqaSrt27Yo7nCKrXr06Pj4+/0pbaWlpfP3116SkpBS65ZF49UhiLIQQQgghRDEoW7Ysw4cPL+4w/jW1atWiTJkyfPLJJ1rv3L7qcvdD/jeUL1+esmXLsnTp0iJtvyVeHZIYCyGEEEIIIf6xlJQU4Mk2P6mpqcUcTfGQt1RfX7L4lhBCCCGEEEKIEk0SYyGEEEIIIYQQJZokxkIIIYQQQgghSjRJjIUQQgghhBBClGiSGAshhBBCCCGEKNEkMRZCCCGEEEIIUaJJYiyEEE
IIIUQJYW9vz44dO176dTUaDT/88MNLv64QRSX7GAshhBBCiDfKjRs3mDlzJtHR0dy4cQMLCwuqVq3KyJEj8fb2Lu7w/nXDhw/n22+/BUBPTw9LS0s8PDwIDg6mS5cu6Oj831jYsWPHsLCwKK5QhXhlSWIshBBCCCFeyP7ztV/q9ZpUTnih+n379iUzM5O5c+fi6OjI9evXOXDgALdv3/5P4vu3ZGZmoq+v/7fODQgIYPbs2WRnZ3Pjxg2io6OJiIjgxx9/JCoqCj29J//st7a2/jdD1vJP4n+ex48fY2Bg8J+0LQTIVGohhBBCCPEGuXPnDocPH2bChAk0atSIChUq4OnpyZAhQwgMDFTrpaenM2bMGGrWrIm7uzudO3fm9OnTavmsWbNo3rw5GzZsoH79+lSpUoWBAwdy7949tc7WrVtp1qwZlSpVolq1aoSEhPDgwQMAcnJymDNnDl5eXjg7O9O8eXOio6PVc3/77Tfs7e3ZvHkznTp1wsXFhVWrVuHu7s7WrVu17mnXrl24urpqXftZBgYGWFtbY2trS40aNRg6dChfffUVe/fuZf369Wq9p6dSP378mAkTJuDp6YmLiwv169fniy++UOumpKTQu3dv3NzccHd3p3///ly/fj3PM1q7di0+Pj44OzujKArJycn069cPU1NTqlatyu7du/PEm5KSQkhICKVLl8bKyoqgoCAuX76sloeFhREcHMyMGTOws7OjcuXKACxcuBA3NzeMjIwoX748nTp1KvCZCPEiJDEWQgghhBBvDFNTU0xNTdmxYwePHj3Kt46iKPTs2ZO0tDS+/vprtm/fTo0aNQgJCeHWrVtqvStXrrBz505WrlzJypUrOXToEPPnzwfgzz//ZNCgQYSEhBATE8OGDRt46623UBQFgOXLl7NkyRIiIiLYvXs3/v7+9O7dm0uXLmnFMn36dMLDw4mJiaFVq1YEBQWxbt06rTrr16+nTZs2mJmZvdCzaNy4MVWrVmX79u35ln/11Vfs2rWLxYsXs2/fPr744gsqVqyoPqPw8HBu377Nd999x5o1a7hy5QoDBw7UauPy5cts2bKFZcuWsWvXLnJycujbty86OjocPHiQxYsXM3bsWK1zHjx4QEBAAGZmZuzbt48DBw5gZmZGq1atePz4sVpvz549JCYmsnv3brZu3crRo0cZOnQoU6ZM4dy5c+zYsQNfX98XeiZCFESmUgshhBBCiDeGnp4ec+bMYcyYMXzzzTdUr16dBg0aEBQURNWqVQE4ePAgZ8+e5fjx4xgaGgIQERHBzp07+fHHH+nRowfwf6O+uQlpx44dOXDgAABpaWlkZWXRunVrKlSoAICHh4cax5IlS3jvvfcICgoCYMKECcTFxbF8+XKmT5+u1nv33Xdp3bq1+rlbt24EBQWRmpqKjY0Nf/31Fz/99BNr1qz5W8/D1dWVxMTEfMtSUlJwdnamXr16aDQa9T4A9u/fT2JiIj///DP29vYAzJs3j4CAABISEqhduzbwZPr0vHnzsLKyAiA2NpaLFy+yadMmateuja6uLtOnT+ett95S2167di06OjosX74cjUYDQFRUFJaWlsTExNCiRQvgyZccy5cvV6dQb9y4EVNTU9q2bYu5uTmOjo54enr+recixLNkxFgIIYQQQrxR2rRpQ3x8PFFRUfj7+/Pzzz/TqlUrdST25MmT3L9/n+rVq+Pm5qb+XL16lStXrqjtVKxYUWuU1tramps3bwJQtWpVGjduTLNmzejXrx+rVq1S32G+e/cuqamp1K1bVysub29vLly4oHWsVq1aWp89PT2pXLkyGzZsAGDDhg3Y29vToEGDv/UsFEVRk89ndenShdOnT9OkSRMmTpxIbGysWnbhwgXs7OzUpBigcuXKWFhYaN2Dvb29mhTnnmdvb0/58uXVYz4+PlrXjY+P5+LFi5ibm2NmZoaZmRllypQhIyODpKQktV6NGjW03itu3rw5jo6OuLi48M4777Bq1S
p16roQ/5SMGAshhBBCiDeOkZERvr6++Pr6MmLECN5//31mzZpFSEgIOTk5WFtbq8nn055esTl3wapcGo2GnJwcAHR1dVm7di1Hjx4lNjaWqKgoIiMj2bp1K6VLl1brPy2/JNXY2DhPDN27dycqKorBgwezfv16unTpUmBy+zwXL15Up0c/q0aNGhw6dIi9e/dy4MABBgwYQOPGjVm2bFmBCfWzx01MTPKUP+vZdnJycvDy8mLVqlV56pYrV079b1NTU60yc3Nzfv31V2JiYti1axcRERFMnjyZI0eOYGlpme89ClFUkhi/AQ51P0SpUqXyL+z1cmMpDh7PryKEEEKIvyEjI4OHyckY2JhiYGT0fwXnX24cBhXMscP8H7Xh7e3Nrl27sLOzw9/fn8jISOzt7XFycsq3vrm5Ofr6+tjZ2anHLCws0NXV1Tpmb29PUFAQn376KY6OjsTFxTFy5Ejs7Ow4e/YsHTp0UOueOHGCevXqYWdnp75La21trdUewKBBg5g2bRobNmzg3LlzatwFMTEx4fHjx3nq7N27l8TEREaPHq1VVqZMGa3P7u7uDBw4kJ07d9KqVSuMjIzw8fFhypQpZGdnq4n1mTNnSE9Pp2HDhtjZ2eX7jHx8fPj444+1Fun6+eefteKqU6cO69atw9rauuB/wxZAT0+PwMBAAgMDmTRpEpaWluzdu1frOQvxd0hiLIQQQggh3hg3b96kc+fOhIeHU7NmTczNzTl69CgzZ85U3/cNDAzEx8eH4OBgIiMjcXd3548//mDbtm0EBwcXaa/jX375hT179tCiRQusra355ZdfuH79uvqe8ejRo5k0aRKVKlWidu3aREVFkZCQkO8o6bNKly5Nhw4dGD16NC1atNB697cgjx49IjU1lezsbP7880927NjBjBkzaNu2LT179sz3nDlz5mBra0vt2rXR0dHh22+/xcbGBktLSwIDA6lZsyahoaHMnTuXrKws3nvvPfz8/Ap9PoGBgbi7uzNp0iTKly/P/fv3mTBhglad0NBQPv30U4KCgpgyZQoVKlTg6tWrbNy4kdGjRxd4v1u3buXSpUv4+vpSunRptm3bRk5ODu7u7s99PkI8jyTGQgghhBDijWFmZkb9+vWZM2cOSUlJZGZmUrFiRfr27cv48eOBJ1N7t23bxoQJEwgPD+f69evY2Njg6+ur9W5sYUqVKsW+ffuYO3cu6enpODo6MmvWLHWRqaFDh5Kens6oUaNIS0ujatWqbN68GTc3tyK136dPH1avXk14eHiR6u/YsQNbW1v09PQoXbo0tWrVYt68efTq1QsdnfyXFTIzMyMyMpILFy6gq6tL3bp12bZtm1r/hx9+YMiQIfj6+qKjo0OrVq20tnPKj46ODhs2bKB79+74+Pjg5OTEvHnzaNWqlVrHxMSEffv2MXbsWDp06MDdu3ext7enWbNmhY4gW1pasnHjRiZPnkxGRgZubm6sWbOGatWqFekZCVEYjZLfiwDitZCeno6FhQV37tx54WkoQgghhBDPk5GRQXJyMs7Ozhg9PZVa/OdWrVrFsGHD+OOPP7QWoHodZGdnc+zYMTw9PdHV1S3ucEQJV9S/YzJiLIQQQgghxCviwYMHJCcnM2PGDPr37//aJcVCvK5kuyYhhBBCCCFeETNnzqR27dqUL1+ecePGFXc4QpQYkhgLIYQQQgjxipg8eTKZmZns2bNHaw9lIcR/SxJjIYQQQgghhBAlmiTGQgghhBBCCCFKNEmMhRBCCCFEoWQTEyHE66qof78kMRZCCCGEEPnS19cHnqyULIQQr6Pcv1+5f88KIts1CSGEEEKIfOnq6mJpaUlaWhoAJiYmaDSaYo5KvOqys7OBJ/vHyj7GorgoisKDBw9IS0vD0tLyuX1REmMhhBBCCFEgGxsbADU5FuJ5cnJyuHHjBpcvX0ZHRyaoiuJlaWmp/h0rjEaRl0ZeW+np6VhYWHDnzh1KlSpV3OEIIYQQ4g2WnZ1NZm
ZmcYchXgP37t3D29ubo0ePypZToljp6+sXedaCjBgLIYQQQojn0tXVlWmxokgeP37MlStXMDAwwMjIqLjDEaJIZG6DEEIIIYQQQogSTRJjIYQQQgghhBAlmiTGQgghhBBCCCFKNHnH+DWWu25aenp6MUcihBBCCCHEE7n/NpU1fsXrRBLj19jNmzcBqFixYjFHIoQQQgghhLabN29iYWFR3GEIUSSSGL/GypQpA8DVq1flj47IIz09nYoVK/Lbb7/Jdl4iD+kf4nmkj4jCSP8Qhblz5w4ODg7qv1WFeB1IYvway90w3cLCQv6nJApUqlQp6R+iQNI/xPNIHxGFkf4hCpP7b1UhXgfSW4UQQgghhBBClGiSGAshhBBCCCGEKNEkMX6NGRoaMmnSJAwNDYs7FPEKkv4hCiP9QzyP9BFRGOkfojDSP8TrSKPIOupCCCGEEEIIIUowGTEWQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJonxa2rhwoU4OztjZGSEl5cX+/fvL+6QRDGYMWMGdevWxdzcHGtra4KDgzl37pxWHUVRmDx5MnZ2dhgbG+Pv78/p06eLKWJRnGbMmIFGo2H48OHqMekfIiUlhR49emBlZYWJiQm1a9cmPj5eLZc+UnJlZWXx4Ycf4uzsjLGxMS4uLkyZMoWcnBy1jvSPkmPfvn20a9cOOzs7NBoNP/zwg1Z5UfrCo0ePGDJkCGXLlsXU1JT27dvz+++/v8S7EKJgkhi/htatW8fw4cOZMGECx44do0mTJrz11ltcvXq1uEMTL1lsbCyDBg3i0KFD7N69m6ysLFq0aMH9+/fVOjNnzmT27NnMnz+fI0eOYGNjQ/Pmzbl7924xRi5etiNHjrB06VJq1qypdVz6R8l269YtGjVqhL6+Ptu3b+fMmTPMmjULS0tLtY70kZIrMjKSxYsXM3/+fBITE5k5cyaffvopX3zxhVpH+kfJcf/+fWrVqsX8+fPzLS9KXxg+fDjff/89a9eu5cCBA9y7d4+2bduSnZ39sm5DiIIp4rVTr149ZcCAAVrHqlSponzwwQfFFJF4VaSlpSmAEhsbqyiKouTk5Cg2NjbKJ598otbJyMhQLCwslMWLFxdXmOIlu3v3ruLm5qbs3r1b8fPzU4YNG6YoivQPoShjx45VGjduXGC59JGSrU2bNkp4eLjWsQ4dOig9evRQFEX6R0kGKN9//736uSh94fbt24q+vr6ydu1atU5KSoqio6Oj7Nix46XFLkRBZMT4NfP48WPi4+Np0aKF1vEWLVoQFxdXTFGJV8WdO3cAKFOmDADJycmkpqZq9RdDQ0P8/Pykv5QggwYNok2bNgQGBmodl/4hNm/ejLe3N507d8ba2hpPT0+WLVumlksfKdkaN27Mnj17OH/+PADHjx/nwIEDtG7dGpD+If5PUfpCfHw8mZmZWnXs7OyoXr269BfxStAr7gDEi7lx4wbZ2dmUL19e63j58uVJTU0tpqjEq0BRFEaOHEnjxo2pXr06gNon8usvV65ceekxipdv7dq1/Prrrxw5ciRPmfQPcenSJRYtWsTIkSMZP348hw8fZujQoRgaGtKzZ0/pIyXc2LFjuXPnDlWqVEFXV5fs7GymTZtGt27dAPkbIv5PUfpCamoqBgYGlC5dOk8d+TeseBVIYvya0mg0Wp8VRclzTJQsgwcP5sSJExw4cCBPmfSXkum3335j2LBh7Nq1CyMjowLrSf8ouXJycvD29mb69OkAeHp6cvr0aRYtWkTPnj3VetJHSqZ169bxzTffsHr1aqpVq0ZCQgLDhw/Hzs6OXr16qfWkf4hcf6cvSH8RrwqZSv2aKVu2LLq6unm+WUtLS8vzLZ0oOYYMGcLmzZuJjo6mQoUK6nEbGxsA6S8lVHx8PGlpaXh5eaGnp4eenh6xsbHMmzcPPT09tQ9I/yi5bG1tqVq1qtYxDw8PdTFH+RtSso0ePZoPPviArl27UqNGDd
555x1GjBjBjBkzAOkf4v8UpS/Y2Njw+PFjbt26VWAdIYqTJMavGQMDA7y8vNi9e7fW8d27d9OwYcNiikoUF0VRGDx4MBs3bmTv3r04OztrlTs7O2NjY6PVXx4/fkxsbKz0lxKgWbNmnDx5koSEBPXH29ub0NBQEhIScHFxkf5RwjVq1CjPFm/nz5/H0dERkL8hJd2DBw/Q0dH+p6Kurq66XZP0D5GrKH3By8sLfX19rTrXrl3j1KlT0l/EK0GmUr+GRo4cyTvvvIO3tzc+Pj4sXbqUq1evMmDAgOIOTbxkgwYNYvXq1WzatAlzc3P1m1oLCwuMjY3VPWunT5+Om5sbbm5uTJ8+HRMTE7p3717M0Yv/mrm5ufq+eS5TU1OsrKzU49I/SrYRI0bQsGFDpk+fTpcuXTh8+DBLly5l6dKlAPI3pIRr164d06ZNw8HBgWrVqnHs2DFmz55NeHg4IP2jpLl37x4XL15UPycnJ5OQkECZMmVwcHB4bl+wsLCgT58+jBo1CisrK8qUKcP7779PjRo18iwOKUSxKLb1sMU/smDBAsXR0VExMDBQ6tSpo27PI0oWIN+fqKgotU5OTo4yadIkxcbGRjE0NFR8fX2VkydPFl/Qolg9vV2Tokj/EIqyZcsWpXr16oqhoaFSpUoVZenSpVrl0kdKrvT0dGXYsGGKg4ODYmRkpLi4uCgTJkxQHj16pNaR/lFyREdH5/tvjl69eimKUrS+8PDhQ2Xw4MFKmTJlFGNjY6Vt27bK1atXi+FuhMhLoyiKUkw5uRBCCCGEEEIIUezkHWMhhBBCCCGEECWaJMZCCCGEEEIIIUo0SYyFEEIIIYQQQpRokhgLIYQQQgghhCjRJDEWQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJomxEEIIIYQQQogSTRJjIYQQ4h9aunQpFStWREdHh7lz577Ua69YsQJLS8uXek0hhBDiTSOJsRBCvIHCwsLQaDRoNBr09fVxcXHh/fff5/79+8Ud2nM5OTm99OTyn0hPT2fw4MGMHTuWlJQU+vXrp1X+559/oq+vzzfffJPv+f3796dmzZp/+/ohISGcP3/+b5+fn8uXL6PRaEhISPhX2xVCCCFeVZIYCyHEG6pVq1Zcu3aNS5cuMXXqVBYuXMj777//t9pSFIWsrKx/OcI3w9WrV8nMzKRNmzbY2tpiYmKiVV6+fHnatGlDVFRUnnMfPnzI2rVr6dOnz9+6dmZmJsbGxlhbW/+t84UQQgjxhCTGQgjxhjI0NMTGxoaKFSvSvXt3QkND+eGHH4Anie7MmTNxcXHB2NiYWrVqsWHDBvXcmJgYNBoNO3fuxNvbG0NDQ/bv309OTg6RkZG4urpiaGiIg4MD06ZNU89LSUkhJCSE0qVLY2VlRVBQEJcvX1bLw8LCCA4O5rPPPsPW1hYrKysGDRpEZmYmAP7+/ly5coURI0aoI94AN2/epFu3blSoUAETExNq1KjBmjVrtO737t27hIaGYmpqiq2tLXPmzMHf35/hw4erdR4/fsyYMWOwt7fH1NSU+vXrExMTU+hzvHr1KkFBQZiZmVGqVCm6dOnCn3/+CTyZxlyjRg0AXFxc0Gg0Wvebq0+fPkRHR+cp27BhAxkZGfTo0YMdO3bQuHFjLC0tsbKyom3btiQlJal1c0dx169fj7+/P0ZGRnzzzTd5plInJSURFBRE+fLlMTMzo27duvz0009a13VycmL69OmEh4djbm6Og4MDS5cuVcudnZ0B8PT0RKPR4O/vDzzpF/Xq1cPU1BRLS0saNWrElStXCn1+QgghxOtAEmMhhCghjI2N1QT0ww8/JCoqikWLFnH69GlGjBhBjx49iI2N1TpnzJgxzJgxg8TERGrWrMm4ceOIjIxk4sSJnDlzhtWrV1O+fHkAHjx4QEBAAGZmZuzbt48DBw5gZmZGq1atePz4sdpmdHQ0SUlJREdHs3LlSlasWMGKFSsA2LhxIx
UqVGDKlClcu3aNa9euAZCRkYGXlxdbt27l1KlT9OvXj3feeYdffvlFbXfkyJEcPHiQzZs3s3v3bvbv38+vv/6qdT+9e/fm4MGDrF27lhMnTtC5c2datWrFhQsX8n1miqIQHBzMX3/9RWxsLLt37yYpKYmQkBDgyTTm3KTz8OHDXLt2jYoVK+Zpp3Xr1tjY2Kj3meurr74iODgYKysr7t+/z8iRIzly5Ah79uxBR0eHt99+m5ycHK1zxo4dy9ChQ0lMTKRly5Z5rnXv3j1at27NTz/9xLFjx2jZsiXt2rXj6tWrWvVmzZqFt7c3x44d47333mPgwIGcPXtWvReAn376iWvXrrFx40aysrIIDg7Gz8+PEydO8PPPP9OvXz/1ywshhBDitaYIIYR44/Tq1UsJCgpSP//yyy+KlZWV0qVLF+XevXuKkZGREhcXp3VOnz59lG7duimKoijR0dEKoPzwww9qeXp6umJoaKgsW7Ys32t++eWXiru7u5KTk6Mee/TokWJsbKzs3LlTjcvR0VHJyspS63Tu3FkJCQlRPzs6Oipz5sx57j22bt1aGTVqlBqbvr6+8u2336rlt2/fVkxMTJRhw4YpiqIoFy9eVDQajZKSkqLVTrNmzZRx48ble41du3Ypurq6ytWrV9Vjp0+fVgDl8OHDiqIoyrFjxxRASU5OLjTesWPHKo6OjurzuXTpkqLRaNRn86y0tDQFUE6ePKkoiqIkJycrgDJ37lytelFRUYqFhUWh165ataryxRdfqJ8dHR2VHj16qJ9zcnIUa2trZdGiRVrXOnbsmFrn5s2bCqDExMQUei0hhBDidSQjxkII8YbaunUrZmZmGBkZ4ePjg6+vL1988QVnzpwhIyOD5s2bY2Zmpv7873//05q6C+Dt7a3+d2JiIo8ePaJZs2b5Xi8+Pp6LFy9ibm6utlmmTBkyMjK02q1WrRq6urrqZ1tbW9LS0gq9l+zsbKZNm0bNmjWxsrLCzMyMXbt2qaOgly5dIjMzk3r16qnnWFhY4O7urn7+9ddfURSFypUra913bGxsnvt++p4rVqyoNQpctWpVLC0tSUxMLDTmZ/Xp04crV66wd+9e4MlocYUKFQgMDASeTIHu3r07Li4ulCpVSp3O/OxI79O/k/zcv3+fMWPGqHGamZlx9uzZPO08veCXRqPBxsam0N9DmTJlCAsLU0egP//8c3VEXwghhHjd6RV3AEIIIf4bAQEBLFq0CH19fezs7NDX1wcgOTkZgB9//BF7e3utcwwNDbU+m5qaqv9tbGxc6PVycnLw8vJi1apVecrKlSun/nduHLk0Gk2e6cLPmjVrFnPmzGHu3LnUqFEDU1NThg8frk7RVhRFbetpucdz49PV1SU+Pl4rMQcwMzPL97qKouQ7Vbig44Vxc3OjSZMmREVFERAQwMqVK+nduzc6Ok++o27Xrh0VK1Zk2bJl2NnZkZOTQ/Xq1bWmoYP27yQ/o0ePZufOnXz22We4urpibGxMp06d8rTzd34PUVFRDB06lB07drBu3To+/PBDdu/eTYMGDYr6GIQQQohXkiTGQgjxhjI1NcXV1TXP8apVq2JoaMjVq1fx8/Mrcntubm4YGxuzZ88e3n333TzlderUYd26dVhbW1OqVKm/HbeBgQHZ2dlax/bv309QUBA9evQAniS5Fy5cwMPDA4BKlSqhr6/P4cOH1dHd9PR0Lly4oN6jp6cn2dnZpKWl0aRJkyLFUrVqVa5evcpvv/2mtnvmzBnu3LmjXvtF9OnTh4EDBxIUFMTvv/9O7969gSeLiyUmJrJkyRI1tgMHDrxw+/DkWYWFhfH2228DT945zm9BsMIYGBgA5Pk9wJPn6Onpybhx4/Dx8WH16tWSGAshhHjtyVRqIYQoYczNzXn//fcZMWIEK1euJCkpiWPHjrFgwQJWrlxZ4HlGRkaMHTuWMWPGqNOuDx06xJdffglAaGgoZcuWJSgoiP3795OcnExsbCzDhg3j99
9/L3J8Tk5O7Nu3j5SUFG7cuAGAq6sru3fvJi4ujsTERPr3709qaqrWPfXq1YvRo0cTHR3N6dOnCQ8PR0dHRx3ZrVy5MqGhofTs2ZONGzeSnJzMkSNHiIyMZNu2bfnGEhgYSM2aNQkNDeXXX3/l8OHD9OzZEz8/v+dOac5P586d0dfXp3///jRr1gwnJycAdRXvpUuXcvHiRfbu3cvIkSNfuH148qw2btxIQkICx48fp3v37s8dCX6WtbU1xsbG7Nixgz///JM7d+6QnJzMuHHj+Pnnn7ly5Qq7du3i/Pnzf+sLAiGEEOJVI4mxEEKUQB9//DERERHMmDEDDw8PWrZsyZYtW9T3WgsyceJERo0aRUREBB4eHoSEhKjvpZqYmLBv3z4cHBzo0KEDHh4ehIeH8/DhwxcaQZ4yZQqXL1+mUqVK6hTsiRMnUqdOHVq2bIm/vz82NjYEBwdrnTd79mx8fHxo27YtgYGBNGrUCA8PD4yMjNQ6UVFR9OzZk1GjRuHu7k779u355Zdf8l1JGp5ML/7hhx8oXbo0vr6+BAYG4uLiwrp164p8P08zMTGha9eu3Lp1i/DwcPW4jo4Oa9euJT4+nurVqzNixAg+/fTTv3WNOXPmULp0aRo2bEi7du1o2bIlderUeaE29PT0mDdvHkuWLMHOzo6goCBMTEw4e/YsHTt2pHLlyvTr14/BgwfTv3//vxWnEEII8SrRKE+/gCWEEEK8Ie7fv4+9vT2zZs2iT58+xR2OEEIIIV5h8o6xEEKIN8KxY8c4e/Ys9erV486dO0yZMgWAoKCgYo5MCCGEEK86SYyFEEK8MT777DPOnTuHgYEBXl5e7N+/n7JlyxZ3WEIIIYR4xclUaiGEEEIIIYQQJZosviWEEEIIIYQQokSTxFgIIYQQQgghRIkmibEQQgghhBBCiBJNEmMhhBBCCCGEECWaJMZCCCGEEEIIIUo0SYyFEEIIIYQQQpRokhgLIYQQQgghhCjRJDEWQgghhBBCCFGiSWIshBBCCCGEEKJE+380kQSRJegeQgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# Disease categories\n", + "categories = [\n", + " \"Benign\",\n", + " \"Cardiovascular & Hematological\",\n", + " \"Immunological & Hematopoietic\",\n", + " \"Metabolic / Mitochondrial\",\n", + " \"Musculoskeletal & Connective Tissue\",\n", + " \"Neurological & Neurodevelopmental\",\n", + " \"Oncological / Cancer\",\n", + " \"Other / Multisystem / Syndromic\",\n", + " \"Sensory Disorders\"\n", + "]\n", + "\n", + "# Raw counts per dataset\n", + "kegg_counts = [0, 0, 17, 121, 0, 764, 316, 231, 0]\n", + "vep_coding_counts = [17398, 1550, 1876, 1863, 2887, 5715, 1254, 16199, 1341]\n", + "vep_non_snv_counts = [6559, 3145, 1439, 2781, 3605, 7147, 21932, 20197, 2320]\n", + "\n", + "datasets = [kegg_counts, vep_coding_counts, vep_non_snv_counts]\n", + "dataset_labels = ['KEGG', 'VEP Coding', 'VEP Non-SNV']\n", + "\n", + "# Convert counts to percentages\n", + "datasets_perc = []\n", + "for data in datasets:\n", + " total = sum(data)\n", + " perc = [val / total * 100 if total > 0 else 0 for val in data]\n", + " datasets_perc.append(perc)\n", + "\n", + "# Plotting\n", + "fig, ax = plt.subplots(figsize=(10, 5))\n", + "y = np.arange(len(datasets_perc))\n", + "bar_height = 0.5\n", + "\n", + "for i, category in enumerate(categories):\n", + " values = [d[i] for d in datasets_perc]\n", + " left = np.sum([d[:i] for d in datasets_perc], axis=1) if i > 0 else np.zeros(len(datasets_perc))\n", + " ax.barh(y, values, left=left, height=bar_height, label=category)\n", + "\n", + "# Axes and formatting\n", + "ax.set_yticks(y)\n", + "ax.set_yticklabels(dataset_labels)\n", + "ax.set_xlabel(\"Percentage of Variants\")\n", + "ax.set_title(\"Percent Stacked Bar Plot of Disease Categories Across Datasets\")\n", + "ax.legend(loc='lower right', bbox_to_anchor=(1.25, 0))\n", + "\n", + "plt.tight_layout()\n", + 
"plt.savefig(\"three_stacked_bar_plots_percent.svg\", format=\"svg\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ac53b01-53dc-4889-bed8-b254053f0d65", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason-main/data/KEGG_Data_1.ipynb b/BioReason-main/data/KEGG_Data_1.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a96c3d000b75f9b30a86d5dab4f96526c7373a8b --- /dev/null +++ b/BioReason-main/data/KEGG_Data_1.ipynb @@ -0,0 +1,11483 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5077734e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Create and navigate to kegg_data directory\n", + "data_dir = Path('kegg_data')\n", + "data_dir.mkdir(exist_ok=True)\n", + "os.chdir(data_dir)\n", + "\n", + "# Configuration parameters\n", + "CONFIG = {\n", + " # Output directories\n", + " 'network_dir': 'kegg_network',\n", + " 'variant_network_dir': 'network_variant', \n", + " 'variant_info_dir': 'variant_info',\n", + " \n", + " # Reference data paths (update these to point to your reference files)\n", + " 'cosmic_fusion_data': 'data/Cosmic_Fusion_v101_GRCh38.tsv', # Update path as needed\n", + " 'reference_genome': 'data/GRCh38_genomic.fna', # Update path as needed\n", + " \n", + " # Processing parameters\n", + " 'num_threads': 4, # Adjust based on your system\n", + " 'batch_size': 1000\n", + "}\n", + "\n", 
+ "# Create required directories\n", + "for dir_name in [CONFIG['network_dir'], CONFIG['variant_network_dir'], CONFIG['variant_info_dir']]:\n", + " Path(dir_name).mkdir(exist_ok=True)\n", + "\n", + "print(f\"Working directory: {os.getcwd()}\")\n", + "print(\"Configuration loaded. Directory structure created.\")\n", + "print(\"\\n📝 Update CONFIG dictionary above with your actual file paths for reference data\")" + ] + }, + { + "cell_type": "markdown", + "id": "b77a0f2c", + "metadata": {}, + "source": [ + "# KEGG Data Processing Pipeline - Part 1: Data Retrieval and Network Analysis\n", + "\n", + "## Overview\n", + "\n", + "This notebook is the first part of a comprehensive KEGG (Kyoto Encyclopedia of Genes and Genomes) data processing pipeline for genetic variant analysis. It focuses on downloading and processing KEGG network data, disease associations, and variant information.\n", + "\n", + "## What This Notebook Does\n", + "\n", + "1. **KEGG Data Retrieval**: Downloads disease lists, network data, and pathway information from KEGG REST API\n", + "2. **Network Analysis**: Processes KEGG network files to identify reference vs disease networks\n", + "3. **Variant Extraction**: Identifies and extracts genetic variants from network data\n", + "4. **Data Filtering**: Cleans and filters variant information for downstream analysis\n", + "5. 
**Reference Data**: Processes genomic reference sequences and chromosome data\n", + "\n", + "## Prerequisites\n", + "\n", + "- Python 3.7+ with required packages (see requirements below)\n", + "- `kegg_pull` package for KEGG data retrieval\n", + "- `seqkit` for sequence processing\n", + "- Internet connection for KEGG API access\n", + "- Sufficient storage space (several GB for full dataset)\n", + "\n", + "## Required Packages\n", + "\n", + "```bash\n", + "pip install kegg-pull biopython pandas\n", + "```\n", + "\n", + "## Directory Structure\n", + "\n", + "This notebook expects and creates the following structure:\n", + "```\n", + "kegg_data/\n", + "├── kegg_diseases.txt\n", + "├── network_pathway.tsv\n", + "├── network_disease.tsv\n", + "├── kegg_network/\n", + "├── network_variant/\n", + "├── variant_info/\n", + "└── output files...\n", + "```\n", + "\n", + "## Important Notes\n", + "\n", + "- **Processing Time**: Full dataset processing can take several hours\n", + "- **Storage Requirements**: ~5-10GB of storage needed for complete dataset\n", + "- **API Limits**: KEGG REST API has rate limits; process may need pausing\n", + "- **Network Access**: Requires stable internet connection for data downloads\n", + "\n", + "## Next Steps\n", + "\n", + "After completing this notebook:\n", + "1. Run `KEGG_Data_2.ipynb` for variant information parsing\n", + "2. 
Run `KEGG_Data_3.ipynb` for final dataset creation with sequences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set up paths and parameters for the data processing pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4297e63d-0309-45c4-920b-7a5cc1f42771", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d48693e3", + "metadata": {}, + "outputs": [], + "source": [ + "curl -s \"https://rest.kegg.jp/list/disease\" > kegg_diseases.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6e489c3f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KEGG_data.ipynb\t\tclassify.py\t\tmodel.py\n", + "LICENSE\t\t\tdataset.py\t\tmodel_decoder.py\n", + "README.md\t\tdna_classifier.py\tplayground.ipynb\n", + "baseline.py\t\tfinetune.py\t\trequirements.txt\n", + "baseline_model.py\tkegg_diseases.txt\n" + ] + } + ], + "source": [ + "ls" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0cfda653", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1593\n" + ] + } + ], + "source": [ + "curl -s \"https://rest.kegg.jp/list/network\" | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "4b2c1ed0-90a5-4005-bdb0-41be43070a8b", + "metadata": {}, + "source": [ + "Use kegg_pull for retrieving KEGG data https://github.com/MoseleyBioinformaticsLab/kegg_pull" + ] + }, + { + "cell_type": "markdown", + "id": "998a046f-604c-4378-8563-3df7de0f85c3", + "metadata": {}, + "source": [ + "```python3 -m pip install kegg-pull```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "65894de2-27f1-46c1-9eab-b54c7630fe86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.1.0\n" + ] + } + ], + "source": [ + "kegg_pull -v" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 18, + "id": "63e1de4e-ee4a-4cba-aabe-9a801735e643", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Usage:\n", + " kegg_pull -h | --help Show this help message.\n", + " kegg_pull -v | --version Displays the package version.\n", + " kegg_pull --full-help Show the help message of all sub commands.\n", + " kegg_pull pull ... Pull, separate, and store an arbitrary number of KEGG entries to the local file system.\n", + " kegg_pull entry-ids ... Obtain a list of KEGG entry IDs.\n", + " kegg_pull map ... Obtain a mapping of entry IDs (KEGG or outside databases) to the IDs of related entries.\n", + " kegg_pull pathway-organizer ... Creates a flattened version of a pathways Brite hierarchy.\n", + " kegg_pull rest ... Executes one of the KEGG REST API operations.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull pull -h | --help\n", + " kegg_pull pull database [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=]\n", + " kegg_pull pull entry-ids [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " database Pulls all the entries in a KEGG database.\n", + " The KEGG database from which to pull entries.\n", + " --force-single-entry Forces pulling only one entry at a time for every request to the KEGG web API. This flag is automatically set if is \"brite\".\n", + " --multi-process If set, the entries are pulled across multiple processes to increase speed. Otherwise, the entries are pulled sequentially in a single process.\n", + " --n-workers= The number of sub-processes to create when pulling. 
Defaults to the number of cores available. Ignored if --multi-process is not set.\n", + " --output= The directory where the pulled KEGG entries will be stored. Defaults to the current working directory. If ends in \".zip\", entries are saved to a ZIP archive instead of a directory. Ignored if --print is set.\n", + " --print If set, prints the entries to the screen rather than saving them to the file system. Separates entries by the --sep option if set.\n", + " --sep= The string that separates the entries which are printed to the screen when the --print option is set. Ignored if the --print option is not set. Defaults to printing the entry id, followed by the entry, followed by a newline.\n", + " --entry-field= Optional field to extract from the entries pulled rather than the standard flat file format (or \"htext\" in the case of brite entries).\n", + " --n-tries= The number of times to attempt a KEGG request before marking it as timed out or failed. Defaults to 3.\n", + " --time-out= The number of seconds to wait for a KEGG request before marking it as timed out. Defaults to 60.\n", + " --sleep-time= The amount of time to wait after a KEGG request times out (or potentially blacklists with a 403 error code) before attempting it again. Defaults to 5.0.\n", + " --ut= If set, the ratio of unsuccessful entry IDs (failed or timed out) to total entry IDs at which kegg_pull quits. Valid values are between 0.0 and 1.0 non-inclusive.\n", + " entry-ids Pulls entries specified by a comma separated list. Or from standard input: one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull pull entry-ids - ...).\n", + " Comma separated list of entry IDs to pull (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input. 
Will likely need to set --force-single-entry if any of the entries are from the brite database.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull entry-ids -h | --help\n", + " kegg_pull entry-ids database [--output=]\n", + " kegg_pull entry-ids keywords [--output=]\n", + " kegg_pull entry-ids molec-attr (--formula=|--em=...|--mw=...) [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " database Pulls all the entry IDs within a given database.\n", + " The KEGG database from which to pull a list of entry IDs.\n", + " --output= Path to the file (either in a directory or ZIP archive) to store the output (1 entry ID per line). Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt).\n", + " keywords Searches for entries within a database based on provided keywords.\n", + " Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to \"-\", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest find brite - ...).\n", + " molec-attr Searches a database of molecule-type KEGG entries by molecular attributes.\n", + " --formula= Sequence of atoms in a chemical formula format to search for (e.g. \"O5C7\" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms).\n", + " --em= Either a single number (e.g. \"--em=155.5\") or two numbers (e.g. \"--em=155.5 --em=244.4\"). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. 
If two numbers, searches for molecule entries with an exact mass within the two values (a range).\n", + " --mw= Same as \"--em=\" but searches based on the molecular weight.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull map -h | --help\n", + " kegg_pull map conv [--reverse] [--output=]\n", + " kegg_pull map link [--deduplicate] [--add-glycans] [--add-drugs] [--output=]\n", + " kegg_pull map (link|conv) entry-ids [--reverse] [--output=]\n", + " kegg_pull map link [--deduplicate] [--add-glycans] [--add-drugs] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " conv Converts the output of the KEGG \"conv\" operation into a JSON mapping.\n", + " The name of the KEGG database with entry IDs mapped to the outside database.\n", + " The name of the outside database with entry IDs mapped from the KEGG database.\n", + " --reverse Reverses the mapping with the target becoming the source and the source becoming the target.\n", + " --output= The location (either a directory or ZIP archive) of the JSON file to store the mapping. If not set, prints a JSON representation of the mapping to the console. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json).\n", + " link Converts the output of the KEGG \"link\" operation into a JSON mapping.\n", + " The name of the database with entry IDs mapped to the target database.\n", + " The name of the database with entry IDs mapped from the source database.\n", + " --deduplicate Some mappings including pathway entry IDs result in half beginning with the normal \"path:map\" prefix but the other half with a different prefix. If set, removes the IDs corresponding to identical entries but with a different prefix. 
Raises an exception if neither the source nor the target database are \"pathway\".\n", + " --add-glycans Whether to add the corresponding compound IDs of equivalent glycan entries. Logs a warning if neither the source nor the target database are \"compound\".\n", + " --add-drugs Whether to add the corresponding compound IDs of equivalent drug entries. Logs a warning if neither the source nor the target database are \"compound\".\n", + " entry-ids Create a mapping to a target database from a list of specific entry IDs.\n", + " Comma separated list of entry IDs (e.g. Id1,Id2,Id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull map entry-ids drug - ...).\n", + " The name of an intermediate KEGG database with which to find cross-references to cross-references e.g. \"kegg_pull map link ko reaction compound\" creates a mapping from ko-to-compound via ko-to-reaction cross-references connected to reaction-to-compound cross-references.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull pathway-organizer [--tln=] [--fn=] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " --tln= Node names in the highest level of the hierarchy to select from. If not set, all top level nodes are traversed to create the mapping of node key to node info. Either a comma separated list (e.g. node1,node2,node3 etc.) or if equal to \"-\", read from standard input one node per line; Press CTRL+D to finalize input or pipe (e.g. cat nodes.txt | kegg_pull pathway-organizer --tln=- ...). 
If both \"--tln\" and \"--fn\" are set as \"-\", one of the lines must be the delimiter \"---\" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second.\n", + " --fn= Names (not keys) of nodes to exclude from the mapping of node key to node info. Neither these nodes nor any of their children will be included. If not set, no nodes will be excluded. Either a comma separated list (e.g. node1,node2,node3 etc.) or if equal to \"-\", read from standard input one node per line; Press CTRL+D to finalize input or pipe (e.g. cat nodes.txt | kegg_pull pathway-organizer --fn=- ...). If both \"--tln\" and \"--fn\" are set as \"-\", one of the lines must be the delimiter \"---\" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second.\n", + " --output= The file to store the flattened Brite hierarchy as a JSON structure with node keys mapping to node info, either a JSON file or ZIP archive. Prints to the console if not set. If saving to a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json).\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull rest -h | --help\n", + " kegg_pull rest info [--test] [--output=]\n", + " kegg_pull rest list [--test] [--output=]\n", + " kegg_pull rest get [--entry-field=] [--test] [--output=]\n", + " kegg_pull rest find [--test] [--output=]\n", + " kegg_pull rest find (--formula=|--em=...|--mw=...) 
[--test] [--output=]\n", + " kegg_pull rest conv [--test] [--output=]\n", + " kegg_pull rest conv entry-ids [--test] [--output=]\n", + " kegg_pull rest link [--test] [--output=]\n", + " kegg_pull rest link entry-ids [--test] [--output=]\n", + " kegg_pull rest ddi [--test] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " info Executes the \"info\" KEGG API operation, pulling information about a KEGG database.\n", + " The name of the database to pull information about or entry IDs from.\n", + " --test If set, test the request to ensure it works rather than sending it. Print True if the request would succeed and False if the request would fail. Ignores --output if this options is set along with --test.\n", + " --output= Path to the file (either in a directory or ZIP archive) to store the response body from the KEGG web API operation. Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt).\n", + " list Executes the \"list\" KEGG API operation, pulling the entry IDs of the provided database.\n", + " get Executes the \"get\" KEGG API operation, pulling the entries of the provided entry IDs.\n", + " Comma separated list of entry IDs (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest get - ...).\n", + " --entry-field= Optional field to extract from an entry instead of the default entry info (i.e. flat file or htext in the case of brite entries).\n", + " find Executes the \"find\" KEGG API operation, finding entry IDs based on provided queries.\n", + " Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to \"-\", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. 
cat file.txt | kegg_pull rest find brite - ...).\n", + " --formula= Sequence of atoms in a chemical formula format to search for (e.g. \"O5C7\" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms).\n", + " --em= Either a single number (e.g. --em=155.5) or two numbers (e.g. --em=155.5 --em=244.4). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. If two numbers, searches for molecule entries with an exact mass within the two values (a range).\n", + " --mw= Same as --em but searches based on the molecular weight.\n", + " conv Executes the \"conv\" KEGG API operation, converting entry IDs from an outside database to those of a KEGG database and vice versa.\n", + " The name of the KEGG database from which to view equivalent outside database entry IDs.\n", + " The name of the non-KEGG database from which to view equivalent KEGG database entry IDs.\n", + " entry-ids Perform the \"conv\" or \"link\" operation of the form that maps specific provided entry IDs to a target database.\n", + " link Executes the \"link\" KEGG API operation, showing the IDs of entries that are connected/related to entries of other databases.\n", + " The name of the database that the entry IDs of the source database or provided entry IDs are mapped to.\n", + " The name of the database from which cross-references are found in the target database.\n", + " ddi Executes the \"ddi\" KEGG API operation, searching for drug to drug interactions. Providing one entry ID reports all known interactions, while providing multiple checks if any drug pair in a given set of drugs is CI or P. If providing multiple, all entries must belong to the same database.\n", + " Comma separated list of drug entry IDs from the following databases: drug, ndc, or yj (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. 
cat file.txt | kegg_pull rest ddi - ...).\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull --full-help" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "12e35258-92f8-4ece-9d18-177263d1e97c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N00001\tEGF-EGFR-RAS-ERK signaling pathway\n", + "N00002\tBCR-ABL fusion kinase to RAS-ERK signaling pathway\n", + "N00003\tMutation-activated KIT to RAS-ERK signaling pathway\n", + "N00004\tDuplication or mutation-activated FLT3 to RAS-ERK signaling pathway\n", + "N00005\tMutation-activated MET to RAS-ERK signaling pathway\n", + "N00006\tAmplified EGFR to RAS-ERK signaling pathway\n", + "N00007\tEML4-ALK fusion kinase to RAS-ERK signaling pathway\n", + "N00008\tRET fusion kinase to RAS-ERK signaling pathway\n", + "N00009\tTRK fusion kinase to RAS-ERK signaling pathway\n", + "N00010\tMutation-inactivated PTCH1 to Hedgehog signaling pathway\n", + "N00011\tMutation-activated FGFR3 to RAS-ERK signaling pathway\n", + "N00012\tMutation-activated KRAS/NRAS to ERK signaling pathway\n", + "N00013\tMutation-activated BRAF to ERK signaling pathway\n", + "N00014\tMutation-activated EGFR to RAS-ERK signaling pathway\n", + "N00015\tPDGF-PDGFR-RAS-ERK signaling pathway\n", + "N00016\tPDGF-overexpression to RAS-ERK signaling pathway\n", + "N00017\tMutation-activated SMO to Hedgehog signaling pathway\n", + "N00018\tAmplified PDGFR to RAS-ERK signaling pathway\n", + "N00019\tFGF-FGFR-RAS-ERK signaling pathway\n", + "N00020\tAmplified FGFR to RAS-ERK signaling pathway\n", + "N00021\tEGF-ERBB2-RAS-ERK signaling pathway\n", + "N00022\tERBB2-overexpression to RAS-ERK signaling pathway\n", + "N00023\tEGF-EGFR-PLCG-ERK signaling pathway\n", + "N00024\tMutation-activated EGFR to PLCG-ERK signaling pathway\n", + "N00025\tEML4-ALK fusion kinase to PLCG-ERK signaling pathway\n", + "N00026\tEGF-EGFR-PLCG-CAMK signaling pathway\n", + "N00027\tAmplified EGFR to 
PLCG-CAMK signaling pathway\n", + "N00028\tPDGF-PDGFR-PLCG-CAMK signaling pathway\n", + "N00029\tAmplified PDGFR to PLCG-CAMK signaling pathway\n", + "N00030\tEGF-EGFR-RAS-PI3K signaling pathway\n", + "N00031\tDuplication or mutation-activated FLT3 to RAS-PI3K signaling pathway\n", + "N00032\tMutation-activated KRAS/NRAS to PI3K signaling pathway\n", + "N00033\tEGF-EGFR-PI3K signaling pathway\n", + "N00034\tERBB2-overexpression to PI3K signaling pathway\n", + "N00035\tAmplified EGFR to PI3K signaling pathway\n", + "N00036\tMutation-activated EGFR to PI3K signaling pathway\n", + "N00037\tFGF-FGFR-PI3K signaling pathway\n", + "N00038\tAmplified FGFR to PI3K signaling pathway\n", + "N00039\tPDGF-PDGFR-PI3K signaling pathway\n", + "N00040\tAmplified PDGFR to PI3K signaling pathway\n", + "N00041\tEGFR-overexpression to RAS-ERK signaling pathway\n", + "N00042\tEGFR-overexpression to PI3K signaling pathway\n", + "N00043\tHGF-MET-PI3K signaling pathway\n", + "N00044\tMutation-activated MET to PI3K signaling pathway\n", + "N00045\tKITLG-KIT-PI3K signaling pathway\n", + "N00046\tMutation-activated KIT to PI3K signaling pathway\n", + "N00047\tEML4-ALK fusion kinase to PI3K signaling pathway\n", + "N00048\tBCR-ABL fusion kinase to PI3K signaling pathway\n", + "N00049\tMutation-activated PI3K to PI3K signaling pathway\n", + "N00050\tAmplified PI3K to PI3K signaling pathway\n", + "N00051\tDeleted PTEN to PI3K signaling pathway\n", + "N00052\tMutation-inactivated PTEN to PI3K signaling pathway\n", + "N00053\tCytokine-Jak-STAT signaling pathway\n", + "N00054\tDuplication or mutation-activated FLT3 to Jak-STAT signaling pathway\n", + "N00055\tBCR-ABL fusion kinase to Jak-STAT signaling pathway\n", + "N00056\tWnt signaling pathway\n", + "N00057\tMutation-inactivated APC to Wnt signaling pathway\n", + "N00058\tMutation-activated CTNNB1 to Wnt signaling pathway\n", + "N00059\tFZD7-overexpression to Wnt signaling pathway\n", + "N00060\tLRP6-overexpression to Wnt signaling pathway\n", + 
"N00061\tCDH1-reduced expression to beta-catenin signaling pathway\n", + "N00062\tHedgehog signaling pathway\n", + "N00063\tTGF-beta signaling pathway\n", + "N00064\tMutation-inactivated TGFBR2 to TGF-beta signaling pathway\n", + "N00065\tMutation-inactivated SMAD2 to TGF-beta signaling pathway\n", + "N00066\tMDM2-p21-Cell cycle G1/S\n", + "N00067\tDeleted p14(ARF) to p21-cell cycle G1/S\n", + "N00068\tAmplified MDM2 to p21-cell cycle G1/S\n", + "N00069\tp16-Cell cycle G1/S\n", + "N00070\tMutation-inactivated p16(INK4a) to p16-cell cycle G1/S\n", + "N00071\tDeleted p16(INK4a) to p16-cell cycle G1/S\n", + "N00072\tAmplified CDK4 to cell cycle G1/S\n", + "N00073\tMutation-activated CDK4 to cell cycle G1/S\n", + "N00074\tLoss of RB1 to cell cycle G1/S\n", + "N00075\tMutation-inactivated RB1 to cell cycle G1/S\n", + "N00076\tMutation-inactivated p14(ARF) to p21-cell cycle G1/S\n", + "N00077\tHRAS-overexpression to ERK signaling pathway\n", + "N00078\tMutation-activated HRAS to ERK signaling pathway\n", + "N00079\tHIF-1 signaling pathway\n", + "N00080\tLoss of VHL to HIF-1 signaling pathway\n", + "N00081\tMutation-inactivated VHL to HIF-1 signaling pathway\n", + "N00082\tLoss of NKX3-1 to PI3K signaling pathway\n", + "N00083\tAndrogen receptor signaling pathway\n", + "N00084\tAmplified AR to androgen receptor signaling pathway\n", + "N00085\tMutation-activated AR to androgen receptor signaling pathway\n", + "N00086\tNotch signaling pathway\n", + "N00087\tNOTCH-overexpression to Notch signaling pathway\n", + "N00088\tAmplified MYC to p15-cell cycle G1/S\n", + "N00089\tAmplified MYC to cell cycle G1/S\n", + "N00090\tp15-Cell cycle G1/S\n", + "N00091\tp27-Cell cycle G1/S\n", + "N00092\tAmplified MYC to p27-cell cycle G1/S\n", + "N00093\tLoss of CDKN1B to p27-cell cycle G1/S\n", + "N00094\tEGF-Jak-STAT signaling pathway\n", + "N00095\tERBB2-overexpression to EGF-Jak-STAT signaling pathway\n", + "N00096\tEGF-EGFR-RAS-RASSF1 signaling pathway\n", + "N00097\tLoss of RASSF1 to 
RAS-RASSF1 signaling pathway\n", + "N00098\tIntrinsic apoptotic pathway\n", + "N00099\tMutation-inactivated BAX to apoptotic pathway\n", + "N00100\tBCL2-overexpression to intrinsic apoptotic pathway\n", + "N00101\tDCC-apoptotic pathway\n", + "N00102\tLoss of DCC to DCC-apoptotic pathway\n", + "N00103\tEGF-EGFR-RAS-RalGDS signaling pathway\n", + "N00104\tMutation-activated KRAS to RalGDS signaling pathway\n", + "N00105\tEML4-ALK fusion kinase to Jak-STAT signaling pathway\n", + "N00106\tAML1-EVI1 fusion to TGF-beta signaling pathway\n", + "N00107\tEVI-1 overexpression to TGF-beta signaling pathway\n", + "N00108\tAML1-ETO fusion to transcriptional activtion\n", + "N00109\tPML-RARA fusion to transcriptional activtion\n", + "N00110\tPLZF-RARA fusion to transcriptional activtion\n", + "N00111\tAML1-ETO fusion to CEBPA-mediated transcription\n", + "N00112\tAML1-ETO fusion to PU.1-mediated transcription\n", + "N00113\tPML-RARA fusion to transcriptional repression\n", + "N00114\tPLZF-RARA fusion to transcriptional repression\n", + "N00115\tMutation-inactivated TP53 to transcription\n", + "N00116\tMutation-inactivated RUNX1 to transcription\n", + "N00117\tE2A-PBX1 fusion to transcriptional activation\n", + "N00118\tTEL-AML1 fusion to transcriptional repression\n", + "N00119\tMLL-AF4 fusion to transcriptional activation\n", + "N00120\tMLL-ENL fusion to transcriptional activation\n", + "N00121\tLMO2-rearrangement to transcriptional activation\n", + "N00122\tLMO2-rearrangement to transcriptional repression\n", + "N00123\tAmplified REL to transcription\n", + "N00124\tIGH-MAF fusion to transcriptional activation\n", + "N00125\tIGH-MMSET fusion to transcriptional activation\n", + "N00126\tPAX8-PPARG fusion to PPARG-mediated transcription\n", + "N00127\tPRCC-TFE3 fusion to transcriptional activation\n", + "N00128\tTMPRSS2-ERG fusion to transcriptional activation\n", + "N00129\tTMPRSS2-ERG fusion to transcriptional repression\n", + "N00130\tTMPRSS2-ETV5 fusion to transcriptional 
activation\n", + "N00131\tAmplified MYCN to transcriptional activation\n", + "N00132\tAmplified MYCN to transcriptional repression\n", + "N00133\tEWSR1-FLI1 fusion to transcriptional activation\n", + "N00134\tEWSR1-FLI1 fusion to transcriptional repression\n", + "N00135\tEWSR1-ERG fusion to transcriptional activation\n", + "N00136\tEWSR1-ATF1 fusion to transcriptional activation\n", + "N00137\tEWSR1-WT1 fusion to transcriptional activation\n", + "N00138\tEWSR1-NR4A3\n", + "N00139\tFUS-DDIT3 fusion to CEBPB-mediated transcription\n", + "N00140\tFUS-DDIT3 fusion to NFKB-mediated transcription\n", + "N00141\tPAX3-FOXO1 fusion to transcriptional activation\n", + "N00142\tSYT-SSX fusion to transcriptional repression\n", + "N00143\tASPL-TFE3 fusion to transcriptional activation\n", + "N00144\tTLX1 rearrangement to transcriptional repression\n", + "N00145\tExtrinsic apoptotic pathway\n", + "N00146\tCrosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00147\tEGF-EGFR-PLCG-calcineurin signaling pathway\n", + "N00148\tTLR3-IRF7 signaling pathway\n", + "N00149\tTLR3-IRF3 signaling pathway\n", + "N00150\tType I IFN signaling pathway\n", + "N00151\tTNF-NFKB signaling pathway\n", + "N00152\tCXCR-GNB/G-ERK signaling pathway\n", + "N00153\tCCR/CXCR-GNB/G-PI3K-RAC signaling pathway\n", + "N00154\tCXCR-GNB/G-PI3K-AKT signaling pathway\n", + "N00155\tAutophagy-vesicle nucleation/elongation/maturation, mTORC1-PI3KC3-C1\n", + "N00156\tAutophagy-vesicle nucleation/elongation/maturation, LC3-II formation\n", + "N00157\tKSHV vGPCR to GNB/G-ERK signaling pathway\n", + "N00158\tKSHV vGPCR to GNB/G-PI3K-AKT signaling pathway\n", + "N00159\tKSHV K1 to PI3K signaling pathway\n", + "N00160\tKSHV K1 to RAS-ERK signaling pathway\n", + "N00161\tKSHV vIRF1/2 to TLR3-IRF3 signaling pathway\n", + "N00162\tKSHV vIRF3 to TLR3-IRF7 signaling pathway\n", + "N00163\tKSHV KIE1/2 to TLR3-IRF7 signaling pathway\n", + "N00164\tKSHV vBCL2 to crosstalk between extrinsic and intrinsic apoptotic 
pathways\n", + "N00165\tKSHV vIAP to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00166\tKSHV vFLIP to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00167\tKSHV vIRF1/3 to p21-cell cycle G1/S\n", + "N00168\tKSHV vCyclin to cell cycle G1/S\n", + "N00169\tKSHV LANA to p21-cell cycle G1/S\n", + "N00170\tKSHV LANA to cell cycle G1/S\n", + "N00171\tKSHV vFLIP to NFKB signaling pathway\n", + "N00172\tKSHV K15 to PLCG-calcineurin signaling pathway\n", + "N00173\tKSHV K15 to TNF-NFKB signaling pathway\n", + "N00174\tKSHV vFLIP to TNF-NFKB signaling pathway\n", + "N00175\tKSHV LANA to Wnt signaling pathway\n", + "N00176\tKSHV vFLIP to autophagy-vesicle elongation\n", + "N00177\tKSHV vBCL2 to autophagy-vesicle nucleation\n", + "N00178\tKSHV vGPCR to GNB/G-PI3K-JNK signaling pathway\n", + "N00179\tKSHV K1 to PI3K-NFKB signaling pathway\n", + "N00180\tKSHV K1 to PLCG-calcineurin signaling pathway\n", + "N00181\tKSHV vIL-6 to Jak-STAT signaling pathway\n", + "N00182\tIGF-IGFR-PI3K-NFKB signaling pathway\n", + "N00184\tKSHV MIR1/2 to antigen processing and presentation by MHC class I molecules\n", + "N00185\tKSHV MIR2 to cell surface molecule-endocytosis\n", + "N00186\tIL1-IL1R-p38 signaling pathway\n", + "N00187\tKSHV Kaposin B to p38 signaling pathway\n", + "N00188\tIL1-IL1R-JNK signaling pathway\n", + "N00189\tKSHV K15 to JNK signaling pathway\n", + "N00212\tKSHV vCCL2 to CCR signaling pathway\n", + "N00213\tKSHV Kaposin to alternative pathway of complement cascade\n", + "N00215\tKITLG-KIT-RAS-ERK signaling pathway\n", + "N00216\tHGF-MET-RAS-ERK signaling pathway\n", + "N00217\tFLT3LG-FLT3-RAS-ERK signaling pathway\n", + "N00218\tFLT3LG-FLT3-RAS-PI3K signaling pathway\n", + "N00219\tFLT3LG-FLT3-STAT5 signaling pathway\n", + "N00220\tPTEN-PIP3-AKT signaling pathway\n", + "N00221\tHTLV-1 Tax to spindle assembly checkpoint signaling\n", + "N00222\tHTLV-1 Tax to spindle assembly checkpoint signaling\n", + "N00223\tEBV EBNA1 to 
p53-mediated transcription\n", + "N00224\tEBV EBNALP RBP-Jk-mediated transcription\n", + "N00225\tEBV EBNA2 to RBP-Jk-mediated transcription\n", + "N00226\tEBV EBNA3A/3B/3C to RBP-Jk-mediated transcription\n", + "N00227\tTGFA-EGFR-PLCG-PKC signaling pathway\n", + "N00228\tTGFA-overexpression to PLCG-PKC signaling pathway\n", + "N00229\tTGFA-EGFR-RAS-ERK signaling pathway\n", + "N00230\tTGFA-overexpression to RAS-ERK signaling pathway\n", + "N00231\tTGFA-EGFR-PI3K signaling pathway\n", + "N00232\tTGFA-overexpression to PI3K signaling pathway\n", + "N00233\tIGF-IGF1R-RAS-ERK signaling pathway\n", + "N00234\tIGF2-IGF1R-PI3K signaling pathway\n", + "N00235\tIGF2-overexpression to RAS-ERK signaling pathway\n", + "N00236\tIGF2-overexpression to PI3K signaling pathway\n", + "N00237\tIGF1R-overexpression to RAS-ERK signaling pathway\n", + "N00238\tIGF1R-overexpression to PI3K signaling pathway\n", + "N00239\tTelomerase activity\n", + "N00240\tTERT-overexpression to telomerase activity\n", + "N00241\tTGFBR2-reduced expression to TGF-beta signaling pathway\n", + "N00242\tMutation-inactivated AXIN to Wnt signaling pathway\n", + "N00243\tKEAP1-NRF2 signaling pathway\n", + "N00244\tMutation-inactivated KEAP1 to KEAP1-NRF2 signaling pathway\n", + "N00245\tMutation-activated NRF2 to KEAP1-NRF2 signaling pathway\n", + "N00246\tHGF-overexpression to RAS-ERK signaling pathway\n", + "N00247\tHGF-overexpression to PI3K signaling pathway\n", + "N00248\tMET-overexpression to RAS-ERK signaling pathway\n", + "N00249\tMET-overexpression to PI3K signaling pathway\n", + "N00250\tCDX2-overexpression to transcriptional activation\n", + "N00251\tCDX2-overexpression to transcriptional repression\n", + "N00252\tAmplified ERBB2 to RAS-ERK signaling pathway\n", + "N00253\tAmplified ERBB2 to PI3K signaling pathway\n", + "N00254\tCDKN1B-reduced expression to p27-cell cycle G1/S\n", + "N00255\tAmplified CCNE to cell cycle G1/S\n", + "N00256\tTGFBR1-reduced expression to TGF-beta signaling pathway\n", 
+ "N00257\tLoss of CDH1 to beta-catenin signaling pathway\n", + "N00258\tMutation-inactivated CDH1 to beta-catenin signaling pathway\n", + "N00259\tAmplified MET to RAS-ERK signaling pathway\n", + "N00260\tAmplified MET to PI3K signaling pathway\n", + "N00261\tKSHV vIRF2 to IFN signaling pathway\n", + "N00262\tEBV EBNA3C to intrinsic apoptotic pathway\n", + "N00263\tEBV EBNA3C to p53-mediated transcription\n", + "N00264\tEBV EBNA3C to p27-Cell cycle G1/S\n", + "N00265\tEBV LMP1 to NFKB signaling pathway\n", + "N00266\tEBV LMP2A to PI3K signaling pathway\n", + "N00267\tHBV HBx to PI3K signaling pathway\n", + "N00268\tHBV HBx to RIG-I-like receptor signaling pathway\n", + "N00269\tHCV core to TNF-NFKB signaling pathway\n", + "N00270\tHCV Core to IFN signaling pathway\n", + "N00271\tHCV NS3/4A to RIG-I-like receptor signaling pathway\n", + "N00272\tHCV NS5A to PI3K signaling pathway\n", + "N00273\tHCV NS5A to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00274\tHCV NS5A to RAS-ERK signaling pathway\n", + "N00275\tAmplified CCND1 to cell cycle G1/S\n", + "N00276\tEGF-overexpression to RAS-ERK signaling pathway\n", + "N00277\tEREG-EGFR-RAS-ERK signaling pathway\n", + "N00278\tEREG-overexpression to RAS-ERK signaling pathway\n", + "N00279\tAREG-EGFR-RAS-ERK signaling pathway\n", + "N00280\tAREG-overexpression to RAS-ERK signaling pathway\n", + "N00281\tEGF-overexpression to PI3K signaling pathway\n", + "N00282\tEREG-EGFR-PI3K signaling pathway\n", + "N00283\tEREG-overexpression to PI3K signaling pathway\n", + "N00284\tAREG-EGFR-PI3K signaling pathway\n", + "N00285\tAREG-overexpression to PI3K signaling pathway\n", + "N00286\tNuclear-initiated estrogen signaling pathway\n", + "N00287\tESR1-positive to nuclear-initiated estrogen signaling pathway\n", + "N00288\tPTH-PTH1R-PKA signaling pathway\n", + "N00290\tMutation-inactivated MEN1 to transcription\n", + "N00291\tCaSR-PTH signaling pathway\n", + "N00293\tGCM2-mediated transcription\n", + "N00297\tACTH-cortisol 
signaling pathway\n", + "N00298\tCYP11B1-CYP11B2 fusion to ACTH-cortisol signaling pathway\n", + "N00301\tAngiotensin-aldosterone signaling pathway\n", + "N00302\tMutation-activated CACNA1D/H to angiotensin-aldosterone signaling pathway\n", + "N00303\tMutation-activated KCNJ5 to angiotensin-aldosterone signaling pathway\n", + "N00304\tMutation-inactivated ATP1A1 to angiotensin-aldosterone signaling pathway\n", + "N00305\tMutation-inactivated ATP2B3 to angiotensin-aldosterone signaling pathway\n", + "N00306\tSF-1-mediated transcription\n", + "N00309\tCortisone reduction\n", + "N00311\tNADPH generation\n", + "N00313\tTransport of cortisol\n", + "N00315\tMutation-inactivated AIP to AhR-mediated transcription\n", + "N00316\tMutation-inactivated CDKN1B to p27-cell cycle G1/S\n", + "N00317\tAhR signaling pathway\n", + "N00318\tEGFR-ERK-ACTH signaling pathway\n", + "N00319\tMutation-activated USP8 to EGFR-ERK-ACTH signaling pathway\n", + "N00320\tMutation-activated PRKACA to ACTH-cortisol signaling pathway\n", + "N00321\tMutation-activated GNAS to ACTH-cortisol signaling pathway\n", + "N00322\tMutation-inactivated PRKAR1A to ACTH-cortisol signaling pathway\n", + "N00323\tMutation-inactivated PDE11A/PDE8B to ACTH-cortisol signaling pathway\n", + "N00324\tCRHR-PKA-ACTH signaling pathway\n", + "N00325\tMutation-inactivated RASD1 to CRHR-PKA-ACTH signaling pathway\n", + "N00326\tMutation-activated GNAS to CRHR-PKA-ACTH signaling pathway\n", + "N00327\tMutation-inactivated PRKAR1A to CRHR-PKA-ACTH signaling pathway\n", + "N00332\tVesicular uptake of lipoproteins\n", + "N00336\tPCSK9-mediated LDLR degradation\n", + "N00338\tSteroid hormone biosynthesis, progesterone to cortisol/cortisone\n", + "N00339\tSteroid hormone biosynthesis, progesterone to aldosterone\n", + "N00340\tThe Scribble/Dlg/Lgl polarity module\n", + "N00341\tHPV E6 to the Scribble/Dlg/Lgl polarity module\n", + "N00342\tMAGI-PTEN signaling pathway\n", + "N00343\tHPV E6 to MAGI-PTEN signaling pathway\n", + 
"N00344\tCRB3-Pals1-PATJ complex\n", + "N00345\tHPV E6 to CRB3-Pals1-PATJ complex\n", + "N00346\tHPV E6 to TLR-IRF3 signaling pathway\n", + "N00347\tp300-p21-Cell cycle G1/S\n", + "N00348\tHPV E6 to p300-p21-Cell cycle G1/S\n", + "N00349\tHPV E6 to p300-p21-Cell cycle G1/S\n", + "N00350\tHPV E6 to extrinsic apoptotic pathway\n", + "N00351\tHPV E6 to extrinsic apoptotic pathway\n", + "N00352\tHPV E6 to extrinsic apoptotic pathway\n", + "N00353\tHPV E6 to PTEN-PIP3-AKT signaling pathway\n", + "N00354\tHPV E6 to PTEN-PIP3-AKT signaling pathway\n", + "N00355\tPP2A-AKT signaling pathway\n", + "N00356\tHPV E7 to PP2A-AKT signaling patyway\n", + "N00357\tHPV E6 to MTOR signaling pathway\n", + "N00358\tHPV E6 to p21-cell cycle G1/S\n", + "N00359\tHPV E7 to p27-cell cycle G1/S\n", + "N00360\tHPV E7 to p27-cell cycle G1/S\n", + "N00361\tHPV E7 to cell cycle G1/S\n", + "N00362\tHPV E5 to p21-cell cycle G1/S\n", + "N00363\tAntigen processing and presentation by MHC class I molecules\n", + "N00364\tHPV E5 to antigen processing and presentation by MHC class I molecules\n", + "N00365\tHPV E7 to cell cycle G1/S\n", + "N00366\tHPV E5 to EGFR-PI3K signaling pathway\n", + "N00367\tHPV E5 to EGFR-RAS-ERK signaling pathway\n", + "N00368\tHPV E5 to PDGFR-PI3K signaling pathway\n", + "N00369\tHPV E5 to PDGFR-RAS-ERK signaling pathway\n", + "N00370\tPyruvate generation\n", + "N00371\tHPV E7 to pyruvate generation\n", + "N00372\tHPV E7 to p300-p21-Cell cycle G1/S\n", + "N00373\tHPV E6 to NFX1-mediated transcription\n", + "N00374\tTNF-IRF1 signaling pathway\n", + "N00375\tHPV E7 to TNF-IRF1 signaling pathway\n", + "N00376\tHPV E7 to TBP1-mediated transcription\n", + "N00377\tHPV E6 to IFN signaling pathway\n", + "N00378\tHPV E6 to IFN signaling pathway\n", + "N00379\tHPV E7 to IFN signaling pathway\n", + "N00380\tHPV E6 to Notch signaling pathway\n", + "N00381\tHPV E6 to Notch signaling pathway\n", + "N00382\tHPV E6 to Notch signaling pathway\n", + "N00383\tHPV E6 to intrinsic apoptotic 
pathway\n", + "N00384\tHPV E6 to intrinsic apoptotic pathway\n", + "N00385\tHCMV gB to PDGFR-PI3K signaling pathway\n", + "N00386\tHCMV gB to PDGFR-RAS-ERK signaling pathway\n", + "N00387\tHCMV IE1-72/IE2-86 to PI3K signaling pathway\n", + "N00388\tHCMV UL38 to MTOR signaling pathway\n", + "N00389\tHCMV IE1-72 to transcription\n", + "N00390\tEGF-EGFR-PI3K-NFKB signaling pathway\n", + "N00391\tHCMV gB to EGFR-PI3K-NFKB signaling pathway\n", + "N00392\tHCMV gB to EGFR-RAS-ERK signaling pathway\n", + "N00393\tITGA/B-RhoGAP-RhoA signaling pathway\n", + "N00394\tHCMV gH to ITGA/B-RhoA signaling pathway\n", + "N00395\tcGAS-STING signaling pathway\n", + "N00396\tHCMV UL82 to cGAS-STING signaling pathway\n", + "N00397\tHCMV UL26 to NFKB signaling pathway\n", + "N00398\tHCMV IE2-86 to TNF-NFKB signaling pathway\n", + "N00399\tCCR2-GNB/G-PI3K-NFKB signaling pathway\n", + "N00400\tHCMV US28 to GNB/G-PI3K-NFKB signaling pathway\n", + "N00401\tCXCR4-GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00402\tHCMV US28 to GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00403\tCX3CR1-GNAI-AC-PKA signaling pathway\n", + "N00404\tHCMV US28 to GNAI-AC-PKA signaling pathway\n", + "N00405\tCXCR4-GNA12/13-Rho signaling pathway\n", + "N00406\tHCMV US28 to GNA12/13-Rho signaling pathway\n", + "N00407\tHCMV UL33 to GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00408\tLPAR-GNB/G-Rho signaling pathway\n", + "N00409\tHCMV UL33 to GNB/G-Rho signaling pathway\n", + "N00410\tDRD1-GNAS-AC-PKA signaling pathway\n", + "N00411\tHCMV UL33 to GNAS-AC-PKA signaling pathway\n", + "N00412\tHCMV UL33 to GNAI-AC-PKA signaling pathway\n", + "N00413\tCXCR4-GNB/G-PLCB-PKC signaling pathway\n", + "N00414\tHCMV US27 to CXCR4-GNB/G-PLCB-PKC signaling pathway\n", + "N00415\tIL10 family to Jak-STAT signaling pathway\n", + "N00416\tHCMV vIL10 to IL10-JAK-STAT signaling pathway\n", + "N00417\tHCMV US6 to antigen processing and presentation by MHC class I molecules\n", + "N00418\tHCMV US2/11 to antigen processing 
and presentation by MHC class I molecules\n", + "N00419\tHCMV US3/10 to antigen processing and presentation by MHC class I molecules\n", + "N00420\tHCMV IE2-86 to p21-cell cycle G1/S\n", + "N00421\tHCMV IE2-86 to p21-cell cycle G1/S\n", + "N00422\tHCMV IE2-86 to cell cycle G1/S\n", + "N00423\tHCMV IE1-72 to cell cycle G1/S\n", + "N00424\tHCMV pp71 to cell cycle G1/S\n", + "N00425\tHCMV UL36 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00426\tHCMV UL37x1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00427\tHCMV vCXCL to CXCR-GNB/G-PI3K-AKT signaling pathway\n", + "N00428\tCCR5-GNB/G-PLCB/G-PKC signaling pathway\n", + "N00429\tHCMV UL22A to CCR5-GNB/G-PLCB/G-PKC signaling pathway\n", + "N00430\tCXCR4-GNAI-PI3K-BAD signaling pathway\n", + "N00431\tHIV gp120 to CXCR4-GNAI-PI3K-BAD signaling pathway\n", + "N00432\tHIV gp120 to CXCR4-GNAQ-PLCB/G-calcineurin\n", + "N00433\tCXCR4-GNB/G-RAC signaling pathway\n", + "N00434\tHIV gp120 to CXCR4-GNB/G-RAC signaling pathway\n", + "N00435\tTLR1/2/4-NFKB signaling pathway\n", + "N00436\tHIV Tat to TLR2/4-NFKB signaling pathway\n", + "N00437\tHIV Vpu to TLR2/4-NFKB signaling pathway\n", + "N00438\tTLR2/4-MAPK signaling pathway\n", + "N00439\tHIV Nef to TLR2/4-MAPK signaling pathway\n", + "N00440\tHIV Vpu/Vif/Vpr to cGAS-STING signaling pathway\n", + "N00441\tHIV gp120 to TNF-NFKB signaling pathway\n", + "N00442\tHIV Nef to TNF-NFKB signaling pathway\n", + "N00443\tHIV Vpr/Nef/Tat to TNF-NFKB signaling pathway\n", + "N00444\tTNF-p38 signaling pathway\n", + "N00445\tHIV Tat/Nef to TNF-p38 signaling pathway\n", + "N00446\tTNF-JNK signaling pathway\n", + "N00447\tHIV Vpr/Tat to TNF-JNK signaling pathway\n", + "N00448\tHIV Tat/Nef to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00449\tHIV Tat/Nef to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00450\tHIV Tat to intrinsic apoptotic pathway\n", + "N00451\tHIV Tat to intrinsic apoptotic 
pathway\n", + "N00452\tHIV Nef to intrinsic apoptotic pathway\n", + "N00453\tHIV Vpr to intrinsic apoptotic pathway\n", + "N00454\tHIV Vpr to intrinsic apoptotic pathway\n", + "N00455\tCDC25-Cell cycle G2/M\n", + "N00456\tHIV Vpr to CDC25-cell cycle G2M\n", + "N00457\tHIV Vpr to cell cycle G2M\n", + "N00458\tHIV Vpr to CDC25-cell cycle G2M\n", + "N00459\tWEE1-Cell cycle G2/M\n", + "N00460\tHIV Vpr to WEE1-cell cycle G2M\n", + "N00461\tHIV Nef to antigen processing and presentation by MHC class I molecules\n", + "N00462\tKSHV vCCL1/2/3 to CCR signaling pathway\n", + "N00465\tDeleted DMD to dystrophin-associated protein complex\n", + "N00466\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00467\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00468\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00469\tRIG-I-IRF7/3 signaling pathway\n", + "N00470\tEBV BGLF4 to RIG-I-like receptor signaling pathway\n", + "N00471\tEBV LMP2A/2B to IFN signaling pathway\n", + "N00472\tEBV LMP1 to IFN signaling pathway\n", + "N00473\tEBV BGLF4 to IFN signaling pathway\n", + "N00474\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00475\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00476\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00477\tEBV BHRF1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00478\tEBV BARF1 to intrinsic apoptotic pathway\n", + "N00479\tEBV BNLF2a to antigen processing and presentation by MHC class I molecules\n", + "N00480\tEBV BILF1 to antigen processing and presentation by MHC class I molecules\n", + "N00481\tEBV BZLF1 to p53-mediated transcription\n", + "N00482\tEBV EBNA3C to p27-Cell cycle G1/S\n", + "N00483\tEBV EBNA3C to cell cycle G1/S\n", + "N00484\tEBV EBNA3C to cell cycle G1/S\n", + "N00485\tEBV LMP1 to PI3K signaling pathway\n", + "N00486\tEBV LMP1 to Jak-STAT signaling pathway\n", + "N00487\tBCR-PLCG-Calcineurin signaling pathway\n", + "N00488\tEBV LMP2A to BCR signaling pathway\n", + "N00489\tHTLV-1 p30II to 
c-myc-mediated transcription\n", + "N00490\tHTLV-1 p12 to calcineurin signaling pathway\n", + "N00491\tHTLV-1 p12 to Jak-STAT signaling pathway\n", + "N00492\tHTLV-1 p12 to antigen processing and presentation by MHC class I molecules\n", + "N00493\tSpindle assembly checkpoint signaling\n", + "N00494\tHTLV-1 Tax to p16-cell cycle G1/S\n", + "N00495\tHTLV-1 Tax to p15-cell cycle G1/S\n", + "N00497\tHTLV-1 Tax to p21-cell cycle G1/S\n", + "N00498\tHTLV-1 Tax to p21-cell cycle G1/S\n", + "N00499\tATR-p21-Cell cycle G2/M\n", + "N00500\tHTLV-1 Tax to p21-cell cycle G2/M\n", + "N00501\tHTLV-1 Tax to EGFR-PI3K-NFKB signaling pathway\n", + "N00502\tHTLV-1 Tax to PTEN-PIP3-AKT signaling pathway\n", + "N00503\tHTLV-1 Tax to TNF-JNK signaling pathway\n", + "N00504\tHTLV-1 Tax to NFKB signaling pathway\n", + "N00505\tCD40-NFKB signaling pathway\n", + "N00506\tHTLV-1 Tax to CD40-NFKB signaling pathway\n", + "N00507\tHTLV-1 Tax to TGF-beta signaling pathway\n", + "N00508\tHTLV-1 Tax to NFY-mediated transcription\n", + "N00509\tHTLV-1 Tax to SRF-mediated transcription\n", + "N00510\tHTLV-1 Tax to CREB-mediated transcription\n", + "N00511\tHTLV-1 Tax to E47-mediated transcription\n", + "N00512\tHTLV-1 Tax to c-myc-mediated transcription\n", + "N00513\tMutation-activated EGFR to RAS-ERK signaling pathway\n", + "N00514\tMutation-activated EGFR to PI3K signaling pathway\n", + "N00515\tOligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00516\tHCV NS3/4A to TLR3-IRF3 signaling pathway\n", + "N00517\tHCV NS3/4A to TLR3-IRF3 signaling pathway\n", + "N00518\tHCV Core to ERK signaling pathway\n", + "N00519\tHCV Core to ERK signaling pathway\n", + "N00520\tHCV NS5A to p21-cell cycle G1/S\n", + "N00521\tHCV Core to p21-cell cycle G1/S\n", + "N00522\tHCV NS3 to p21-cell cycle G1/S\n", + "N00523\tHCV Core to p21-cell cycle G1/S\n", + "N00524\tHCV NS5A to extrinsic apoptotic pathway\n", + "N00525\tHCV NS5A to TNF-NFKB signaling pathway\n", + "N00526\tHCV NS3 to crosstalk between extrinsic 
and intrinsic apoptotic pathways\n", + "N00527\tHCV Core to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00528\tHCV core to extrinsic apoptotic pathway\n", + "N00529\tHCV core to RXRA/PPARA-mediated transcription\n", + "N00530\tHCV core to RXRA/LXRA-mediated transcription\n", + "N00531\tHBV HBx to TGF-beta signaling pathway\n", + "N00532\tHBV HBx to Egr-mediated transcription\n", + "N00533\tHBV HBx to Crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00534\tHBV HBx to Crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00535\tHBV HBx to p53-mediated transcription\n", + "N00536\tMDM2-p21-Cell cycle G1/S\n", + "N00537\tHBV HBx to cell cycle G1/S\n", + "N00538\tCa2+-PYK2-RAS-ERK signaling pathway\n", + "N00539\tHBV HBx to Ca2+-PYK2-RAS-ERK signaling pathway\n", + "N00540\tHBV HBx to RAS-ERK signaling pathway\n", + "N00541\tHBV HBx to RAS-ERK signaling pathway\n", + "N00542\tEGF-EGFR-RAS-JNK signaling pathway\n", + "N00543\tHBV HBx to JNK signaling pathway\n", + "N00544\tHBV HBx to CREB-mediated transcription\n", + "N00545\tHBV HBx to ERK signaling pathway\n", + "N00546\tCXCL12-CXCR4-PKC-ERK signaling pathaway\n", + "N00547\tHBV LHBs to PKC-ERK signaling pathway\n", + "N00548\tHBV HBx to Jak-STAT signaling pathway\n", + "N00549\tHBV HBeAg to TLR2/4-NFKB signaling pathway\n", + "N00550\tHBV HBeAg to TLR2/4-NFKB signaling pathway\n", + "N00551\tHBV HBs to TLR2/4-MAPK signaling pathway\n", + "N00552\tHBV pol to TLR3-IRF3 signaling pathway\n", + "N00553\tTLR4-IRF3/7 signaling pathway\n", + "N00554\tHBV HBe to TLR4-IRF3/7 signaling pathway\n", + "N00555\tHBV HBe to TLR4-IRF3/7 signaling pathway\n", + "N00556\tHBV HBe to TLR2/4-NFKB signaling pathway\n", + "N00557\tHBV HBe to TLR2/4-NFKB signaling pathway\n", + "N00558\tHBV pol to IFN signaling pathway\n", + "N00559\tLIGHT-HVEM-NFKB signaling pathway\n", + "N00560\tHSV gD to HVEM-NFKB signaling pathway\n", + "N00561\tHSV ICP0 to TLR2/4-NFKB signaling pathway\n", + 
"N00562\tHSV US3 to TLR2/4-NFKB signaling pathway\n", + "N00563\tTLR3-NFKB signaling pathway\n", + "N00564\tHSV US3 to TLR3-NFKB signaling pathway\n", + "N00565\tHSV US11 to RIG-I-like receptor signaling pathway\n", + "N00566\tHSV UL36USP to RIG-I-like receptor signaling pathway\n", + "N00567\tHSV ICP34.5 to TBK1 signaling pathway\n", + "N00568\tHSV US3 to IRF3 signaling pathway\n", + "N00569\tHSV UL41 to cGAS-STING signaling pathway\n", + "N00570\tHSV ICP0 to cGAS-STING signaling pathway\n", + "N00571\tPKR-eIF2alpha signaling pathway\n", + "N00572\tHSV ICP34.5 to PKR-eIF2alpha signaling pathway\n", + "N00573\tHSV US11 to PKR-eIF2alpha signaling pathway\n", + "N00574\tHSV US11 to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00575\tHSV ICP27 to IFN signaling pathway\n", + "N00576\tHSV UL41/UL13 to IFN signaling pathway\n", + "N00577\tHSV UL41 to IFN signaling pathway\n", + "N00578\tHSV UL41 to IFN signaling pathway\n", + "N00579\tHSV ICP6 to extrinsic apoptotic pathway\n", + "N00580\tHSV ICP0 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00581\tHSV ICP47 to antigen processing and presentation by MHC class I molecules\n", + "N00582\tIGF-IGF1R-PI3K signaling pathway\n", + "N00583\tHSV VP11/12 to PI3K signaling pathway\n", + "N00584\tHSV US3 to MTOR signaling pathway\n", + "N00585\tHSV US3 to intrinsic apoptotic pathway\n", + "N00586\tNuclear export of mRNA\n", + "N00587\tHSV ICP27 to Nuclear export of mRNA\n", + "N00588\tHSV VP16 to Oct-1-mediated transcription\n", + "N00589\tHSV gC to alternative pathway of complement cascade\n", + "N00590\tAntigen processing and presentation by MHC class II molecules\n", + "N00591\tHSV gB to antigen processing and presentation by MHC class II molecules\n", + "N00592\tHSV ICP0 to p53-mediated transcription\n", + "N00593\tUrea cycle\n", + "N00599\tObligate allosteric activation of CPS1 by NAG\n", + "N00600\tNAGS deficiency in urea cycle\n", + "N00601\tHeme biosynthesis\n", + "N00610\tDermatan 
sulfate degradation\n", + "N00615\tHeparan sulfate degradation\n", + "N00623\tKeratan sulfate degradation\n", + "N00627\tMannose type O-glycan biosynthesis, POMT to POMK\n", + "N00640\tHydrolysis of lactosylceramide\n", + "N00642\tSaposin stimulation of GBA and GALC\n", + "N00643\tLoss of saposin stimulation\n", + "N00644\tHydrolysis of galabiosylceramide\n", + "N00647\tHydrolysis of galactosylceramide sulfate\n", + "N00649\tHydrolysis of sphingomyelin\n", + "N00653\tN-Glycan precursor biosynthesis, ALG7 to ALG11\n", + "N00667\tN-Glycan precursor biosynthesis, Glc-6P to Man-P-Dol\n", + "N00673\tN-Glycan precursor biosynthesis, Glc-6P to UDP-Glu\n", + "N00675\tN-Glycan precursor biosynthesis, farnesy-PP to P-Dol\n", + "N00679\tGlucosylceramide synthesis in GBA deficiency\n", + "N00680\tN-Glycan precursor biosynthesis, ALG3 to ALG9\n", + "N00681\tN-Glycan precursor biosynthesis, ALG6 to OST\n", + "N00682\tN-Glycan precursor biosynthesis, P-Dol to Glc-P-Dol\n", + "N00683\tCD80/CD86-CD28-PI3K signaling pathway\n", + "N00684\tMV F/H to CD28-PI3K signaling pathway\n", + "N00685\tMV V to RIG-I-IRF7/3 signaling pathway\n", + "N00686\tMV N to RIG-I-IRF7/3 signaling pathway\n", + "N00687\tMV V/C to RIG-I-IRF7/3 signaling pathway\n", + "N00688\tRIG-I-NFKB signaling pathway\n", + "N00689\tMV V/P/C to RIG-I-NFKB signaling pathway\n", + "N00690\tTLR7/9-IRF7 signaling pathway\n", + "N00691\tMV V to TLR7/9-IRF7 signaling pathway\n", + "N00692\tMV P to TLR2/4-NFKB signaling pathway\n", + "N00693\tMV V/P to IFN signaling pathway\n", + "N00694\tMV V/P/C to IFN signaling pathway\n", + "N00695\tMV V to p73-mediated transcription\n", + "N00696\tMV C to PKR-eIF2alpha signaling pathway\n", + "N00697\tHV P to p53-mediated transcription\n", + "N00698\tMannose type O-glycan biosynthesis, Rib-ol-5P to CDP-Rib-ol\n", + "N00699\tMannose type O-glycan biosynthesis, FKTN to LARGE\n", + "N00700\tTyrosine biosynthesis\n", + "N00702\tTetrahydrobiopterin biosynthesis, GTP to BH4\n", + 
"N00705\tTetrahydrobiopterin biosynthesis, BH4OH to BH4\n", + "N00708\tTyrosine degradation\n", + "N00713\tGlycogen biosynthesis\n", + "N00718\tGlycogen degradation\n", + "N00720\tGlycogen degradation (amylase)\n", + "N00724\tIAV NS1 to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00725\tIAV NS1 to PKR-eIF2alpha signaling pathway\n", + "N00726\tIAV NP to PKR-eIF2alpha signaling pathway\n", + "N00727\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00728\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00729\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00730\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00731\tGlycolysis\n", + "N00732\tIAV PB1-F2/PB2 to RIG-I-like receptor signaling pathway\n", + "N00734\tIAV PB1-F2/PB2 to RIG-I-like receptor signaling pathway\n", + "N00736\tIAV NS1 to PI3K signaling pathway\n", + "N00738\tIAV NS1 to IFN signaling pathway\n", + "N00741\tIAV M2 to cell cycle G1/S\n", + "N00742\tNLRP3 inflammasome signaling pathway\n", + "N00743\tIAV NS1 to NLRP3 inflammasome signaling pathway\n", + "N00744\tIAV HA to ERK signaling pathway\n", + "N00745\tIAV PB1-F2 to intrinsic apoptotic pathway\n", + "N00746\tIAV NS1 to nuclear export of mRNA\n", + "N00748\tGPI-anchor biosynthesis\n", + "N00759\tSteroid hormone biosynthesis, cholesterol to pregnenolone/progesterone\n", + "N00765\tbeta-Oxidation, acyl-CoA synthesis\n", + "N00776\tbeta-Oxidation, peroxisome, VLCFA\n", + "N00779\tbeta-Oxidation, peroxisome, bile acid\n", + "N00782\tTSH-TG signaling pathway\n", + "N00786\tTransport of iodide\n", + "N00789\tMutation-inactivated TPO to iodide organification/coupling reactions\n", + "N00791\tDeiodination of MIT and DIT\n", + "N00793\tTSH-DUOX2-TG signaling pathway\n", + "N00795\tDUOX2-generated H2O2 production\n", + "N00798\tThyroid hormone signaling pathway\n", + "N00803\tIodide organification/coupling reactions\n", + "N00804\tbeta-Oxidation\n", + "N00805\tBile acid biosynthesis\n", + "N00812\tTransport 
of carnitine\n", + "N00814\tTransport of L-palmitoylcarnitine\n", + "N00816\tTransport of glucose 6-phosphate\n", + "N00818\tTransport of glucose\n", + "N00820\tN-Glycan biosynthesis\n", + "N00824\tTransport of GDP-fucose\n", + "N00826\tTransport of UDP-galactose\n", + "N00828\tTransport of CMP-N-acetylneuraminate\n", + "N00830\tTransport of Man5GlcNAc2-PP-dolichol\n", + "N00832\tBranched-chain amino acids degradation 1\n", + "N00842\tPropanoyl-CoA metabolism\n", + "N00847\tGalactose degradation\n", + "N00851\tLeucine degradation\n", + "N00852\tValine degradation\n", + "N00856\tIsoleucine degradation\n", + "N00859\tYersinia YopP/J to TLR2/4-NFKB signaling pathway\n", + "N00862\tYersinia YopP/J to TLR2/4-MAPK signaling pathway\n", + "N00863\tYersinia YopM to NLRP3 Inflammasome signaling pathway\n", + "N00864\tYersinia YopK to NLRP3 Inflammasome signaling pathway\n", + "N00865\tYersinia YopK to NLRC4 Inflammasome signaling pathway\n", + "N00866\tYersinia YopM to Pyrin Inflammasome signaling pathway\n", + "N00867\tNLRC4 inflammasome signaling pathway\n", + "N00868\tPyrin inflammasome signaling pathway\n", + "N00869\tKISS1-KISS1R-PLCB-PKC signaling pathway\n", + "N00873\tGnRH-GnRHR-PLCB-PKC signaling pathway\n", + "N00879\tPROK-PRKR-Gi-ERK signaling pathway\n", + "N00882\tTAC3-TACR3-PLC-PKC signaling pathway\n", + "N00885\tLHCGR-GNAS-PKA signaling pathway\n", + "N00888\tHypoxanthine oxidation\n", + "N00890\tMolybdenum cofactor biosynthesis\n", + "N00899\t5-Oxoproline metabolism\n", + "N00904\tGlutathione reduction\n", + "N00905\tNADP+ reduction\n", + "N00907\tGH-Jak-STAT signaling pathway\n", + "N00910\tGHRHR-PKA-GH signaling pathway\n", + "N00915\tAVP-V2R-PKA signaling pathway\n", + "N00918\tTRH-TRHR-PLCB-PKC signaling pathway\n", + "N00920\tPRL-JAK-STAT signaling pathway\n", + "N00922\tFSHR-GNAS-PKA signaling pathway\n", + "N00924\tGlucocorticoid receptor signaling pathway\n", + "N00926\tEscherichia Tir to TLR2/4-MAPK signaling pathway\n", + 
"N00927\tEscherichia/Shigella NleE/OspZ to TNF-NFKB signaling pathway\n", + "N00928\tEscherichia NleB to TNF-NFKB signaling pathway\n", + "N00929\tEscherichia NleC to TNF-NFKB signaling pathway\n", + "N00930\tEscherichia NleD to TNF-JNK signaling pathway\n", + "N00931\tEscherichia NleD to TNF-p38 signaling pathway\n", + "N00932\tEscherichia NleH1 to TNF-NFKB signaling pathway\n", + "N00933\tEscherichia NleA to NLRP3 inflammasome signaling pathway\n", + "N00934\tNon-canonical inflammasome signaling pathway\n", + "N00935\tEscherichia NleF to non-canonical inflammasome signaling pathway\n", + "N00936\tEscherichia NleB1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00937\tEscherichia NleF to extrinsic apoptotic pathway\n", + "N00938\tEscherichia NleH to intrinsic apoptotic pathway\n", + "N00939\tEscherichia EspF to intrinsic apoptotic pathway\n", + "N00940\tNOD-NFKB signaling pathway\n", + "N00941\tShigella IpaH9.8 to NOD-NFKB signaling pathway\n", + "N00942\tShigella OspG to TNF-NFKB signaling pathway\n", + "N00943\tShigella IpaH4.5 to TNF-NFKB signaling pathway\n", + "N00944\tShigella OspI to TNF-NFKB signaling pathway\n", + "N00945\tShigella IpaH1.4/2.5 to TNF-NFKB signaling pathway\n", + "N00946\tShigella IpaJ to cGAS-STING signaling pathway\n", + "N00947\tShigella Ipa4.5 to cGAS-STING signaling pathway\n", + "N00948\tShigella IpaH7.8 to NLRP3 Inflammasome signaling pathway\n", + "N00949\tShigella IpaB to NLRC4 Inflammasome signaling pathway\n", + "N00950\tShigella FimA to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00951\tITGA/B-RHOG-RAC signaling pathway\n", + "N00952\tShigella IpgB1 to ITGA/B-RHOG-RAC signaling pathway\n", + "N00953\tmGluR1-TRPC3 signaling pathway\n", + "N00954\tMutation-activated GRM1 to mGluR1-TRPC3 signaling pathway\n", + "N00955\tMutation-inactivated PRKCG to mGluR1-TRPC3 signaling pathway\n", + "N00956\tMutation-activated PRKCG to mGluR1-TRPC3 signaling pathway\n", + 
"N00957\tMutation-caused abberant ATXN2/3 to mGluR5-Ca2+ -apoptotic pathway\n", + "N00958\tMutation-activated ITPR1 to mGluR1-TRPC3 signaling pathway\n", + "N00959\tITPR1-reduced expression to mGluR1-TRPC3 signaling pathway\n", + "N00960\tMutation-caused aberrant SPTBN2 to mGluR1-TRPC3 signaling pathway\n", + "N00961\tMutation-activated TRPC3 to mGluR1-TRPC3 signaling pathway\n", + "N00962\tMutation-inactivated ATXN3 to autophagy-vesicle nucleation\n", + "N00963\tRELN-VLDLR-PI3K signaling pathway\n", + "N00964\tDAB1-overexpression to RELN-VLDLR-PI3K signaling pathway\n", + "N00965\tRORA-mediated transcription\n", + "N00966\tMutation-caused aberrant ATXN1 to RORA-mediated transcription\n", + "N00967\tVGCC-Ca2+ -apoptotic pathway\n", + "N00968\tMutation-activated CACNA1A to VGCC-Ca2+ -apoptotic pathway\n", + "N00969\tMutation-inactivated CACNA1A to VGCC-Ca2- -apoptotic pathway\n", + "N00970\tTransport of calcium\n", + "N00971\tMutation-caused aberrant PDYN to transport of calcium\n", + "N00972\tTransport of potassium\n", + "N00973\tMutation-inactivated KCNC3 to transport of potassium\n", + "N00974\tTransport of potassium\n", + "N00975\tMutation-inactivated KCND3 to transport of potassium\n", + "N00976\tRetrograde axonal transport\n", + "N00977\tMutation-caused aberrant Htt to retrograde axonal transport\n", + "N00978\tAnterograde axonal transport\n", + "N00979\tMutation-caused aberrant Htt to anterograde axonal transport\n", + "N00980\tMutation-caused aberrant Htt to REST-mediated transcriptional repression\n", + "N00981\tMutation-caused aberrant Htt to CREB-mediated transcription\n", + "N00982\tMutation-caused aberrant Htt to p53-mediated transcription\n", + "N00983\tMutation-caused aberrant Htt to extrinsic apoptotic pathway\n", + "N00984\tmGluR5-Ca2+ -apoptotic pathway\n", + "N00985\tMutation-caused aberrant Htt to mGluR5-Ca2+ -apoptotic pathway\n", + "N00986\tMutation-caused aberrant Htt to VGCC-Ca2+ -apoptotic pathway\n", + "N00987\tMutation-caused aberrant Htt 
to transport of calcium\n", + "N00988\tElectron transfer in Complex II\n", + "N00989\tMutation-caused aberrant Htt to electron transfer in Complex II\n", + "N00990\tElectron transfer in Complex III\n", + "N00991\tMutation-caused aberrant Htt to electron transfer in Complex III\n", + "N00992\tMutation-caused aberrant Htt to TNF-JNK signaling pathway\n", + "N00993\tMutation-caused aberrant Htt to autophagy-vesicle nucleation\n", + "N00994\tAGE-RAGE signaling pathway\n", + "N00995\tElectron transfer in Complex I\n", + "N00996\tMutation-caused aberrant Abeta to AGE-RAGE signaling pathway\n", + "N00997\tMutation-caused aberrant Abeta to electron transfer in Complex I\n", + "N00998\tElectron transfer in Complex IV\n", + "N00999\tMutation-caused aberrant Abeta to electron transfer in Complex IV\n", + "N01000\tmAChR-Ca2+ -apoptotic pathway\n", + "N01001\tMutation-caused aberrant Abeta to mAchR-Ca2+ -apoptotic pathway\n", + "N01002\tMutation-caused aberrant Abeta to mGluR5-Ca2+ -apoptotic pathway\n", + "N01003\tMutation-caused aberrant Abeta to transport of calcium\n", + "N01004\tMutation-caused aberrant Abeta to VGCC-Ca2+ -apoptotic pathway\n", + "N01005\tMutation-caused aberrant Abeta to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N01006\tMutation-caused aberrant Abeta to VGCC-Ca2+ -apoptotic pathway\n", + "N01007\tMutation-caused aberrant PSEN to mGluR5-Ca2+ -apoptotic pathway\n", + "N01008\tMutation-caused aberrant PSEN1 to mGluR5-Ca2+ -apoptotic pathway\n", + "N01009\tPERK-ATF4 signaling pathway\n", + "N01010\tMutation-caused aberrant PSEN1 to PERK-ATF4 signaling pathway\n", + "N01011\tIRE1a-XBP1 signaling pathway\n", + "N01012\tMutation-caused aberrant PSEN1 to IRE1a-XBP1 signaling pathway\n", + "N01013\tIRE1a-JNK signaling pathway\n", + "N01014\tMutation-caused aberrant Abeta to IRE1a-JNK signaling pathway\n", + "N01015\tATF6-mediated transcription\n", + "N01016\tMutation-caused aberrant PSEN1 to ATF6-mediated transcription\n", + 
"N01017\tMutation-caused aberrant PSEN1 to anterograde axonal transport\n", + "N01018\tMutation-caused aberrant Abeta to anterograde axonal transport\n", + "N01019\tParkin-mediated ubiquitination\n", + "N01020\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01021\tParkin-mediated ubiquitination\n", + "N01022\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01023\tParkin-mediated ubiquitination\n", + "N01024\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01025\tParkin-mediated ubiquitination\n", + "N01026\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01027\tUCHL1-mediated hydrolysis\n", + "N01028\tMutation-inactivated UCHL1 to UCHL1-mediated hydrolysis\n", + "N01029\t26S proteasome-mediated protein degradation\n", + "N01030\tMutation-caused aberrant SNCA to 26S proteasome-mediated protein degradation\n", + "N01031\tMutation-caused aberrant SNCA to VGCC-Ca2+ -apoptotic pathway\n", + "N01032\tMutation-inactivated PRKN to mGluR1 signaling pathway\n", + "N01033\tMutation-caused aberrant SNCA to ATF6-mediated transcription\n", + "N01034\tMutation-caused aberrant SNCA to IRE1a-XBP1 signaling pathway\n", + "N01035\tMutation-caused aberrant SNCA to PERK-ATF4 signaling pathway\n", + "N01037\tMutation-caused aberrant SNCA to L-DOPA generation\n", + "N01039\tMutation-inactivated PRKN to DOPAL generation\n", + "N01040\tTransport of dopamine to synaptic vesicle\n", + "N01041\tMutation-caused aberrant SNCA to transport of dopamine\n", + "N01042\tMutation-caused aberrant SNCA to electron transfer in Complex I\n", + "N01043\tMutation-inactivated PINK1 to electron transfer in Complex I\n", + "N01044\tMPP+ to electron transfer in Complex I\n", + "N01045\tRotenone to electron transfer in Complex I\n", + "N01046\tManeb to electron transfer in Complex III\n", + "N01047\tMutation-activated LRRK2 to intrinsic apoptotic pathway\n", + "N01048\tMutation-inactivated PINK1 to intrinsic apoptotic pathway\n", + 
"N01049\tMutation-inactivated PRKN to intrinsic apoptotic pathway\n", + "N01050\tMutation-inactivated PINK1 to intrinsic apoptotic pathway\n", + "N01051\tMutation-inactivated DJ1 to intrinsic apoptotic pathway\n", + "N01052\tPINK1-Parkin-mediated MFN2 degradation\n", + "N01053\tMutation-inactivated PINK1 to PINK1-Parkin-mediated MFN2 degradation\n", + "N01054\tMutation-inactivated PRKN to PINK1-Parkin-mediated MFN2 degradation\n", + "N01055\tMutation-caused aberrant SNCA to anterograde axonal transport\n", + "N01056\tFAS-JNK signaling pathway\n", + "N01057\tMutation-inactivated DJ1 to FAS-JNK signaling patwhay\n", + "N01058\tMutation-inactivated DJ1 to to p53-mediated transcription\n", + "N01059\tMutation-inactivated DJ1 to KEAP1-NRF2 signaling pathway\n", + "N01060\tMutation-caused aberrant Abeta to 26S proteasome-mediated protein degradation\n", + "N01061\tMutation-caused aberrant Htt to 26S proteasome-mediated protein degradation\n", + "N01062\tMutation-activated MET to RAS-ERK signaling pathway\n", + "N01063\tMutation-activated MET to PI3K signaling pathway\n", + "N01064\tMutation-activated RET to RAS-ERK signaling pathway\n", + "N01065\tMutation-activated RET to PI3K signaling pathway\n", + "N01066\tARNO-ARF-ACTB_G signaling pathway\n", + "N01067\tShigella IpgD to ARNO-ARF-ACTB_G signaling pathway\n", + "N01068\tITGA/B-FAK-RAC signaling pathway\n", + "N01069\tShigella IpgB1 to ITGA/B-FAK-RAC signaling pathway\n", + "N01070\tITGA/B-FAK-CDC42 signaling pathway\n", + "N01071\tShigella IpgB1 to ITGA/B-FAK-CDC42 signaling pathway\n", + "N01072\tITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01073\tShigella IpgB2 to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01074\tShigella IpaA to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01075\tShigella IcsB to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01076\tShigella IcsB to ITGA/B-FAK-CDC42 signaling pathway\n", + "N01077\tShigella IcsB to ITGA/B-FAK-RAC signaling pathway\n", + "N01078\tEGF-EGFR-Actin signaling pathway\n", 
+ "N01079\tShigella IpaC to Actin signaling pathway\n", + "N01080\tITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01081\tShigella IpaB/C/D to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01082\tShigella IpaA to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01083\tShigella OspE to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01084\tEscherichia EspG to ARNO-ARF-ACTB/G signaling pathway\n", + "N01085\tEscherichia EspG to ARNO-ARF-ACTB/G signaling pathway\n", + "N01086\tEscherichia EspT to RAC signaling pathway\n", + "N01087\tEscherichia EspW to RAC signaling pathway\n", + "N01088\tEscherichia EspH to LPA-GNA12/13-RhoA signaling pathway\n", + "N01089\tEscherichia EspM to LPA-GNA12/13-Rho signaling pathway\n", + "N01090\tIGG-FCGR-RAC signaling pathway\n", + "N01091\tEscherichia EspJ to IGG-FCGR-RAC signaling pathway\n", + "N01092\tEscherichia Eae/Tir to Actin signaling pathway\n", + "N01093\tEscherichia EspJ/Tir to Actin signaling pathway\n", + "N01094\tEscherichia Eae/Tir/TccP to Actin signaling pathway\n", + "N01095\tEscherichia Map to LPA-GNA12/13-RhoA signaling pathway\n", + "N01096\tEscherichia Map to CDC42 signaling pathway\n", + "N01097\tLPA-GNA12/13-RhoA signaling pathway\n", + "N01098\tYersinia YopT to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01099\tYersinia YopE to RhoA signaling pathway\n", + "N01100\tYersinia YopE to ITGA/B-RHOG-RAC signaling pathway\n", + "N01101\tYersinia YopT to ITGA/B-RHOG-RAC signaling pathway\n", + "N01102\tYersinia YopE to ITGA/B-RHOG-RAC signaling pathway\n", + "N01103\tYersinia YpkA to IGG-FCGR-RAC signaling pathway\n", + "N01104\tLPA-GNAQ/11-RhoA signaling pathway\n", + "N01105\tYersinia YpkA to LPA-GNAQ-RhoA signaling pathway\n", + "N01106\tTCR-PLCG-ITPR signaling pathway\n", + "N01107\tYersinia YopH to TCR-NFAT signaling pathway\n", + "N01108\tYersinia YopH to TCR-NFAT signaling pathway\n", + "N01109\tYersinia YopH to ITGA/B-FAK-RAC signaling pathway\n", + "N01110\tYersinia YopH to ITGA/B-FAK-RAC signaling 
pathway\n", + "N01111\tYersinia YopH to ITGA/B-FAK-RAC signaling pathway\n", + "N01112\tSalmonella SopE/E2 to NOD-NFKB signaling pathway\n", + "N01113\tSalmonella SseK1 to TNF-NFKB signaling pathway\n", + "N01114\tSalmonella SseK3 to TNF-NFKB signaling pathway\n", + "N01116\tSalmonella SseL to TNF-NFKB signaling pathway\n", + "N01117\tSalmonella GogB to TNF-NFKB signaling pathway\n", + "N01118\tSalmonella SpvD to TNF-NFKB signaling pathway\n", + "N01119\tRAC/CDC42-PAK-ERK signaling pathway\n", + "N01120\tSalmonella SptP to RAC/CDC42-PAK-ERK signaling pathway\n", + "N01121\tSalmonella SpvC to ERK signaling pathway\n", + "N01122\tSalmonella PipA/GogA/GtgA to TNF-NFKB signaling pathway\n", + "N01123\tSalmonella AvrA to TNF-NFKB signaling pathway\n", + "N01124\tSalmonella AvrA to beta-catenin signaling pathway\n", + "N01125\tSalmonella AvrA to TNF-JNK signaling pathway\n", + "N01126\tSalmonella SipB to Inflammasome signaling pathway\n", + "N01127\tSalmonella SopE to Inflammasome signaling pathway\n", + "N01128\tSalmonella SopE to RAC signaling pathway\n", + "N01129\tSalmonella SopB to ARNO-ARF-ACTB/G signaling pathway\n", + "N01130\tSalmonella SopB to RhoA signaling pathway\n", + "N01131\tSalmonella SopE/E2 to RhoA signaling pathway\n", + "N01132\tSalmonella SopE/E2 to RhoG signaling pathway\n", + "N01133\tSalmonella SopB to RhoG signaling pathway\n", + "N01134\tSalmonella SopB to CDC42 signaling pathway\n", + "N01135\tMutation-caused aberrant SOD1 to intrinsic apoptotic pathway\n", + "N01136\tMutation-caused aberrant TDP43 to electron transfer in Complex I\n", + "N01137\tPINK-Parkin-mediated autophagosome formation\n", + "N01138\tMutation-inactivated OPTN to PINK-Parkin-mediated autophagosome formation\n", + "N01139\tMutation-inactivated p62 to PINK-Parkin-mediated autophagosome formation\n", + "N01140\tTBK1-mediated autophagosome formation\n", + "N01141\tMutation-inactivated TBK1 to TBK1-mediated autophagosome formation\n", + "N01142\tC9orf72-mediated autophagy 
initiation\n", + "N01143\tMutation-inactivated C9orf72 to C9orf72-mediated autophagy initiation\n", + "N01144\tMutation-caused aberrant SOD1 to 26S proteasome-mediated protein degradation\n", + "N01145\tMutation-inactivated VCP to 26S proteasome-mediated protein degradation\n", + "N01146\tMutation-inactivated UBQLN2 to 26S proteasome-mediated protein degradation\n", + "N01147\tMutation-caused aberrant SOD1 to ATF6-mediated transcription\n", + "N01148\tMutation-caused aberrant SOD1 to IRE1a-XBP1 signaling pathway\n", + "N01149\tMutation-caused aberrant SOD1 to PERK-ATF4 signaling pathway\n", + "N01150\tMutation-inactivated VAPB to ATF6-mediated transcription\n", + "N01151\tMutation-inactivated SIGMAR1 to Ca2+ -apoptotic pathway\n", + "N01152\tNuclear export of mRNA\n", + "N01153\tMutation-caused aberrant GLE1 to nuclear export of mRNA\n", + "N01154\tTDP-43-regulated splicing\n", + "N01155\tMutation-caused aberrant TDP43 to TDP-43-regulated splicing\n", + "N01156\tFUS-regulated splicing\n", + "N01157\tMutation-caused aberrant FUS to FUS-regulated splicing\n", + "N01158\tMutation-caused aberrant DCTN1 to retrograde axonal transport\n", + "N01159\tMutation-caused aberrant TUBA4A to retrograde axonal transport\n", + "N01160\tMutation-caused aberrant SOD1 to retrograde axonal transport\n", + "N01161\tActin polymerization\n", + "N01162\tMutation-caused aberrant PFN1 to actin polymerization\n", + "N01163\tNRG-ERBB4-PI3K signaling pathway\n", + "N01164\tMutation-inactivated ERBB4 to NRG-ERBB4-PI3K signaling pathway\n", + "N01165\tPDL/PD1-SHP-PI3K signaling pathway\n", + "N01197\tScrapie conformation PrPSc to 26S proteasome-mediated protein degradation\n", + "N01198\tScrapie conformation PrPSc to PERK-ATF4 signaling pathway\n", + "N01199\tScrapie conformation PrPSc to mGluR5-Ca2+ -apoptotic pathway\n", + "N01200\tScrapie conformation PrPSc to transport of calcium\n", + "N01201\tScrapie conformation PrPSc to VGCC-Ca2+ -apoptotic pathway\n", + "N01202\tOligomeric conformation 
PrPc to anterograde axonal transport\n", + "N01203\tScrapie conformation PrPSc to Notch singling pathway\n", + "N01204\tPRNP-PI3K-NOX2 signaling pathway\n", + "N01205\tScrapie conformation PrPSc to PRNP-PI3K-NOX2 signaling pathway\n", + "N01282\tRegulation of CAV1.1\n", + "N01283\tShigella OspF to TLR2/4-MAPK signaling pathway\n", + "N01284\tShigella IcsP to Autophagy-vesicle elongation\n", + "N01285\tMicrotubule-RHOA signaling pathway\n", + "N01286\tEscherichia EspG to Microtubule-RHOA signaling pathway\n", + "N01287\tTight junction-Actin signaling pathway\n", + "N01288\tEscherichia EspF to Tight junction-Actin signaling pathway\n", + "N01289\tCOPII vesicle formation\n", + "N01290\tEscherichia NleA to COPII vesicle formation\n", + "N01291\tTRAPPI-RAB1 signaling pathway\n", + "N01292\tEscherichia EspG to RAB1 signaling pathway\n", + "N01293\tCOPI vesicle formation\n", + "N01294\tEscherichia NleF to COPI vesicle formation\n", + "N01295\tRab7-regulated microtubule minus-end directed transport\n", + "N01296\tSalmonella SopD2 to Rab7-regulated microtubule minus-end directed transport\n", + "N01297\tArl8-regulated microtubule plus-end directed transport\n", + "N01298\tSalmonella SifA to microtubule plus-end directed transport\n", + "N01299\tSalmonella PipB2 to microtubule plus-end directed transport\n", + "N01300\tTethering of late endosomes and lysosomes\n", + "N01301\tSalmonella SifA to Tethering of late endosomes and lysosomes\n", + "N01302\tEarly endosomal fusion\n", + "N01303\tSalmonella SopB to Early endosomal fusion\n", + "N01304\tANXA2-S100A10-regulated actin cytoskeleton\n", + "N01305\tSalmonella SopB to ANXA2-S100A10-regulated actin cytoskeleton\n", + "N01306\tAngII-AT1R-NOX2 signaling pathway\n", + "N01307\tSARS-CoV-2 S to AngII-AT1R-NOX2 signaling pathway\n", + "N01308\tMDA5-IRF7/3 signaling pathway\n", + "N01309\tSARS-CoV-2 nsp3 to MDA5-IRF7/3 signaling pathway\n", + "N01310\tSARS-CoV-2 nsp13 to RIG-I-IRF7/3 signaling pathway\n", + "N01312\tSARS-CoV-2 S to 
lectin pathway of complement cascade\n", + "N01314\tSARS-CoV-2 S to classical pathway of complement cascade\n", + "N01315\tLectin pathway of coagulation cascade, prothrombin to thrombin\n", + "N01316\tSARS-CoV-2 S/N to lectin pathway of coagulation cascade\n", + "N01317\tTranslation initiation\n", + "N01318\tSARS-CoV-2 nsp1 to translation initiation\n", + "N01319\tSARS-CoV-2 nsp6 and ORF6 to RIG-I-IRF7/3 signaling pathway\n", + "N01320\tSARS-CoV-2 nsp3 to RIG-I-IRF7/3 signaling pathway\n", + "N01321\tSARS-CoV-2 nsp1/6/13, ORF3a/6/7b and M to IFN signaling pathway\n", + "N01322\tSARS-CoV-2 nsp6/13 and ORF7a/7b to IFN signaling pathway\n", + "N01336\tCHRNA7-E2F signaling pathway\n", + "N01337\tNNK/NNN to CHRNA7-E2F signaling pathway\n", + "N01338\tACH-CHRN-PI3K signaling pathway\n", + "N01339\tNNK/NNN to PI3K signaling pathway\n", + "N01340\tACH-CHRN-JAK-STAT signaling pathway\n", + "N01341\tNNK/NNN to Jak-STAT signaling pathway\n", + "N01342\tNicotine to Jak-STAT signaling pathway\n", + "N01343\tACH-CHRN-RAS-ERK signaling pathway\n", + "N01344\tNNK/NNN to RAS-ERK signaling pathway\n", + "N01345\tEP/NE-ADRB-cAMP signaling pathway\n", + "N01346\tNicotine/NNK to cAMP signaling pathway\n", + "N01347\tEP/NE-ADRB-PI3K signaling pathway\n", + "N01348\tNicotine/NNK to PI3K signaling pathway\n", + "N01349\tACH-CHRN-PI3K signaling pathway\n", + "N01350\tNNK/NNN to PI3K signaling pathway\n", + "N01351\tE2-ER-RAS-ERK signaling pathway\n", + "N01352\tBPA to RAS-ERK signaling pathway\n", + "N01353\tE2 to RAS-ERK signaling pathway\n", + "N01354\tBPA to RAS-ERK signaling pathway\n", + "N01355\tArsenic to PI3K signaling pathway\n", + "N01356\tMembrane-initiated progesterone signaling pathway\n", + "N01357\tP4/MPA to membrane-initiated progesterone signaling pathway\n", + "N01358\tP4-PR-PI3K signaling pathway\n", + "N01359\tP4/MPA to PR-PI3K signaling pathway\n", + "N01360\tP4-PR-RAS-ERK signaling pathway\n", + "N01361\tP4/MPA to PR-RAS-ERK signaling pathway\n", + 
"N01362\tNuclear-initiated progesterone signaling pathway\n", + "N01363\tP4/MPA to nuclear-initiated progesterone signaling pathway\n", + "N01364\tE2 to nuclear-initiated estrogen signaling pathway\n", + "N01365\tTCDD to Ahr signaling pathway\n", + "N01366\tBaP to Ahr signaling pathway\n", + "N01367\tPCB to Ahr signaling pathway\n", + "N01368\tHCB to Ahr signaling pathway\n", + "N01369\t4-ABP to DNA adducts\n", + "N01370\tPhIP to DNA adducts\n", + "N01371\tPhIP to DNA adducts\n", + "N01372\tIQ to DNA adducts\n", + "N01373\tMeIQx to DNA adducts\n", + "N01374\tBaP to DNA adducts\n", + "N01375\tDMBA to DNA adducts\n", + "N01376\tMelphalan to DNA adducts/cross-links\n", + "N01377\tThiotepa to DNA adducts/cross-links\n", + "N01378\tAFB1 to DNA adducts\n", + "N01379\tNNK to DNA adducts\n", + "N01380\tNNK to DNA adducts\n", + "N01381\tNNK to DNA adducts\n", + "N01382\tNNK to DNA adducts\n", + "N01383\tNDMA to DNA adducts\n", + "N01384\tEO to DNA adducts\n", + "N01385\tVC to DNA adducts\n", + "N01386\tDCE to DNA adducts\n", + "N01387\tSM to DNA adducts/cross-links\n", + "N01388\tSOD/Cat-mediated ROS neutralization\n", + "N01389\tLead to SOD/Cat-mediated ROS neutralization\n", + "N01390\tp,p'-DDT to SOD/Cat-mediated ROS neutralization\n", + "N01391\tLead to SOD/Cat-mediated ROS neutralization\n", + "N01392\tArsenic to electron transfer in complex II\n", + "N01393\tArsenic to electron transfer in complex II\n", + "N01394\tArsenic to electron transfer in complex IV\n", + "N01395\tCadmium to electron transfer in complex III\n", + "N01396\t4-Aminobiphenyl to CYP-mediated metabolism\n", + "N01397\t4-Aminobiphenyl to CYP-mediated metabolism\n", + "N01398\tPentachlorophenol to CYP-mediated metabolism\n", + "N01399\tBenzene to CYP-mediated metabolism\n", + "N01400\tBenzene to CYP-mediated metabolism\n", + "N01401\tBenzo[a]pyrenre to CYP-mediated metabolism\n", + "N01402\tManganese to electron transfer in Complex II\n", + "N01403\tZn to anterograde axonal transport\n", + 
"N01404\t17beta-estradiol to CYP-mediated metabolism\n", + "N01405\t17beta-estradiol to CYP-mediated metabolism\n", + "N01406\tEthanol to CYP-mediated metabolism\n", + "N01407\tMetals to JNK signaling pathway\n", + "N01408\tMetals to RAS-ERK signaling pathway\n", + "N01409\tMetals to PI3K signaling pathway\n", + "N01410\tMetals to NFKB signaling pathway\n", + "N01411\tMetals to NFKB signaling pathway\n", + "N01412\tMetals to HTF-1 signaling pathway\n", + "N01413\tMetals to KEAP1-NRF2 signalig pathway\n", + "N01414\tIron to anterograde axonal transport\n", + "N01415\tNEP-mediated Abeta degradation\n", + "N01416\tMercury to NEP-mediated Abeta degradation\n", + "N01417\tParaquat to FAS-JNK signaling pathway\n", + "N01418\tPurine salvage pathway, adenine to AMP\n", + "N01419\tAPRT deficiency in purine salvage pathway\n", + "N01420\tAPRT deficiency in adenine metabolism\n", + "N01421\tPurine salvage pathway, hypoxanthine/guanine to IMP/GMP\n", + "N01422\tHPRT1 deficiency in purine salvage pathway\n", + "N01423\tHPRT1 deficiency in hypoxanthine metabolism\n", + "N01424\tHPRT1 deficiency in guanine metabolism\n", + "N01425\tGlobal genome NER\n", + "N01426\tBMP9/10 signaling pathway\n", + "N01427\tWNT5A-ROR signaling pathway\n", + "N01428\tBMP signaling pathway, BMP antagonist\n", + "N01429\tCytosolic Ca2+ removal, PMCA\n", + "N01430\tTranscription-coupled NER\n", + "N01431\tCore NER reaction\n", + "N01432\tMismatch repair\n", + "N01433\tBase excision and strand cleavage by monofunctional glycosylase\n", + "N01434\tBase excision and strand cleavage by bifunctional glycosylase\n", + "N01435\tBase excision and strand cleavage by NEIL glycosylase\n", + "N01436\tLong patch BER\n", + "N01437\tShort patch BER\n", + "N01438\tMitochondrial BER\n", + "N01439\tDouble-strand break signaling\n", + "N01440\tWnt signaling modulation, LGR/RSPO\n", + "N01441\tWnt signaling modulation, SOST/LRP4\n", + "N01442\tWnt signaling modulation, Wnt inhibitor\n", + "N01443\tWnt signaling modulation, 
Wnt acylation\n", + "N01444\tNXN mutation to WNT5A-ROR signaling pathway\n", + "N01445\tNon-homologous end-joining\n", + "N01446\tDNA end resection and RPA loading\n", + "N01447\tDouble Holliday junction dissolution\n", + "N01448\tDouble Holliday junction resolution\n", + "N01449\tSynthesis-dependent strand annealing\n", + "N01450\tBreak induced replication\n", + "N01451\tATR signaling\n", + "N01452\tHomologous recombination\n", + "N01453\tBMP signaling pathway\n", + "N01454\tAMH signaling pathway\n", + "N01455\tBMP15 signaling pathway\n", + "N01456\tActivin signaling pathway\n", + "N01457\tMyostatin signaling pathway\n", + "N01458\tBMP-HAMP signaling pathway\n", + "N01459\tNodal signaling pathway\n", + "N01460\tPlasmin mediated activation of latent TGF-beta\n", + "N01461\tBMP-HAMP signaling pathway, auxiliary factor\n", + "N01462\tBMP9/10 signaling pathway, BMP9/10 coreceptor\n", + "N01464\tFanconi anemia pathway\n", + "N01465\tLesion bypass by TLS and DSB formation\n", + "N01466\tHomologous recombination in ICLR\n", + "N01467\tV(D)J recombination\n", + "N01468\tDNA replication licensing\n", + "N01469\tCdt1 downregulation\n", + "N01470\tPre-IC formation\n", + "N01471\tOrigin unwinding and elongation\n", + "N01472\tOkazaki fragment maturation\n", + "N01473\tDNA replication termination\n", + "N01474\tTRAIP-dependent replisome disassembly\n", + "N01475\tTelomerase RNA maturation\n", + "N01476\tAssembly and trafficking of telomerase\n", + "N01477\tTelomere elongation\n", + "N01478\tNotch proteolytic activation\n", + "N01479\tNotch ligand ubiquitylation\n", + "N01480\tNotch-HES7 signaling\n", + "N01481\tNotch-MESP2 signaling\n", + "N01482\tCohesin loading\n", + "N01483\tCohesin acetylation\n", + "N01484\tEstablishment of cohesion\n", + "N01485\tCohesin dissociation in prophase\n", + "N01486\tCohesin dissociation in anaphase\n", + "N01487\tClassical pathway of complement cascade, C4/C2 to C3 convertase formation\n", + "N01489\tClassical/Lectin pathway of complement 
cascade, C5 convertase formation\n", + "N01490\tCommon pathway of complement cascade, MAC formation\n", + "N01491\tLectin pathway of complement cascade, C4/C2 to C3 convertase formation\n", + "N01493\tAlternative pathway of complement cascade, C3 convertase formation\n", + "N01494\tAlternative pathway of complement cascade, C3/5 convertase formation\n", + "N01495\tClassical/Lectin pathway of complement cascade, C4b breakdown\n", + "N01496\tAlternative pathway of complement cascade, C3b breakdown\n", + "N01497\tCondensin loading\n", + "N01498\tInhibition of condensin II association\n", + "N01499\tModifying of condensin II subunits\n", + "N01500\tModifying of condensin I subunits\n", + "N01501\tInactivation of condensin I\n", + "N01502\tLectin pathway of coagulation cascade, fibrinogen to fibrin\n", + "N01503\tExtrinsic pathway of coagulation cascade, F7 activation\n", + "N01504\tRegulation of complement cascade, CFHR\n", + "N01505\tRegulation of complement cascade, MAC inhibition\n", + "N01506\tIntrinsic pathway of coagulation cascade, F12 activation\n", + "N01507\tIntrinsic pathway of coagulation cascade, F11 activation\n", + "N01508\tIntrinsic pathway of coagulation cascade, F9 activation\n", + "N01509\tIntrinsic pathway of coagulation cascade, F8 activation\n", + "N01510\tCommon pathway of coagulation cascade, F10 activation\n", + "N01511\tCommon pathway of coagulation cascade, F5 activation\n", + "N01512\tCommon pathway of coagulation cascade, prothrombin activation\n", + "N01513\tCommon pathway of coagulation cascade, fibrinogen to fibrin\n", + "N01514\tCommon pathway of coagulation cascade, F13 activation\n", + "N01515\tRegulation of coagulation cascade, protein C system\n", + "N01516\tKallikrein-kinin system, prekallikrein activation\n", + "N01517\tKallikrein-kinin system, HMWK to bradykinin\n", + "N01518\tFibrinolytic system\n", + "N01519\tRegulation of coagulation cascade, AT3\n", + "N01520\tRegulation of fibrinolytic system, C1INH\n", + "N01521\tRegulation 
of coagulation cascade, HCF2\n", + "N01522\tRegulation of fibrinolytic system, AAP\n", + "N01523\tRegulation of fibrinolytic system, AAT\n", + "N01524\tRegulation of fibrinolytic system, PAI\n", + "N01525\tOrganization of the inner kinetochore\n", + "N01526\tOrganization of the outer kinetochore\n", + "N01527\tKSHV Kaposin to classical/Lectin pathway of complement cascade, C4b breakdown\n", + "N01528\tKSHV Kaposin to alternative pathway of complement cascade, C3b breakdown\n", + "N01529\tRecruitment and formation of the MCC\n", + "N01530\tDopamine metabolism\n", + "N01531\tCENPE interaction with NDC80 complex\n", + "N01532\tKinetochore targeting of MAD1-MAD2\n", + "N01533\tDisassembly of MCC\n", + "N01534\tDynein recruitment to the kinetochore\n", + "N01535\tKinetochore microtubule attachment\n", + "N01536\tDephosphorylation of kinetochore\n", + "N01537\tHedgehog signaling pathway, HH ligand secretion\n", + "N01538\tHedgehog signaling pathway, PTCH coreceptor\n", + "N01539\tRAD51 -dsDNA destabilization\n", + "N01540\tEstrogen biosynthesis\n", + "N01541\tTestosterone biosynthesis\n", + "N01542\tPKA holoenzyme\n", + "N01543\tTLR7/8/9-IRF5 signaling pathway\n", + "N01544\tMicrotubule nucleation\n", + "N01545\tRegulation of TNF-NFKB signaling pathway, LUBAC-mediated linear ubiquitination\n", + "N01546\tRegulation of TNF-NFKB signaling pathway, OTULIN/TNFAIP3-mediated deubiquitination\n", + "N01547\tKinetochore fiber organization\n", + "N01548\tKinetochore-fiber stabilization\n", + "N01549\tBranching microtubule nucleation\n", + "N01550\tAdrenaline metabolism\n", + "N01551\tSerotonin metabolism\n", + "N01552\tEumelanin biosynthesis\n", + "N01553\tPromotion of microtubule growth\n", + "N01554\tIL2 family to Jak-STAT signaling pathway\n", + "N01555\tHormone-like-cytokine to Jak-STAT signaling pathway\n", + "N01556\tIL6 family to Jak-STAT signaling pathway\n", + "N01557\tIL12/23 to Jak-STAT signaling pathway\n", + "N01558\tType I interferon to Jak-STAT signaling 
pathway\n", + "N01559\tType II interferon to Jak-STAT signaling pathway\n", + "N01560\tRegulation of type I interferon to Jak-STAT signaling pathway, USP18\n", + "N01561\tMicrotubule depolymerization\n", + "N01562\tMicrotubule depolymerization at the minus ends\n", + "N01563\tInhibition of Kif2A\n", + "N01564\tPost-translational modifications of RIG-I and MDA5\n", + "N01565\tAdenosine-to-inosine RNA editing by ADAR\n", + "N01566\tTLR5-NFKB signaling pathway\n", + "N01567\tNLRP1 inflammasome signaling pathway\n", + "N01568\tRegulation of NLRP3 inflammasome signaling pathway, NLRP3 inhibition\n", + "N01569\tNALP12 inflammasome signaling pathway\n", + "N01570\tRegulation of Pyrin inflammasome signaling pathway, PSTPIP1\n", + "N01571\tDNA degradation by extracellular/endolysosomal DNAse\n", + "N01572\tRNASEH2-mediated RNA degradation in RNA-DNA hybrids\n", + "N01573\tSAMHD1-mediated dNTP degradation\n", + "N01574\tGlycosaminoglycan biosynthesis, linkage tetrasaccharide\n", + "N01575\tTSC1/2-mTORC1 signaling pathway\n", + "N01576\tSTRAD/STK11- TSC signaling pathway\n", + "N01577\tGene silencing by methylation of H3K27 and ubiquitination of H2AK119\n", + "N01578\tGATOR1-mTORC1 signaling pathway\n", + "N01579\tCD80/CD86-CTLA4-PP2A signaling pathway\n", + "N01580\tChondroitin sulfate biosynthesis\n", + "N01581\tDermatan sulfate biosynthesis\n", + "N01582\tHeparan sulfate biosynthesis\n", + "N01583\tRegulation of extrinsic apoptotic pathway, XIAP\n", + "N01584\tFLCN-mTORC1 signaling pathway\n", + "N01585\tDeubiquitination of H2AK119\n", + "N01586\tActivation of PRC2.2 by ubiquitination of H2AK119\n", + "N01587\tFe-TF transport\n", + "N01588\tFe3+ Ferritin transport\n", + "N01589\tGlutathione biosynthesis\n", + "N01590\tArachidonate/Adrenic acid metabolism\n", + "N01591\tFe2+ Ferroportin transport\n", + "N01592\tGF-RTK-RAS-ERK signaling pathway\n", + "N01593\tRegulation of GF-RTK-RAS-ERK signaling, PTP\n", + "N01594\tMLK-JNK signaling pathway\n", + "N01595\tRegulation of 
GF-RTK-RAS-ERK signaling pathway, adaptor proteins\n", + "N01596\tRegulation of GF-RTK-RAS-ERK signaling, RAS ubiquitination by CUL3 complex\n", + "N01597\tRegulation of GF-RTK-RAS-ERK signaling, SPRED and NF1\n", + "N01598\tRegulation of GF-RTK-RAS-ERK signaling, MRAS-SHOC2-PP1 holophosphatase\n", + "N01599\tRegulation of GF-RTK-RAS-ERK signaling, ubiquitination of RTK by CBL\n", + "N01600\tRegulation of GF-RTK-RAS-ERK signaling, RasGAP\n", + "N01601\tERK-RSK signaling\n", + "N01602\tERK-MYC signaling pathway\n", + "N01603\tPyruvate oxidation\n", + "N01604\tCitrate cycle, first carbon oxidation\n", + "N01605\tGluconeogenesis\n", + "N01606\tGlycolysis\n", + "N01607\tMethionine degradation\n", + "N01608\tSerine biosynthesis\n", + "N01609\tCitrate cycle, second carbon oxidation 1\n", + "N01610\tDihydrolipoamide dehydrogenase\n", + "N01611\tGlycine cleavage system\n", + "N01612\tCreatine pathway\n", + "N01613\tGlycine cleavage system, Gly to MTHF\n", + "N01614\tActivation of PRC2.2 by ubiquitination of H2AK119 in germline genes\n", + "N01615\tTransport of creatine\n", + "N01616\tDihydrolipoamide dehydrogenase\n", + "N01617\tCitrate cycle, second carbon oxidation 2\n", + "N01618\tProline biosynthesis, Orn to Pro\n", + "N01619\tBranched-chain amino acids degradation 2\n", + "N01620\tBlocking ubiquitination of H2AK119 by CK2\n", + "N01621\tTNF-RIPK1/3 signaling pathway\n", + "N01622\tProline degradation\n", + "N01623\tSpermine biosynthesis\n", + "N01624\tCholesterol biosynthesis\n", + "N01625\tCYLD regulation of RIPK1/3\n", + "N01626\tCholecalciferol biosynthesis\n", + "N01627\tAdenosine phosphorylation\n", + "N01628\tCysteine biosynthesis\n", + "N01629\tRemethylation, THF to 5-MTHF\n", + "N01630\tRemethylation, Hcy to Met\n", + "N01631\tTNFSF10-RIPK1/3 signaling pathway\n", + "N01632\tFASLG-RIPK1/3 signaling pathway\n", + "N01633\tTLR3-RIPK3 signaling pathway\n", + "N01634\tTLR4-RIPK3 signaling pathway\n", + "N01635\tMevalonate pathway\n", + "N01636\tLoading of the 
SMC5-SMC6 complex\n", + "N01637\tCa2+ entry, Voltage-gated Ca2+ channel\n", + "N01638\tSkeletal-type VGCC-RYR signaling\n", + "N01639\tCardiac-type VGCC-RYR signaling\n", + "N01640\tGPCR-PLCB-ITPR signaling pathway\n", + "N01641\tRTK-PLCG-ITPR signaling pathway\n", + "N01642\tCa2+ entry, Ligand-gated Ca2+ channel\n", + "N01643\tCa2+ entry, Store-operated Ca2+ channel\n", + "N01644\tLysosomal Ca2+ release\n", + "N01645\tCytosolic Ca2+ removal, SERCA\n", + "N01646\tRegulation of SERCA\n", + "N01647\tCa2+/CAM-CN signaling pathway\n", + "N01648\tCa2+/CAM-CAMK signaling pathway\n", + "N01649\tCa2+/CAM-VGCC/RYR signaling pathway\n", + "N01650\tSQSTM1 regulation of RIPK1/3\n", + "N01651\tBlood group H (O) antigen type 1 biosynthesis\n", + "N01652\tBlood group A antigen type 1 biosynthesis\n", + "N01653\tBlood group B antigen type 1 biosynthesis\n", + "N01654\tForssman blood group antigen biosynthesis\n", + "N01655\tCa2+-PLCD-ITPR signaling pathway\n", + "N01656\tGF-RTK-PI3K signaling pathway\n", + "N01657\tGPCR-PI3K signaling pathway\n", + "N01658\tGF-RTK-RAS-PI3K signaling pathway\n", + "N01659\tLewis b antigen biosynthesis\n", + "N01660\tLewis a antigen biosynthesis\n", + "N01661\tSialyl lewis a antigen biosynthesis\n", + "N01662\tIFN-RIPK1/3 signaling pathway\n", + "N01663\tCASP8 regulation of RIPK1/3\n", + "N01664\tBlood group A/B Lewis b antigen biosynthesis\n", + "N01666\tBlood group H (O) antigen type 2 biosynthesis\n", + "N01667\tBlood group A antigen type 2 biosynthesis\n", + "N01668\tBlood group B antigen type 2 biosynthesis\n", + "N01669\tBlood group A/B Lewis y antigen biosynthesis\n", + "N01670\tBlood group antigen type 3 biosynthesis\n", + "N01672\tLewis x antigen biosynthesis\n", + "N01673\tLewis y antigen biosynthesis\n", + "N01674\tSialyl lewis x antigen biosynthesis\n", + "N01675\tSID blood group Sd(a) antigen biosynthesis\n", + "N01676\tP1 antigen biosynthesis\n", + "N01677\tPX2 antigen biosynthesis\n", + "N01678\tIi blood group antigen biosynthesis\n", 
+ "N01679\tPk and P antigens biosynthesis\n", + "N01680\tNOR antigen biosynthesis\n", + "N01682\tBlood group A antigen type 4 (Globo-A) biosynthesis\n", + "N01683\tOh (Bombay), deficiency of ABH antigens\n", + "N01684\tLipoic acid biosynthesis\n", + "N01685\tLysine degradation 1\n", + "N01686\tLysine degradation 2\n", + "N01687\tLysine degradation 3\n", + "N01688\tADRB3-UCP1 signaling pathway\n", + "N01689\tFUT2 nonsecretor\n", + "N01690\tBlood group H antigen type 4 (Globo-H) biosynthesis\n", + "N01691\tmitochondrial complex - UCP1 in Thermogenesis\n", + "N01695\tBCR-BCAP/CD19-PI3K signaling pathway\n", + "N01696\tICOSLG/ICOS-PI3K signaling pathway\n", + "N01697\tP/PX2 negative, Pk positive\n", + "N01698\tP1/Pk/P/NOR all negative (P null)\n", + "N01699\tP1 negative\n", + "N01700\tLewis negative, Le (a-b-)\n", + "N01701\tTranscriptional activation by acetylation of H3K27\n", + "N01702\tSd(a) negative\n", + "N01703\tBlood group B antigen type 4 (Globo-B) biosynthesis\n", + "N01704\tI negative (adult i)\n", + "N01708\tINS-AKT signaling pathway\n", + "N01709\tHydrolysis of globoside\n", + "N01710\tHydrolysis of ganglioside\n", + "N01711\tHydrolysis of GA1\n", + "N01712\tHydrolysis of psychosine\n", + "N01713\tGM2A activation of HEXA and HEXB\n", + "N01714\tLoss of GM2A activation\n", + "N01715\tAutophagy-vesicle nucleation/elongation/maturation, PI3P synthesis by PI3KC3-C1\n", + "N01716\tAutophagy-vesicle nucleation/elongation/maturation, sequestosome-1-like receptor\n", + "N01717\tRegulation of autophagy-vesicle nucleation/elongation/maturation, ATXN3\n", + "N01718\tAutophagy-vesicle nucleation/elongation/maturation, PACER-RUBCN-PI3KC3-C2\n", + "N01719\tAutophagy-vesicle nucleation/elongation/maturation, E3 ubiquitin-ligase Malin\n", + "N01720\tAutophagosome and lysosome fusion, trans-SNARE\n", + "N01721\tAutophagosome and lysosome fusion, tethering factor\n", + "N01722\tAutophagosome and lysosome fusion, tethering factor, GRASP55\n", + "N01723\tNAD biosynthesis\n", 
+ "N01724\tNAD+ phosphorylation\n", + "N01725\tTetrahydrofolate biosynthesis\n", + "N01726\tFolate cycle\n", + "N01727\tHistidine degradation\n", + "N01729\tHistamine biosynthesis\n", + "N01741\tCa2+/TRPC3 signaling pathway\n", + "N01743\tRenin-angiotensin signaling pathway\n", + "N01746\tCCR/CXCR-GNB/G-PI3K signaling pathway\n", + "N01747\tFind-me signal (nucleotide)\n", + "N01748\tFind-me signal (LPC)\n", + "N01749\tFind-me signal (CX3CL1)\n", + "N01750\tFind-me signal (S1P)\n", + "N01751\tMacrophage EPO signaling\n", + "N01752\tTranslocation of phosphatidylserine to the inner leaflet\n", + "N01753\tExposure of phosphatidylserine to the outer leaflet\n", + "N01754\tActivation of XKR8\n", + "N01756\tPINK-Parkin-independent ubiquitin-mediated mitophagy\n", + "N01757\tPINK-Parkin-independent ubiquitin-mediated mitophagy, ubiquitin E3 ligase\n", + "N01758\tDesmosome - Vimentin filaments\n", + "N01759\tINK1-Parkin-mediated MFN2 degradation, VCP-OPA1\n", + "N01760\tEndosomal Rab cycles\n", + "N01761\tActivation of CRK-DOCK-Rac1 pathway\n", + "N01762\tMERTK-mediated recognition and engulfment\n", + "N01763\tMEGF10-mediated recognition and engulfment\n", + "N01764\tCalreticulin-LRP1 mediated recognition and engulfment\n", + "N01765\tCXCR4-GNAQ-PLCB/G signaling pathway\n", + "N01766\tCX3CR1-GNAI-AC signaling pathway\n", + "N01767\tCXCR4-GNAI-Src signaling pathway\n", + "N01768\tCXCR4-GNA12/13 signaling pathway\n", + "N01769\tCCR5-GNB/G-PLCB/G signaling pathway\n", + "N01770\tCCR2-GNB/G-PI3K signaling pathway\n", + "N01771\tCXCR4-GNB/G signaling pathway\n", + "N01772\tInduction of the PTGS2\n", + "N01773\tPTGS2-PGE2-TGFB1 pathway\n", + "N01774\tERK-DUSP4 negative feedback pathway\n", + "N01775\tInactivation of CaMKII by inducing SERCA2\n", + "N01776\tCaMK2-p38-MK2-ALOX5 pathway\n", + "N01777\tEfferocytosis-induced NAD production\n", + "N01778\tProduction of IL10 via the Sirtuin1 signaling cascade\n", + "N01779\tContinual efferocytosis enhanced by the AC-derived arginine 
and ornithine\n", + "N01780\tHydrolyzing AC-derived cholesterol esters in the lysosome\n", + "N01781\tActivation of LXRs by oxysterols\n", + "N01782\tGHRL-GHSR signaling\n", + "N01783\tNPPA-NPR1 signaling\n", + "N01784\tGlucose uptake and lactate release induced by efferocytosis\n", + "N01785\tDon't eat me signal (CD47)\n", + "N01786\tDon't eat me signal (CD24)\n", + "N01787\tNPPC-NPR2 signaling\n", + "N01788\tADIPOQ-ADIPOR signaling pathway\n", + "N01789\tBetaine metabolism\n", + "N01790\tTransport of dopamine into the neuron\n", + "N01791\tGlycine metabolism, Ser to Gly\n", + "N01792\tEDN-EDNR signaling pathway\n", + "N01793\tGAL-GALR signaling pathway\n", + "N01794\tHCRT-HCRTR signaling pathway\n", + "N01796\tTNFSF4-TNFRSF4 signaling pathway\n", + "N01797\tEDA-EDAR signaling pathway\n", + "N01798\tTNFSF11-TNFRSF11A signaling pathway\n", + "N01799\tCD70-CD27 signaling pathway\n", + "N01800\tLEP-LEPR signaling pathway\n", + "N01801\tTNFSF13-TNFRSF13B/C signaling pathway\n", + "N01802\tDihydrotestosterone biosynthesis\n", + "N01804\tIL3 family to Jak-STAT signaling pathway\n", + "N01806\tCobalamin (Vitamin B12) absorption\n", + "N01807\tTransfer of cobalamin to the portal blood\n", + "N01808\tIntracellular processing of cobalamin (reduction)\n", + "N01809\tMutation-caused epigenetic silencing of MMACHC\n", + "N01810\tRegulation of MMACHC expression\n", + "N01811\tMitochondrial adenocylation of cobalamin and loading onto MMUT\n", + "N01812\tCobalamin loading and activation of MTR\n", + "N01813\tEnhancement of NIPBL loading\n", + "N01814\tExtracellular matrix - Basal lamina\n", + "N01815\tVinculin-talin-integrin macromolecular complex\n", + "N01816\tCostamere\n", + "N01817\tMyosin thick filament\n", + "N01818\tActin thin filament, muscle contraction\n", + "N01819\tActin thin filament, length regulation\n", + "N01820\tSarcomere, Z-disc\n", + "N01821\tSarcomere, M-band\n", + "N01822\tLinker of nucleoskeleton and cytoskeleton (LINC) complex\n", + "N01823\tFGF23-NCC/NPT 
signaling pathway\n", + "N01824\tSGK1-NHERF1+NPT signaling pathway\n", + "N01831\tRegulation of VWF-GPIb-IX-V interaction, ADAMTS13\n", + "N01832\tNTN1-MAP1B axon guidance signaling\n", + "N01833\tDRAXIN-MAP1B axon guidance signaling\n", + "N01834\tSEMA3A-MAP1B axon guidance signaling\n", + "N01835\tSEMA3-CRMP2/MAPT axon guidance signaling\n", + "N01836\tMicrotubule plus end regulation network\n", + "N01837\tRegulation of neurite extension, NAV1-TRIO\n", + "N01838\tRegulation of synaptic plasticity, p140Cap\n", + "N01839\tSevering of microtubule, SPAST/KATN\n", + "N01840\tSevering of microtubule, KIF2A\n", + "N01841\tAnterograde axonal transport, Kinesin-2\n", + "N01842\tAnterograde axonal/dendrite transport, Kinesin-3\n", + "N01843\tAnterograde dendrite transport, Kinesin-4\n", + "N01844\tAnterograde dendrite transport, Kinesin-6\n", + "N01845\tAnterograde axonal/dendrite transport, Kinesin-12\n", + "N01846\tRetrograde axonal/dendrite transport, Dynein\n", + "N01847\tRegulation of dynein-mediated retrograde transport\n", + "N01848\tMembrane-associated periodic skeleton (MPS)\n", + "N01849\tAxonal actin ring structure\n", + "N01850\tMYO5B-mediated vesicle transport\n", + "N01851\tMYO5A-mediated vesicle transport\n", + "N01852\tMYO6-mediated vesicle transport\n", + "N01853\tNeurofilament structure\n", + "N01854\tNeurofilament regulation, ubiqutination by TRIM2\n", + "N01855\tNeurofilament regulation, ubiqutination by Gigaxonin\n", + "N01856\tCytomatrix at the active zone (CAZ) protein complex\n", + "N01857\tSEMA3A-DCX axon guidance signaling\n", + "N01858\tEFNB1-MAPT axon guidance signaling\n", + "N01859\tAnterograde axonal/dendrite transport, Kinesin-1\n", + "N01860\tGPI-anchor remodeling\n", + "N01867\tDemethylation of dimethylglycine\n", + "N01868\tDemethylation of sarcosine\n", + "N01869\tTHF conversion, THF to 5,10-MTHF\n", + "N01870\tHIF-2A signaling pathway\n", + "N01871\tHydroxylation of HIF\n", + "N01872\tProteasomal degradation of HIF by VHL complex\n", + 
"N01873\tVHL mutation to HIF-2 signaling pathway\n", + "N01874\tNRG-ERBB2/ERBB3 pathway (RAS-ERK signaling)\n", + "N01875\tNRG-ERBB2/ERBB3 pathway (P13K signaling)\n", + "N01876\tNRG1 fusion to NRG-ERBB2/ERBB3 pathway\n", + "N01877\tERBB4 mutation to GF-RTK-PI3K signaling pathway\n", + "N01878\tGlutamate-GRM-GNAQ/S signaling pathway\n", + "N01879\tGlutamate-GRM-GNAI/O signaling pathway\n", + "N01880\tGRM1/5-interacting scaffold proteins\n", + "N01881\tGRM1/5-interacting partners\n", + "N01882\tTransport of natrium, KA receptor\n", + "N01883\tTransport of natrium, AMPAR\n", + "N01884\tTransport of glutamate, EAAT\n", + "N01885\tTransport of glutamine, SNAT\n", + "N01886\tGlutamate transport in synapse\n", + "N01887\tTransport of chloride, GABAA receptor\n", + "N01888\tGABA-GABBR-GNAI/O signaling pathway\n", + "N01889\tGbeta/gamma-KCNJ signaling\n", + "N01890\tGephyrin-containing complex at inhibitory synapse\n", + "N01891\tGABAA receptor trafficking\n", + "N01892\tGABA metabolism and transport in glia\n", + "N01893\tGlutamine metabolism and transport in neuron\n", + "N01894\tAcetylcholine-CHRM-GNAQ/11 signaling pathway\n", + "N01895\tTransport of natrium/calcium, CHRN\n", + "N01896\tAcetylcholine metabolism and transport in neuron\n", + "N01897\tDopamine-DRD-GNAQ/S signaling pathway\n", + "N01898\tDopamine-DRD-GNAI/O signaling pathway\n", + "N01899\tGbeta/gamma-CACNA signaling\n", + "N01900\tSerotonin-HTR2-GNAQ/11 signaling pathway\n", + "N01901\tSerotonin-HTR1/5-GNAI/O signaling pathway\n", + "N01902\tTransport of serotonin, SLC6A4\n", + "N01903\tNorepinephrine-ADRA2-GNAI/O signaling pathway\n", + "N01904\tNorepinephrine-ADRB-GNAS signaling pathway\n", + "N01905\tAC-PKA-HCN signaling\n", + "N01906\tGlycine transport in neuron\n", + "N01907\tTransport of chloride, GLR\n", + "N01908\tADP/UDP-glucose-P2RY-GNAI/O signaling pathway\n", + "N01909\tTransport of calcium, P2RX\n", + "N01910\tAdenine nucleotide conversion\n", + "N01911\tTransport of ATP, SLC17A9\n", + 
"N01912\tHistamine metabolism and transport in neuron\n", + "N01913\tMelanocortin receptor signaling, MSH\n", + "N01914\tMelanocortin receptor signaling, AgRP\n", + "N01915\tTachykinin receptor signaling\n", + "N01916\tPreprohormone cleavage, POMC\n", + "N01917\tPreprohormone cleavage, PDYN\n", + "N01918\tDopamine metabolism in astrocyte\n", + "N01919\tDopamine/Adrenaline metabolism in presynaptic neuron\n", + "N01920\tTransport of norepinephrine into neuron\n", + "nt06031\tCitrate cycle and pyruvate metabolism\n", + "nt06017\tGlycogen metabolism\n", + "nt06023\tGalactose degradation\n", + "nt06020\tbeta-Oxidation in mitochondria\n", + "nt06021\tbeta-Oxidation in peroxisome\n", + "nt06034\tCholesterol biosynthesis\n", + "nt06019\tSteroid hormone biosynthesis\n", + "nt06022\tBile acid biosynthesis\n", + "nt06014\tSphingolipid degradation\n", + "nt06027\tPurine salvage pathway\n", + "nt06033\tGlycine, serine and arginine metabolism\n", + "nt06030\tMethionine metabolism\n", + "nt06024\tValine, leucine and isoleucine degradation\n", + "nt06036\tLysine degradation\n", + "nt06010\tUrea cycle\n", + "nt06037\tHistidine metabolism\n", + "nt06016\tPhenylalanine and tyrosine metabolism\n", + "nt06028\tDopamine and serotonin metabolism\n", + "nt06026\tGlutathione biosynthesis\n", + "nt06015\tN-Glycan biosynthesis\n", + "nt06013\tO-Glycan biosynthesis\n", + "nt06029\tGlycosaminoglycan biosynthesis\n", + "nt06012\tGlycosaminoglycan degradation\n", + "nt06018\tGPI-anchor biosynthesis\n", + "nt06035\tBlood group carbohydrate antigen biosynthesis\n", + "nt06032\tLipoic acid metabolism\n", + "nt06038\tFolate metabolism\n", + "nt06025\tMolybdenum cofactor biosynthesis\n", + "nt06011\tHeme biosynthesis\n", + "nt06538\tCobalamin transport and metabolism\n", + "nt06509\tDNA replication\n", + "nt06510\tTelomere length regulation\n", + "nt06504\tBase excision repair\n", + "nt06502\tNucleotide excision repair\n", + "nt06503\tMismatch repair\n", + "nt06506\tDouble-strand break repair\n", + 
"nt06508\tInterstrand crosslink repair\n", + "nt06526\tMAPK signaling\n", + "nt06530\tPI3K signaling\n", + "nt06505\tWNT signaling\n", + "nt06511\tNOTCH signaling\n", + "nt06501\tHH signaling\n", + "nt06507\tTGFB signaling\n", + "nt06518\tJAK-STAT signaling\n", + "nt06516\tTNF signaling\n", + "nt06528\tCalcium signaling\n", + "nt06522\tmTOR signaling\n", + "nt06542\tHIF signaling\n", + "nt06543\tNRG-ERBB signaling\n", + "nt06523\tEpigenetic regulation by Polycomb complexes\n", + "nt06512\tChromosome cohesion and segregation\n", + "nt06515\tRegulation of kinetochore-microtubule interactions\n", + "nt06534\tUnfolded protein response\n", + "nt06532\tAutophagy\n", + "nt06536\tMitophagy\n", + "nt06535\tEfferocytosis\n", + "nt06524\tApoptosis\n", + "nt06525\tFerroptosis\n", + "nt06527\tNecroptosis\n", + "nt06529\tThermogenesis\n", + "nt06539\tCytoskeleton in muscle cells\n", + "nt06541\tCytoskeleton in neurons\n", + "nt06544\tNeuroactive ligand signaling\n", + "nt06513\tComplement cascade\n", + "nt06514\tCoagulation cascade\n", + "nt06517\tTLR signaling\n", + "nt06521\tNLR signaling\n", + "nt06519\tRLR signaling\n", + "nt06520\tCGAS-STING signaling\n", + "nt06537\tTCR/BCR signaling\n", + "nt06533\tChemokine signaling\n", + "nt06310\tCRH-ACTH-cortisol signaling\n", + "nt06322\tTRH-TSH-TH signaling\n", + "nt06323\tKISS1-GnRH-LH/FSH-E2 signaling\n", + "nt06324\tGHRH-GH-IGF signaling\n", + "nt06318\tCaSR-PTH signaling\n", + "nt06316\tRenin-angiotensin-aldosterone signaling\n", + "nt06325\tHormone/cytokine signaling\n", + "nt06320\tAPOB-LDLR signaling\n", + "nt06260\tColorectal cancer\n", + "nt06261\tGastric cancer\n", + "nt06262\tPancreatic cancer\n", + "nt06263\tHepatocellular carcinoma\n", + "nt06264\tRenal cell carcinoma\n", + "nt06265\tBladder cancer\n", + "nt06266\tNon-small cell lung cancer\n", + "nt06267\tSmall cell lung cancer\n", + "nt06268\tMelanoma\n", + "nt06269\tBasal cell carcinoma\n", + "nt06270\tBreast cancer\n", + "nt06271\tEndometrial cancer\n", + 
"nt06272\tProstate cancer\n", + "nt06273\tGlioma\n", + "nt06274\tThyroid cancer\n", + "nt06275\tAcute myeloid leukemia\n", + "nt06276\tChronic myeloid leukemia\n", + "nt06210\tERK signaling (cancer)\n", + "nt06214\tPI3K signaling (cancer)\n", + "nt06213\tOther RAS signaling (cancer)\n", + "nt06211\tOther MAPK signaling (cancer)\n", + "nt06215\tWNT signaling (cancer)\n", + "nt06216\tNOTCH signaling (cancer)\n", + "nt06217\tHH signaling (cancer)\n", + "nt06218\tTGFB signaling (cancer)\n", + "nt06219\tJAK-STAT signaling (cancer)\n", + "nt06220\tCalcium signaling (cancer)\n", + "nt06234\tcAMP signaling (cancer)\n", + "nt06222\tIFN signaling (cancer)\n", + "nt06223\tTNF signaling (cancer)\n", + "nt06224\tCXCR signaling (cancer)\n", + "nt06225\tHIF-1 signaling (cancer)\n", + "nt06226\tKEAP1-NRF2 signaling (cancer)\n", + "nt06227\tNuclear receptor signaling (cancer)\n", + "nt06229\tMHC presentation (cancer)\n", + "nt06230\tCell cycle (cancer)\n", + "nt06231\tApoptosis (cancer)\n", + "nt06232\tTelomerase activity (cancer)\n", + "nt06240\tTranscription (cancer)\n", + "nt06250\tDNA adduct formation (cancer)\n", + "nt06251\tCYP-mediated ROS formation (cancer)\n", + "nt06252\tMitochondrial ROS formation (cancer)\n", + "nt06253\tAntioxidant system (cancer)\n", + "nt06460\tAlzheimer disease\n", + "nt06463\tParkinson disease\n", + "nt06464\tAmyotrophic lateral sclerosis\n", + "nt06461\tHuntington disease\n", + "nt06462\tSpinocerebellar ataxia\n", + "nt06465\tPrion disease\n", + "nt06466\tPathways of neurodegeneration\n", + "nt06360\tCushing syndrome\n", + "nt06160\tHuman T-cell leukemia virus 1 (HTLV-1)\n", + "nt06161\tHuman immunodeficiency virus 1 (HIV-1)\n", + "nt06162\tHepatitis B virus (HBV)\n", + "nt06163\tHepatitis C virus (HCV)\n", + "nt06171\tSARS coronavirus 2 (SARS-CoV-2)\n", + "nt06170\tInfluenza A virus (IAV)\n", + "nt06169\tMeasles virus (MV)\n", + "nt06168\tHerpes simplex virus 1 (HSV-1)\n", + "nt06167\tHuman cytomegalovirus (HCMV)\n", + "nt06164\tKaposi 
sarcoma-associated herpesvirus (KSHV)\n", + "nt06165\tEpstein-Barr virus (EBV)\n", + "nt06166\tHuman papillomavirus (HPV)\n", + "nt06180\tPathogenic Escherichia coli\n", + "nt06181\tSalmonella\n", + "nt06182\tShigella\n", + "nt06183\tYersinia\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest list network" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8bc3095e-6122-46cd-a7ff-7f77cbbaf28f", + "metadata": {}, + "outputs": [], + "source": [ + "#kegg_pull pull database network\n", + "\n", + "# Pulling all nodes in the network database. Will download it to current working directory. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "94ea7e25-deb4-4b13-8ca9-9e865f792ccd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "network KEGG Network Database\n", + "ne Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 1,637 entries\n", + "\n", + "linked db pathway\n", + " ko\n", + " hsa\n", + " compound\n", + " variant\n", + " disease\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info network" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "379395b3-8bb4-4282-9967-3b9305540771", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1415\n" + ] + } + ], + "source": [ + "kegg_pull rest link network pathway | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d53d358e-7f55-4a49-b277-d6781cabf389", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest link network pathway --output network_pathway.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "40c35d4e-4eee-4c23-97ba-e21410d74c4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1414 network_pathway.tsv\n" + ] + } + ], + "source": [ + "wc -l network_pathway.tsv" + ] + }, + { + "cell_type": "code", + 
"execution_count": 44, + "id": "ead20644-6632-4002-aae7-2e73962dafe8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "path:hsa05225\tne:N00005\n", + "path:hsa05211\tne:N00005\n", + "path:hsa05223\tne:N00007\n", + "path:hsa05216\tne:N00009\n", + "path:hsa05210\tne:N00012\n", + "path:hsa05212\tne:N00012\n", + "path:hsa05226\tne:N00012\n", + "path:hsa05216\tne:N00012\n", + "path:hsa05221\tne:N00012\n", + "path:hsa05213\tne:N00012\n" + ] + } + ], + "source": [ + "head network_pathway.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a697076f-118f-4151-8720-4b0bcda35a5d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "62077ac1-2eb4-421c-8133-8da3610b0c3b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1306\n" + ] + } + ], + "source": [ + "kegg_pull rest link network disease | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "4ad19eb5-09fd-4f88-8d22-ab04b1b0d12f", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest link network disease --output network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "460b3d26-0221-4347-abd4-860dd6b3e125", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1305 network_disease.tsv\n" + ] + } + ], + "source": [ + "wc -l network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a2841f84-d8fb-445d-8bef-47155a13cb3e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ds:H01489\tne:nt06018\n", + "ds:H01486\tne:nt06018\n", + "ds:H01488\tne:nt06018\n", + "ds:H01487\tne:nt06018\n", + "ds:H01127\tne:nt06018\n", + "ds:H01485\tne:nt06018\n", + "ds:H00216\tne:nt06019\n", + "ds:H02314\tne:nt06019\n", + "ds:H00259\tne:nt06019\n", + 
"ds:H01111\tne:nt06019\n" + ] + } + ], + "source": [ + "head network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8363fad9-f5b6-42b6-a3b6-dac6c86af932", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90785681-45ac-4f65-85a4-5fb5852acce2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "3a86642c-9caa-4ba4-a493-1811bb060cd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████| 1/1 [00:01<00:00, 1.37s/it]\n" + ] + } + ], + "source": [ + "kegg_pull pull entry-ids H01489" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81ff19a1-1016-4658-909a-7aaef6790c19", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "765b890a-bf17-43cc-8ac9-6e0b6b497e16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 76\n" + ] + } + ], + "source": [ + "kegg_pull rest link disease pathway | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "1744f449-033c-482d-9fd7-60f37a319fc5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "path:hsa05211\tds:H00021\n", + "path:hsa05110\tds:H00110\n", + "path:hsa05220\tds:H00004\n", + "path:hsa05210\tds:H00020\n", + "path:hsa05212\tds:H00019\n", + "path:hsa05217\tds:H00039\n", + "path:hsa05130\tds:H00277\n", + "path:hsa05130\tds:H00278\n", + "path:hsa05332\tds:H00084\n", + "path:hsa05132\tds:H00111\n", + "path:hsa05223\tds:H00014\n", + "path:hsa05135\tds:H00298\n", + "path:hsa05214\tds:H00042\n", + "path:hsa05221\tds:H00003\n", + "path:hsa05166\tds:H00009\n", + "path:hsa05226\tds:H00018\n", + "path:hsa05224\tds:H00031\n", + "path:hsa05216\tds:H00032\n", + 
"path:hsa05161\tds:H00412\n", + "path:hsa05144\tds:H00361\n" + ] + } + ], + "source": [ + "kegg_pull rest link disease pathway | head -20" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4931f2d0-e53e-4db3-b3ed-0b0d875d3384", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pathway KEGG Pathway Database\n", + "path Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 579 entries\n", + "\n", + "linked db module\n", + " ko\n", + " \n", + " genome\n", + " compound\n", + " glycan\n", + " reaction\n", + " rclass\n", + " enzyme\n", + " network\n", + " disease\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info pathway" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "a12375f3-3bcb-4f00-8662-01e6b941cfbb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "disease KEGG Disease Database\n", + "ds Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 2,900 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " ko\n", + " hsa\n", + " genome\n", + " network\n", + " variant\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info disease" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "c6dff0fa-7a12-4e0e-8af6-76f7d37adb71", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest list network --output kegg_network.txt" + ] + }, + { + "cell_type": "markdown", + "id": "bf0aa25e-dfae-4bef-aaaa-a45d3417c125", + "metadata": {}, + "source": [ + "## Getting the number of reference vs disease networks" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "488d4d96-4f3f-4725-930f-77efb4aac6a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████| 1637/1637 [11:53<00:00, 2.30it/s]\n" + ] + } + ], + "source": 
[ + "kegg_pull pull database network --output kegg_network" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "3c0adc9f-544a-4f52-9e67-232798caa6b0", + "metadata": {}, + "outputs": [], + "source": [ + "# Output file\n", + "output=\"kegg_network_types.tsv\"\n", + "> \"$output\" # Clear or create the file\n", + "\n", + "# Iterate over each .txt file in the kegg_network directory\n", + "for file in kegg_network/*.txt; do\n", + " # Get the filename without path and extension\n", + " base=$(basename \"$file\" .txt)\n", + "\n", + " # Extract the line containing TYPE\n", + " type_line=$(grep \"TYPE\" \"$file\")\n", + "\n", + " # Extract the TYPE line and remove the word \"TYPE\" and any whitespace\n", + " type_value=$(grep \"^TYPE\" \"$file\" | sed 's/TYPE[ \\t]*//')\n", + "\n", + " # Write to the output file\n", + " echo -e \"${base}\\t${type_value}\" >> \"$output\"\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "0ac320a9-9079-4fb6-81fd-975977a82db0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.17it/s]\n", + "hsa_var:1950v1\n", + "ENTRY 1950v1 Variant\n", + "NAME EGF overexpression\n", + "TYPE Gain of function\n", + "GENE EGF epidermal growth factor [KO:K04357]\n", + "ORGANISM hsa_var Human gene variants (Homo sapiens)\n", + "VARIATION overexpression\n", + "NETWORK nt06210 ERK signaling (cancer)\n", + " nt06214 PI3K signaling (cancer)\n", + " nt06260 Colorectal cancer\n", + " nt06526 MAPK signaling\n", + " nt06530 PI3K signaling\n", + "DISEASE H00020 Colorectal cancer\n", + "REFERENCE PMID:7912978\n", + " AUTHORS Hayashi Y, Widjono YW, Ohta K, Hanioka K, Obayashi C, Itoh K, Imai Y, Itoh H\n", + " TITLE Expression of EGF, EGF-receptor, p53, v-erb B and ras p21 in colorectal neoplasms by immunostaining paraffin-embedded tissues.\n", + " JOURNAL Pathol Int 44:124-30 (1994)\n", + " 
DOI:10.1111/j.1440-1827.1994.tb01696.x\n", + "REFERENCE PMID:15668269\n", + " AUTHORS Spano JP, Fagard R, Soria JC, Rixe O, Khayat D, Milano G\n", + " TITLE Epidermal growth factor receptor signaling in colorectal cancer: preclinical data and therapeutic perspectives.\n", + " JOURNAL Ann Oncol 16:189-94 (2005)\n", + " DOI:10.1093/annonc/mdi057\n", + "///\n", + "\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull pull entry-ids hsa_var:1950v1 --print" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "9dd433af-9634-4dd9-bdb1-e04d6f0610f1", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENTRY 1950 CDS T01001\n", + "SYMBOL EGF, HOMG4, URG\n", + "NAME (RefSeq) epidermal growth factor\n", + "ORTHOLOGY K04357 epidermal growth factor\n", + "ORGANISM hsa Homo sapiens (human)\n", + "PATHWAY hsa01521 EGFR tyrosine kinase inhibitor resistance\n", + " hsa04010 MAPK signaling pathway\n", + " hsa04012 ErbB signaling pathway\n", + " hsa04014 Ras signaling pathway\n", + " hsa04015 Rap1 signaling pathway\n", + " hsa04020 Calcium signaling pathway\n", + " hsa04066 HIF-1 signaling pathway\n", + " hsa04068 FoxO signaling pathway\n", + " hsa04072 Phospholipase D signaling pathway\n", + " hsa04151 PI3K-Akt signaling pathway\n", + " hsa04510 Focal adhesion\n", + " hsa04540 Gap junction\n", + " hsa04630 JAK-STAT signaling pathway\n", + " hsa04810 Regulation of actin cytoskeleton\n", + " hsa05160 Hepatitis C\n", + " hsa05165 Human papillomavirus infection\n", + " hsa05200 Pathways in cancer\n", + " hsa05207 Chemical carcinogenesis - receptor activation\n", + " hsa05208 Chemical carcinogenesis - reactive oxygen species\n", + " hsa05210 Colorectal cancer\n", + " hsa05212 Pancreatic cancer\n", + " hsa05213 Endometrial cancer\n", + " hsa05214 Glioma\n", + " hsa05215 Prostate cancer\n", + " hsa05218 Melanoma\n", + " hsa05219 Bladder cancer\n", + " hsa05223 Non-small cell lung cancer\n", + " hsa05224 
Breast cancer\n", + " hsa05226 Gastric cancer\n", + " hsa05231 Choline metabolism in cancer\n", + " hsa05235 PD-L1 expression and PD-1 checkpoint pathway in cancer\n", + "NETWORK nt06160 Human T-cell leukemia virus 1 (HTLV-1)\n", + " nt06162 Hepatitis B virus (HBV)\n", + " nt06163 Hepatitis C virus (HCV)\n", + " nt06164 Kaposi sarcoma-associated herpesvirus (KSHV)\n", + " nt06165 Epstein-Barr virus (EBV)\n", + " nt06166 Human papillomavirus (HPV)\n", + " nt06167 Human cytomegalovirus (HCMV)\n", + " nt06170 Influenza A virus (IAV)\n", + " nt06180 Pathogenic Escherichia coli\n", + " nt06182 Shigella\n", + " nt06210 ERK signaling (cancer)\n", + " nt06213 Other RAS signaling (cancer)\n", + " nt06214 PI3K signaling (cancer)\n", + " nt06219 JAK-STAT signaling (cancer)\n", + " nt06220 Calcium signaling (cancer)\n", + " nt06227 Nuclear receptor signaling (cancer)\n", + " nt06260 Colorectal cancer\n", + " nt06261 Gastric cancer\n", + " nt06262 Pancreatic cancer\n", + " nt06263 Hepatocellular carcinoma\n", + " nt06265 Bladder cancer\n", + " nt06266 Non-small cell lung cancer\n", + " nt06268 Melanoma\n", + " nt06270 Breast cancer\n", + " nt06271 Endometrial cancer\n", + " nt06273 Glioma\n", + " nt06274 Thyroid cancer\n", + " nt06276 Chronic myeloid leukemia\n", + " nt06526 MAPK signaling\n", + " nt06528 Calcium signaling\n", + " nt06530 PI3K signaling\n", + " ELEMENT N00001 EGF-EGFR-RAS-ERK signaling pathway\n", + " N00021 EGF-ERBB2-RAS-ERK signaling pathway\n", + " N00022 ERBB2-overexpression to RAS-ERK signaling pathway\n", + " N00023 EGF-EGFR-PLCG-ERK signaling pathway\n", + " N00026 EGF-EGFR-PLCG-CAMK signaling pathway\n", + " N00030 EGF-EGFR-RAS-PI3K signaling pathway\n", + " N00033 EGF-EGFR-PI3K signaling pathway\n", + " N00034 ERBB2-overexpression to PI3K signaling pathway\n", + " N00094 EGF-Jak-STAT signaling pathway\n", + " N00095 ERBB2-overexpression to EGF-Jak-STAT signaling pathway\n", + " N00096 EGF-EGFR-RAS-RASSF1 signaling pathway\n", + " N00103 
EGF-EGFR-RAS-RalGDS signaling pathway\n", + " N00147 EGF-EGFR-PLCG-calcineurin signaling pathway\n", + " N00252 Amplified ERBB2 to RAS-ERK signaling pathway\n", + " N00253 Amplified ERBB2 to PI3K signaling pathway\n", + " N00276 EGF-overexpression to RAS-ERK signaling pathway\n", + " N00281 EGF-overexpression to PI3K signaling pathway\n", + " N00390 EGF-EGFR-PI3K-NFKB signaling pathway\n", + " N00542 EGF-EGFR-RAS-JNK signaling pathway\n", + " N01078 EGF-EGFR-Actin signaling pathway\n", + " N01364 E2 to nuclear-initiated estrogen signaling pathway\n", + " N01592 GF-RTK-RAS-ERK signaling pathway\n", + " N01641 RTK-PLCG-ITPR signaling pathway\n", + " N01656 GF-RTK-PI3K signaling pathway\n", + " N01658 GF-RTK-RAS-PI3K signaling pathway\n", + "DISEASE H00020 Colorectal cancer\n", + " H01210 Hypomagnesemia\n", + "BRITE KEGG Orthology (KO) [BR:hsa00001]\n", + " 09130 Environmental Information Processing\n", + " 09132 Signal transduction\n", + " 04014 Ras signaling pathway\n", + " 1950 (EGF)\n", + " 04015 Rap1 signaling pathway\n", + " 1950 (EGF)\n", + " 04630 JAK-STAT signaling pathway\n", + " 1950 (EGF)\n", + " 04066 HIF-1 signaling pathway\n", + " 1950 (EGF)\n", + " 04068 FoxO signaling pathway\n", + " 1950 (EGF)\n", + " 04072 Phospholipase D signaling pathway\n", + " 1950 (EGF)\n", + " 04151 PI3K-Akt signaling pathway\n", + " 1950 (EGF)\n", + " 09160 Human Diseases\n", + " 09161 Cancer: overview\n", + " 05200 Pathways in cancer\n", + " 1950 (EGF)\n", + " 05207 Chemical carcinogenesis - receptor activation\n", + " 1950 (EGF)\n", + " 05208 Chemical carcinogenesis - reactive oxygen species\n", + " 1950 (EGF)\n", + " 05231 Choline metabolism in cancer\n", + " 1950 (EGF)\n", + " 05235 PD-L1 expression and PD-1 checkpoint pathway in cancer\n", + " 1950 (EGF)\n", + " 09162 Cancer: specific types\n", + " 05210 Colorectal cancer\n", + " 1950 (EGF)\n", + " 05212 Pancreatic cancer\n", + " 1950 (EGF)\n", + " 05226 Gastric cancer\n", + " 1950 (EGF)\n", + " 05214 Glioma\n", + " 1950 
(EGF)\n", + " 05218 Melanoma\n", + " 1950 (EGF)\n", + " 05219 Bladder cancer\n", + " 1950 (EGF)\n", + " 05215 Prostate cancer\n", + " 1950 (EGF)\n", + " 05213 Endometrial cancer\n", + " 1950 (EGF)\n", + " 05224 Breast cancer\n", + " 1950 (EGF)\n", + " 05223 Non-small cell lung cancer\n", + " 1950 (EGF)\n", + " 09172 Infectious disease: viral\n", + " 05160 Hepatitis C\n", + " 1950 (EGF)\n", + " 05165 Human papillomavirus infection\n", + " 1950 (EGF)\n", + " 09176 Drug resistance: antineoplastic\n", + " 01521 EGFR tyrosine kinase inhibitor resistance\n", + " 1950 (EGF)\n", + " 09180 Brite Hierarchies\n", + " 09183 Protein families: signaling and cellular processes\n", + " 04052 Cytokines and neuropeptides [BR:hsa04052]\n", + " 1950 (EGF)\n", + " Cytokines and neuropeptides [BR:hsa04052]\n", + " Cytokines\n", + " Growth factors (RTK binding)\n", + " 1950 (EGF)\n", + "POSITION 4:109912883..110013766\n", + "MOTIF Pfam: Ldl_recept_b FXa_inhibition cEGF EGF EGF_CA EGF_3 DUF5050 Vgb_lyase Plasmod_Pvs28\n", + "DBLINKS NCBI-GeneID: 1950\n", + " NCBI-ProteinID: NP_001954\n", + " OMIM: 131530\n", + " HGNC: 3229\n", + " Ensembl: ENSG00000138798\n", + " UniProt: P01133\n", + "STRUCTURE PDB\n", + "AASEQ 1207\n", + " MLLTLIILLPVVSKFSFVSLSAPQHWSCPEGTLAGNGNSTCVGPAPFLIFSHGNSIFRID\n", + " TEGTNYEQLVVDAGVSVIMDFHYNEKRIYWVDLERQLLQRVFLNGSRQERVCNIEKNVSG\n", + " MAINWINEEVIWSNQQEGIITVTDMKGNNSHILLSALKYPANVAVDPVERFIFWSSEVAG\n", + " SLYRADLDGVGVKALLETSEKITAVSLDVLDKRLFWIQYNREGSNSLICSCDYDGGSVHI\n", + " SKHPTQHNLFAMSLFGDRIFYSTWKMKTIWIANKHTGKDMVRINLHSSFVPLGELKVVHP\n", + " LAQPKAEDDTWEPEQKLCKLRKGNCSSTVCGQDLQSHLCMCAEGYALSRDRKYCEDVNEC\n", + " AFWNHGCTLGCKNTPGSYYCTCPVGFVLLPDGKRCHQLVSCPRNVSECSHDCVLTSEGPL\n", + " CFCPEGSVLERDGKTCSGCSSPDNGGCSQLCVPLSPVSWECDCFPGYDLQLDEKSCAASG\n", + " PQPFLLFANSQDIRHMHFDGTDYGTLLSQQMGMVYALDHDPVENKIYFAHTALKWIERAN\n", + " MDGSQRERLIEEGVDVPEGLAVDWIGRRFYWTDRGKSLIGRSDLNGKRSKIITKENISQP\n", + " RGIAVHPMAKRLFWTDTGINPRIESSSLQGLGRLVIASSDLIWPSGITIDFLTDKLYWCD\n", + " 
AKQSVIEMANLDGSKRRRLTQNDVGHPFAVAVFEDYVWFSDWAMPSVMRVNKRTGKDRVR\n", + " LQGSMLKPSSLVVVHPLAKPGADPCLYQNGGCEHICKKRLGTAWCSCREGFMKASDGKTC\n", + " LALDGHQLLAGGEVDLKNQVTPLDILSKTRVSEDNITESQHMLVAEIMVSDQDDCAPVGC\n", + " SMYARCISEGEDATCQCLKGFAGDGKLCSDIDECEMGVPVCPPASSKCINTEGGYVCRCS\n", + " EGYQGDGIHCLDIDECQLGEHSCGENASCTNTEGGYTCMCAGRLSEPGLICPDSTPPPHL\n", + " REDDHHYSVRNSDSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRDLKWW\n", + " ELRHAGHGQQQKVIVVAVCVVVLVMLLLLSLWGAHYYRTQKLLSKNPKNPYEESSRDVRS\n", + " RRPADTEDGMSSCPQPWFVVIKEHQDLKNGGQPVAGEDGQAADGSMQPTSWRQEPQLCGM\n", + " GTEQGCWIPVSSDKGSCPQVMERSFHMPSYGTQTLEGGVEKPHSLLSANPLWQQRALDPP\n", + " HQMELTQ\n", + "NTSEQ 3624\n", + " atgctgctcactcttatcattctgttgccagtagtttcaaaatttagttttgttagtctc\n", + " tcagcaccgcagcactggagctgtcctgaaggtactctcgcaggaaatgggaattctact\n", + " tgtgtgggtcctgcacccttcttaattttctcccatggaaatagtatctttaggattgac\n", + " acagaaggaaccaattatgagcaattggtggtggatgctggtgtctcagtgatcatggat\n", + " tttcattataatgagaaaagaatctattgggtggatttagaaagacaacttttgcaaaga\n", + " gtttttctgaatgggtcaaggcaagagagagtatgtaatatagagaaaaatgtttctgga\n", + " atggcaataaattggataaatgaagaagttatttggtcaaatcaacaggaaggaatcatt\n", + " acagtaacagatatgaaaggaaataattcccacattcttttaagtgctttaaaatatcct\n", + " gcaaatgtagcagttgatccagtagaaaggtttatattttggtcttcagaggtggctgga\n", + " agcctttatagagcagatctcgatggtgtgggagtgaaggctctgttggagacatcagag\n", + " aaaataacagctgtgtcattggatgtgcttgataagcggctgttttggattcagtacaac\n", + " agagaaggaagcaattctcttatttgctcctgtgattatgatggaggttctgtccacatt\n", + " agtaaacatccaacacagcataatttgtttgcaatgtccctttttggtgaccgtatcttc\n", + " tattcaacatggaaaatgaagacaatttggatagccaacaaacacactggaaaggacatg\n", + " gttagaattaacctccattcatcatttgtaccacttggtgaactgaaagtagtgcatcca\n", + " cttgcacaacccaaggcagaagatgacacttgggagcctgagcagaaactttgcaaattg\n", + " aggaaaggaaactgcagcagcactgtgtgtgggcaagacctccagtcacacttgtgcatg\n", + " tgtgcagagggatacgccctaagtcgagaccggaagtactgtgaagatgttaatgaatgt\n", + " gctttttggaatcatggctgtactcttgggtgtaaaaacacccctggatcctattactgc\n", + " 
acgtgccctgtaggatttgttctgcttcctgatgggaaacgatgtcatcaacttgtttcc\n", + " tgtccacgcaatgtgtctgaatgcagccatgactgtgttctgacatcagaaggtccctta\n", + " tgtttctgtcctgaaggctcagtgcttgagagagatgggaaaacatgtagcggttgttcc\n", + " tcacccgataatggtggatgtagccagctctgcgttcctcttagcccagtatcctgggaa\n", + " tgtgattgctttcctgggtatgacctacaactggatgaaaaaagctgtgcagcttcagga\n", + " ccacaaccatttttgctgtttgccaattctcaagatattcgacacatgcattttgatgga\n", + " acagactatggaactctgctcagccagcagatgggaatggtttatgccctagatcatgac\n", + " cctgtggaaaataagatatactttgcccatacagccctgaagtggatagagagagctaat\n", + " atggatggttcccagcgagaaaggcttattgaggaaggagtagatgtgccagaaggtctt\n", + " gctgtggactggattggccgtagattctattggacagacagagggaaatctctgattgga\n", + " aggagtgatttaaatgggaaacgttccaaaataatcactaaggagaacatctctcaacca\n", + " cgaggaattgctgttcatccaatggccaagagattattctggactgatacagggattaat\n", + " ccacgaattgaaagttcttccctccaaggccttggccgtctggttatagccagctctgat\n", + " ctaatctggcccagtggaataacgattgacttcttaactgacaagttgtactggtgcgat\n", + " gccaagcagtctgtgattgaaatggccaatctggatggttcaaaacgccgaagacttacc\n", + " cagaatgatgtaggtcacccatttgctgtagcagtgtttgaggattatgtgtggttctca\n", + " gattgggctatgccatcagtaatgagagtaaacaagaggactggcaaagatagagtacgt\n", + " ctccaaggcagcatgctgaagccctcatcactggttgtggttcatccattggcaaaacca\n", + " ggagcagatccctgcttatatcaaaacggaggctgtgaacatatttgcaaaaagaggctt\n", + " ggaactgcttggtgttcgtgtcgtgaaggttttatgaaagcctcagatgggaaaacgtgt\n", + " ctggctctggatggtcatcagctgttggcaggtggtgaagttgatctaaagaaccaagta\n", + " acaccattggacatcttgtccaagactagagtgtcagaagataacattacagaatctcaa\n", + " cacatgctagtggctgaaatcatggtgtcagatcaagatgactgtgctcctgtgggatgc\n", + " agcatgtatgctcggtgtatttcagagggagaggatgccacatgtcagtgtttgaaagga\n", + " tttgctggggatggaaaactatgttctgatatagatgaatgtgagatgggtgtcccagtg\n", + " tgcccccctgcctcctccaagtgcatcaacaccgaaggtggttatgtctgccggtgctca\n", + " gaaggctaccaaggagatgggattcactgtcttgatattgatgagtgccaactgggggag\n", + " cacagctgtggagagaatgccagctgcacaaatacagagggaggctatacctgcatgtgt\n", + " gctggacgcctgtctgaaccaggactgatttgccctgactctactccaccccctcacctc\n", + 
" agggaagatgaccaccactattccgtaagaaatagtgactctgaatgtcccctgtcccac\n", + " gatgggtactgcctccatgatggtgtgtgcatgtatattgaagcattggacaagtatgca\n", + " tgcaactgtgttgttggctacatcggggagcgatgtcagtaccgagacctgaagtggtgg\n", + " gaactgcgccacgctggccacgggcagcagcagaaggtcatcgtggtggctgtctgcgtg\n", + " gtggtgcttgtcatgctgctcctcctgagcctgtggggggcccactactacaggactcag\n", + " aagctgctatcgaaaaacccaaagaatccttatgaggagtcgagcagagatgtgaggagt\n", + " cgcaggcctgctgacactgaggatgggatgtcctcttgccctcaaccttggtttgtggtt\n", + " ataaaagaacaccaagacctcaagaatgggggtcaaccagtggctggtgaggatggccag\n", + " gcagcagatgggtcaatgcaaccaacttcatggaggcaggagccccagttatgtggaatg\n", + " ggcacagagcaaggctgctggattccagtatccagtgataagggctcctgtccccaggta\n", + " atggagcgaagctttcatatgccctcctatgggacacagacccttgaagggggtgtcgag\n", + " aagccccattctctcctatcagctaacccattatggcaacaaagggccctggacccacca\n", + " caccaaatggagctgactcagtga\n", + "///\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest get hsa:1950" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "926ac49a-8cc3-472c-ae07-4ecd4b70f5aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T01001 Homo sapiens (human) KEGG Genes Database\n", + "hsa Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 24,685 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " module\n", + " ko\n", + " genome\n", + " enzyme\n", + " network\n", + " disease\n", + " drug\n", + " ncbi-geneid\n", + " ncbi-proteinid\n", + " uniprot\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info hsa" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "b58ec9a4-dce3-4900-8021-9bc9155a925e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "variant KEGG Variant Database\n", + "hsa_var Release 114.0+/04-12, Apr 25\n", + " Kanehisa Laboratories\n", + " 1,536 entries\n", + "\n", + "linked db network\n", + " disease\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + 
"source": [ + "kegg_pull rest info variant" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "cd56ab73-c7ab-4e9f-bf79-c1f2fe6c3b40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000v1\tAKT3 mutation\n", + "10026v1\tPIGK deficiency\n", + "10075v1\tHUWE1 mutation\n", + "100v1\tADA deficiency\n", + "10111v1\tRAD50 mutation\n", + "10133v1\tOPTN mutation\n", + "10133v2\tOPTN activating mutation\n", + "10157v1\tAASS deficiency\n", + "10195v1\tALG3 deficiency\n", + "1019v1\tCDK4 amplification\n", + "1019v2\tCDK4 mutation\n", + "10243v1\tGPHN deficiency\n", + "10274v1\tSTAG1 mutation\n", + "1027v1\tCDKN1B loss\n", + "1027v2\tCDKN1B reduced expression\n", + "1027v3\tCDKN1B mutation\n", + "10280v1\tSIGMAR1 mutation\n", + "10293v1\tTRAIP mutation\n", + "10297v1\tAPC2 mutation\n", + "1029v1\tCDKN2A deletion\n" + ] + } + ], + "source": [ + "kegg_pull rest list variant | head -20" + ] + }, + { + "cell_type": "markdown", + "id": "771ddf6d-dafc-4368-a5a3-0b6a2abdeeb3", + "metadata": {}, + "source": [ + "## Subsetting data to the Variant set of the networks" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "bc2e85c1-71d8-4c68-b0be-00ec72217cc7", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir network_variant" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "590d7a64-857a-44bc-9251-90cdbc1d4181", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "cp kegg_network/$p.txt network_variant/\n", + "\n", + "done < network_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "fce29a29-33ca-4a03-89e2-2ed4aafc929f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 298\n" + ] + } + ], + "source": [ + "ls network_variant/* | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "a339aed9-8ef3-421d-a454-5aea56b0124c", + "metadata": {}, + 
"outputs": [], + "source": [ + "#!/bin/bash\n", + "\n", + "output=\"gene_variants.tsv\"\n", + "> \"$output\" # Clear the output file\n", + "\n", + "for file in network_variant/*.txt; do\n", + " base=$(basename \"$file\" .txt)\n", + "\n", + " # Find and extract all matches of digits-v-digits\n", + " grep -oE \"[0-9]+v[0-9]+\" \"$file\" | while read -r match; do\n", + " echo -e \"${base}\\t${match}\" >> \"$output\"\n", + " done\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "279cc34b-6bd9-46d2-be11-785e9793ecfe", + "metadata": {}, + "outputs": [], + "source": [ + "sort gene_variants.tsv | uniq > temp.tsv && mv temp.tsv gene_variants.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "88ffb62a-be1a-4e05-854c-b061d596985c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 328 gene_variants.tsv\n" + ] + } + ], + "source": [ + "wc -l gene_variants.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "e1895ec2-6791-4daf-bcbf-3817e2e3a963", + "metadata": {}, + "outputs": [], + "source": [ + "cut -f 2 gene_variants.tsv > gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "21e29f30-fecb-4360-ae18-278757bdbd0e", + "metadata": {}, + "outputs": [], + "source": [ + "sort gene_variants.txt | uniq > temp.tsv && mv temp.tsv gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "015831af-db64-4386-b595-b2ceab4369d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 200 gene_variants.txt\n" + ] + } + ], + "source": [ + "wc -l gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "4ddf080a-1786-44ea-acdd-e7e474952ee3", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' 's/^/hsa_var:/' gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": 
"5bb68dd7-08e6-4c3d-99c0-6131df914af6", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir variant_info" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "f225e592-665e-4442-a385-73addd61b902", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████| 200/200 [00:51<00:00, 3.85it/s]\n" + ] + } + ], + "source": [ + "cat gene_variants.txt | kegg_pull pull entry-ids - --output=variant_info" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "679f5a1b-f4d1-4585-8a11-93bf81fcf795", + "metadata": {}, + "outputs": [], + "source": [ + "cat variant_info/* > all_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd86096d-cf04-4117-b4c2-9736327dcc86", + "metadata": {}, + "outputs": [], + "source": [ + "cp all_variants.txt all_variants_filtered.txt" + ] + }, + { + "cell_type": "markdown", + "id": "810dd902-7dad-4fe7-b028-7a865c9d35d6", + "metadata": {}, + "source": [ + "### Switching to python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bc079e4-bf59-404a-9f4b-c8b2b706a03a", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6851a309-c86f-477f-a355-feb3e959aa48", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def remove_references(text):\n", + " # This regex matches 'REFERENCE' lines and all subsequent indented lines (those starting with 2+ spaces)\n", + " cleaned_text = re.sub(r'REFERENCE\\s+PMID:\\d+\\n(?: {2}.*\\n)*', '', text)\n", + " return cleaned_text" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "98db6b07-480f-42e7-8b45-6f710a33b6ef", + "metadata": {}, + "outputs": [], + "source": [ + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_references(original_text)\n", + 
"\n", + "with open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfeab2a6-81b9-4d3a-b1bc-a6491294dd15", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dc011a9b-7c1b-40c8-86f8-1c1b28bae6b7", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_network(text):\n", + " lines = text.split('\\n')\n", + " cleaned_lines = []\n", + " skip_block = False\n", + "\n", + " for line in lines:\n", + " if line.startswith(\"NETWORK\"):\n", + " skip_block = True\n", + " continue\n", + " if skip_block:\n", + " if line.startswith(\" \") or line.startswith(\"\\t\"):\n", + " continue\n", + " else:\n", + " skip_block = False\n", + " if not skip_block:\n", + " cleaned_lines.append(line)\n", + "\n", + " return '\\n'.join(cleaned_lines)\n", + "\n", + "\n", + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_network(original_text)\n", + "\n", + "with open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "65f14020-a3b1-4c71-a529-421eb66c70f3", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_network(text):\n", + " lines = text.split('\\n')\n", + " cleaned_lines = []\n", + " skip_block = False\n", + "\n", + " for line in lines:\n", + " if line.startswith(\"DISEASE\"):\n", + " skip_block = True\n", + " continue\n", + " if skip_block:\n", + " if line.startswith(\" \") or line.startswith(\"\\t\"):\n", + " continue\n", + " else:\n", + " skip_block = False\n", + " if not skip_block:\n", + " cleaned_lines.append(line)\n", + "\n", + " return '\\n'.join(cleaned_lines)\n", + "\n", + "\n", + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_network(original_text)\n", + "\n", + "with 
def remove_network(text, section="DRUG_TARGET"):
    """Return *text* with every *section* block stripped out.

    The function keeps its historical name, but the header it removes is a
    parameter: by default it drops ``DRUG_TARGET`` blocks. A block is the
    line starting with *section* followed by all directly following lines
    that begin with a space or a tab.

    Args:
        text: Flat-file text to filter.
        section: Header keyword whose blocks are removed.

    Returns:
        The remaining lines joined with ``"\\n"``.

    Raises:
        ValueError: If *section* is empty (every line would match).
    """
    if not section:
        raise ValueError("section must be a non-empty header keyword")

    cleaned_lines = []
    skip_block = False

    for line in text.split('\n'):
        if line.startswith(section):
            skip_block = True  # drop the header and what hangs off it
            continue
        if skip_block and line.startswith((" ", "\t")):
            continue  # continuation line of the removed block
        skip_block = False  # first non-indented line ends the block
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)
Omimvar or clinvar or dbsnp or cosm or dbvar or cosf" + ] + }, + { + "cell_type": "markdown", + "id": "8e397b34-94e0-4564-bb21-2a92d161b5af", + "metadata": {}, + "source": [ + "### switch back to bash" + ] + }, + { + "cell_type": "markdown", + "id": "8fcdd7d0-27de-45f0-b6d3-96f9ee39f183", + "metadata": {}, + "source": [ + "# Downloading all Variant Information" + ] + }, + { + "cell_type": "markdown", + "id": "115df1ef-4e1c-4f31-babf-cd85960e6fea", + "metadata": {}, + "source": [ + "**Not using dbVar as it has been discontinued and most of the links to dbvar are bad** ClinVar is the alternate and holds all of the data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "61487285-a1af-4c1a-8b20-a52c8f26951b", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6ea7baea-ae92-4a93-a524-f44608dbe6d9", + "metadata": {}, + "outputs": [], + "source": [ + "rm all_variants.txt\n", + "rm all_variants_filtered.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b66473bd-cebc-4fa2-bdd8-310b67e82aaf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 60\n", + " 235\n", + " 201\n", + " 202\n", + " 28\n", + " 87\n" + ] + } + ], + "source": [ + "grep OmimVar parsed_variants.tsv | wc -l\n", + "grep ClinVar parsed_variants.tsv | wc -l\n", + "grep dbSNP parsed_variants.tsv | wc -l\n", + "grep COSM parsed_variants.tsv | wc -l\n", + "grep dbVar parsed_variants.tsv | wc -l\n", + "grep COSF parsed_variants.tsv | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "0eaac99f-2166-43c7-9b16-90616b71272d", + "metadata": {}, + "source": [ + "### OmimVar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a135480-1dab-491c-b92b-03e9cc579c71", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "esearch -db clinvar -query \"601556[mim]\" | efetch -format docsum" + ] + }, + { + 
"cell_type": "markdown", + "id": "1b364d25-e8dc-4815-8a62-0dd00554d875", + "metadata": {}, + "source": [ + "From the output that you get, look for the variant ID in the output and then get that specific document summary" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cf01ffe2-79d1-4dd5-b130-2d1a07554b90", + "metadata": {}, + "outputs": [], + "source": [ + "grep OmimVar parsed_variants.tsv | cut -f3 > Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "f48e906b-70f7-420b-ba97-5105eda3c74d", + "metadata": {}, + "source": [ + "It is being really difficult to run this with a loop in bash, so just running it all manually like this" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3697f89a-6e17-435d-906d-927947259177", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db clinvar -query \"601978[mim]\" | efetch -format docsum > Omim/601978.xml\n", + "esearch -db clinvar -query \"602533[mim]\" | efetch -format docsum > Omim/602533.xml\n", + "esearch -db clinvar -query \"609007[mim]\" | efetch -format docsum > Omim/609007.xml\n", + "esearch -db clinvar -query \"111730[mim]\" | efetch -format docsum > Omim/111730.xml\n", + "esearch -db clinvar -query \"603448[mim]\" | efetch -format docsum > Omim/603448.xml\n", + "esearch -db clinvar -query \"608300[mim]\" | efetch -format docsum > Omim/608300.xml\n", + "esearch -db clinvar -query \"601143[mim]\" | efetch -format docsum > Omim/601143.xml\n", + "esearch -db clinvar -query \"614260[mim]\" | efetch -format docsum > Omim/614260.xml\n", + "esearch -db clinvar -query \"600543[mim]\" | efetch -format docsum > Omim/600543.xml\n", + "esearch -db clinvar -query \"605078[mim]\" | efetch -format docsum > Omim/605078.xml\n", + "esearch -db clinvar -query \"137070[mim]\" | efetch -format docsum > Omim/137070.xml\n", + "esearch -db clinvar -query \"211100[mim]\" | efetch -format docsum > Omim/211100.xml\n", + "esearch -db clinvar -query \"182100[mim]\" | efetch 
-format docsum > Omim/182100.xml\n", + "esearch -db clinvar -query \"111100[mim]\" | efetch -format docsum > Omim/111100.xml\n", + "esearch -db clinvar -query \"189980[mim]\" | efetch -format docsum > Omim/189980.xml\n", + "esearch -db clinvar -query \"606463[mim]\" | efetch -format docsum > Omim/606463.xml\n", + "esearch -db clinvar -query \"600429[mim]\" | efetch -format docsum > Omim/600429.xml\n", + "esearch -db clinvar -query \"603371[mim]\" | efetch -format docsum > Omim/603371.xml\n", + "esearch -db clinvar -query \"613109[mim]\" | efetch -format docsum > Omim/613109.xml\n", + "esearch -db clinvar -query \"604834[mim]\" | efetch -format docsum > Omim/604834.xml\n", + "esearch -db clinvar -query \"604473[mim]\" | efetch -format docsum > Omim/604473.xml\n", + "esearch -db clinvar -query \"300264[mim]\" | efetch -format docsum > Omim/300264.xml\n", + "esearch -db clinvar -query \"613004[mim]\" | efetch -format docsum > Omim/613004.xml\n", + "esearch -db clinvar -query \"308000[mim]\" | efetch -format docsum > Omim/308000.xml\n", + "esearch -db clinvar -query \"104760[mim]\" | efetch -format docsum > Omim/104760.xml\n", + "esearch -db clinvar -query \"102600[mim]\" | efetch -format docsum > Omim/102600.xml\n", + "esearch -db clinvar -query \"176264[mim]\" | efetch -format docsum > Omim/176264.xml\n", + "esearch -db clinvar -query \"605411[mim]\" | efetch -format docsum > Omim/605411.xml\n", + "esearch -db clinvar -query \"600734[mim]\" | efetch -format docsum > Omim/600734.xml\n", + "esearch -db clinvar -query \"607047[mim]\" | efetch -format docsum > Omim/607047.xml\n", + "esearch -db clinvar -query \"176763[mim]\" | efetch -format docsum > Omim/176763.xml\n", + "esearch -db clinvar -query \"602544[mim]\" | efetch -format docsum > Omim/602544.xml\n", + "esearch -db clinvar -query \"131340[mim]\" | efetch -format docsum > Omim/131340.xml\n", + "esearch -db clinvar -query \"176610[mim]\" | efetch -format docsum > Omim/176610.xml\n", + "esearch -db clinvar -query 
\"607922[mim]\" | efetch -format docsum > Omim/607922.xml\n", + "esearch -db clinvar -query \"176640[mim]\" | efetch -format docsum > Omim/176640.xml\n", + "esearch -db clinvar -query \"176801[mim]\" | efetch -format docsum > Omim/176801.xml\n", + "esearch -db clinvar -query \"104311[mim]\" | efetch -format docsum > Omim/104311.xml\n", + "esearch -db clinvar -query \"600759[mim]\" | efetch -format docsum > Omim/600759.xml\n", + "esearch -db clinvar -query \"601556[mim]\" | efetch -format docsum > Omim/601556.xml\n", + "esearch -db clinvar -query \"601517[mim]\" | efetch -format docsum > Omim/601517.xml\n", + "esearch -db clinvar -query \"612895[mim]\" | efetch -format docsum > Omim/612895.xml\n", + "esearch -db clinvar -query \"608309[mim]\" | efetch -format docsum > Omim/608309.xml\n", + "esearch -db clinvar -query \"163890[mim]\" | efetch -format docsum > Omim/163890.xml\n", + "esearch -db clinvar -query \"147450[mim]\" | efetch -format docsum > Omim/147450.xml\n", + "esearch -db clinvar -query \"604985[mim]\" | efetch -format docsum > Omim/604985.xml\n", + "esearch -db clinvar -query \"606765[mim]\" | efetch -format docsum > Omim/606765.xml\n", + "esearch -db clinvar -query \"602345[mim]\" | efetch -format docsum > Omim/602345.xml\n", + "esearch -db clinvar -query \"191110[mim]\" | efetch -format docsum > Omim/191110.xml\n", + "esearch -db clinvar -query \"191342[mim]\" | efetch -format docsum > Omim/191342.xml\n", + "esearch -db clinvar -query \"601023[mim]\" | efetch -format docsum > Omim/601023.xml\n", + "esearch -db clinvar -query \"608537[mim]\" | efetch -format docsum > Omim/608537.xml\n", + "esearch -db clinvar -query \"601011[mim]\" | efetch -format docsum > Omim/601011.xml\n", + "esearch -db clinvar -query \"114206[mim]\" | efetch -format docsum > Omim/114206.xml\n", + "esearch -db clinvar -query \"603094[mim]\" | efetch -format docsum > Omim/603094.xml\n", + "esearch -db clinvar -query \"601530[mim]\" | efetch -format docsum > Omim/601530.xml\n", + 
"esearch -db clinvar -query \"607904[mim]\" | efetch -format docsum > Omim/607904.xml\n", + "esearch -db clinvar -query \"605704[mim]\" | efetch -format docsum > Omim/605704.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "43c6599c-6ec6-48a5-8264-8e86c1869e63", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 58 Omim/OmimVar_id.txt\n" + ] + } + ], + "source": [ + "wc -l Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a56a9106-06c6-4d23-983f-227ca14f85a4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "601978 exists.\n", + "602533 exists.\n", + "609007 exists.\n", + "111730 exists.\n", + "603448 exists.\n", + "608300 exists.\n", + "601143 exists.\n", + "614260 exists.\n", + "600543 exists.\n", + "605078 exists.\n", + "137070 exists.\n", + "211100 exists.\n", + "182100 exists.\n", + "111100 exists.\n", + "189980 exists.\n", + "606463 exists.\n", + "600429 exists.\n", + "603371 exists.\n", + "613109 exists.\n", + "604834 exists.\n", + "604473 exists.\n", + "300264 exists.\n", + "613004 exists.\n", + "308000 exists.\n", + "104760 exists.\n", + "102600 exists.\n", + "176264 exists.\n", + "605411 exists.\n", + "600734 exists.\n", + "607047 exists.\n", + "176763 exists.\n", + "602544 exists.\n", + "131340 exists.\n", + "176610 exists.\n", + "607922 exists.\n", + "176640 exists.\n", + "176801 exists.\n", + "104311 exists.\n", + "600759 exists.\n", + "601556 exists.\n", + "601517 exists.\n", + "612895 exists.\n", + "608309 exists.\n", + "163890 exists.\n", + "147450 exists.\n", + "604985 exists.\n", + "606765 exists.\n", + "602345 exists.\n", + "191110 exists.\n", + "191342 exists.\n", + "601023 exists.\n", + "608537 exists.\n", + "601011 exists.\n", + "114206 exists.\n", + "603094 exists.\n", + "601530 exists.\n", + "607904 exists.\n", + "605704 exists.\n" + ] + } + ], + "source": [ 
def extract_linked_ids(xml_path, target_omim_prefix, outfile):
    """Write the cross-references that accompany a matching OMIM ID.

    Every ``variation_xrefs`` element of the parsed document is inspected.
    When one of its ``variation_xref`` children has ``db_source`` equal to
    ``"OMIM"`` and a ``db_id`` starting with *target_omim_prefix*, the group
    is reported: an ``OMIM ID found: ...`` line (the last matching ID of the
    group), one ``source:id`` line per non-OMIM reference, then a blank line.

    Args:
        xml_path: Path (or file object) handed to ``ET.parse``.
        target_omim_prefix: Prefix the OMIM ``db_id`` must start with.
        outfile: Writable text stream receiving the report.
    """
    document_root = ET.parse(xml_path).getroot()

    for xref_group in document_root.iter('variation_xrefs'):
        # Keep only references that carry both a source and an ID.
        references = []
        for entry in xref_group.findall('variation_xref'):
            source_name = entry.findtext('db_source')
            source_id = entry.findtext('db_id')
            if source_name and source_id:
                references.append((source_name, source_id))

        omim_hits = [
            source_id
            for source_name, source_id in references
            if source_name == "OMIM" and source_id.startswith(target_omim_prefix)
        ]
        if not omim_hits:
            continue

        outfile.write(f"OMIM ID found: {omim_hits[-1]}\n")
        for source_name, source_id in references:
            if source_name == "OMIM":
                continue
            outfile.write(f"{source_name}:{source_id}\n")
        outfile.write("\n")  # Blank line between blocks
import os
import re

# There were issues with some XMLs being malformed, so the problematic ones
# are rewritten with one common root element.
problematic_ids = [
    "609007", "601143", "604985", "608537", "601011", "114206", "607904"
]

input_folder = "Omim"
output_folder = "Omim_fixed"


def wrap_in_single_root(xml_content):
    """Return *xml_content* as one document with a single root element.

    Every XML declaration and DOCTYPE line is removed from the input, then
    the remaining markup is wrapped in ``<root>...</root>`` behind one fresh
    XML declaration.

    NOTE(review): the wrapper element name and the declaration text are a
    reconstruction -- the notebook's original markup strings are not
    recoverable from this view. Downstream parsing only iterates over
    descendant tags, so the root's name presumably does not matter.

    Args:
        xml_content: Raw file content, possibly several documents in a row.

    Returns:
        A string holding a single rooted XML document.
    """
    body = re.sub(r'<\?xml[^>]+\?>', '', xml_content)
    body = re.sub(r'<!DOCTYPE[^>]*>', '', body)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<root>\n'
        f'{body.strip()}\n'
        '</root>'
    )


os.makedirs(output_folder, exist_ok=True)

for omim_id in problematic_ids:
    input_file = os.path.join(input_folder, f"{omim_id}.xml")
    output_file = os.path.join(output_folder, f"{omim_id}.xml")

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            xml_content = f.read()
    except FileNotFoundError:
        # Report and carry on so one missing download does not block the rest.
        print(f"Skipping {omim_id}: {input_file} not found")
        continue

    # Write the fixed file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(wrap_in_single_root(xml_content))

    print(f"✅ Fixed: {omim_id} → saved to {output_file}")
def parse_omim_xmls(ids, xml_folder):
    """Run ``extract_linked_ids`` for each OMIM ID and report failures.

    For every ID the report is written to ``Omim/<id>_parsed.txt`` from the
    XML file ``<xml_folder>/<id>.xml``. A file that cannot be read or parsed
    is reported together with the error and the remaining IDs are still
    processed, instead of stopping silently at the first failure.

    Args:
        ids: OMIM IDs to process.
        xml_folder: Folder holding the ``<id>.xml`` inputs.

    Returns:
        The list of IDs whose XML could not be processed.
    """
    failed = []
    for omim_id in ids:
        with open(f'Omim/{omim_id}_parsed.txt', "w") as f:
            try:
                extract_linked_ids(f'{xml_folder}/{omim_id}.xml', omim_id, f)
            except (ET.ParseError, OSError) as err:
                # Show which file failed and why, then keep going.
                print(f"{omim_id}: {err}")
                failed.append(omim_id)
    return failed


# Well-formed downloads are read from Omim/, the repaired ones from Omim_fixed/.
failed_ids = parse_omim_xmls(good_ids, 'Omim')
failed_ids += parse_omim_xmls(problematic_ids, 'Omim_fixed')
if failed_ids:
    print(f"{len(failed_ids)} file(s) could not be parsed: {failed_ids}")
"output_file=\"Omim_parsed.tsv\" # Output TSV file\n", + "\n", + "# Write header\n", + "echo -e \"omim_id\\tdbsnp_id\" > \"$output_file\"\n", + "\n", + "# Initialize variables\n", + "omim_id=\"\"\n", + "dbsnp_id=\"\"\n", + "\n", + "# Read the file line-by-line\n", + "while IFS= read -r line || [ -n \"$line\" ]; do\n", + " # If it's an OMIM line\n", + " if [[ $line == OMIM\\ ID\\ found:* ]]; then\n", + " # If we had a previous OMIM without dbSNP, write it now\n", + " if [[ -n $omim_id ]]; then\n", + " echo -e \"${omim_id}\\t${dbsnp_id}\" >> \"$output_file\"\n", + " fi\n", + " omim_id=\"${line#OMIM ID found: }\"\n", + " dbsnp_id=\"\" # Reset dbSNP\n", + " elif [[ $line == dbSNP:* ]]; then\n", + " dbsnp_id=\"${line#dbSNP:}\"\n", + " fi\n", + "done < \"$input_file\"\n", + "\n", + "# Write the last record\n", + "if [[ -n $omim_id ]]; then\n", + " echo -e \"${omim_id}\\t${dbsnp_id}\" >> \"$output_file\"\n", + "fi\n", + "\n", + "echo \"✅ Parsed file into $output_file\"" + ] + }, + { + "cell_type": "markdown", + "id": "a35a9986-62df-4893-93de-21733eb68404", + "metadata": {}, + "source": [ + "Adding 624 dbSNP IDs to the dbSNP file for retrieval" + ] + }, + { + "cell_type": "markdown", + "id": "6acb6389-7b0e-4dc6-871c-952528646920", + "metadata": {}, + "source": [ + "### ClinVar" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "29531b7b-33f1-4e15-b0b9-8090c4bde11f", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "2f98ba8b-3120-416d-a13e-a8f7235992bd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " Build250414-1300.1\n", + " \n", + " 17584\n", + " single nucleotide variant\n", + " VCV000017584\n", + " VCV000017584.5\n", + " NM_001904.4(CTNNB1):c.101G>A (p.Gly34Glu)\n", + " \n", + " \n", + " 32623\n", + " \n", + " \n", + " ClinGen\n", + " CA127277\n", + " \n", + " \n", + " 
UniProtKB\n", + " P35222#VAR_017620\n", + " \n", + " \n", + " OMIM\n", + " 116806.0008\n", + " \n", + " \n", + " dbSNP\n", + " 28931589\n", + " \n", + " \n", + " NM_001904.4(CTNNB1):c.101G>A (p.Gly34Glu)\n", + " c.101G>A\n", + " \n", + " \n", + " current\n", + " GRCh38\n", + " 3\n", + " 3p22.1\n", + " 41224613\n", + " 41224613\n", + " 41224613\n", + " 41224613\n", + " GCF_000001405.38\n", + " \n", + " \n", + " previous\n", + " GRCh37\n", + " 3\n", + " 3p22.1\n", + " 41266104\n", + " 41266104\n", + " 41266104\n", + " 41266104\n", + " GCF_000001405.25\n", + " \n", + " \n", + " \n", + " \n", + " Exome Aggregation Consortium (ExAC)\n", + " 0.00001\n", + " \n", + " \n", + " single nucleotide variant\n", + " NC_000003.12:41224612:G:A\n", + " \n", + " \n", + " \n", + " \n", + " SCV000039437\n", + " SCV000599908\n", + " \n", + " \n", + " RCV000019149\n", + " RCV000443977\n", + " \n", + " \n", + " \n", + " Pathogenic; other\n", + " 2016/05/01 00:00\n", + " no assertion criteria provided\n", + " \n", + " \n", + " \n", + " \n", + " Orphanet\n", + " 616\n", + " \n", + " \n", + " MedGen\n", + " C0025149\n", + " \n", + " \n", + " MeSH\n", + " D008527\n", + " \n", + " \n", + " MONDO\n", + " MONDO:0007959\n", + " \n", + " \n", + " OMIM\n", + " 155255\n", + " \n", + " \n", + " Human Phenotype Ontology\n", + " HP:0002885\n", + " \n", + " \n", + " Medulloblastoma\n", + " \n", + " \n", + " \n", + " \n", + " Orphanet\n", + " 91414\n", + " \n", + " \n", + " MedGen\n", + " C0206711\n", + " \n", + " \n", + " MeSH\n", + " D018296\n", + " \n", + " \n", + " MONDO\n", + " MONDO:0007564\n", + " \n", + " \n", + " OMIM\n", + " 132600\n", + " \n", + " \n", + " Human Phenotype Ontology\n", + " HP:0030434\n", + " \n", + " \n", + " Pilomatrixoma\n", + " \n", + " \n", + " \n", + " \n", + " 1/01/01 00:00\n", + " \n", + " \n", + " 1/01/01 00:00\n", + " \n", + " CTNNB1\n", + " 03\n", + " 00000000000041224613\n", + " \n", + " \n", + " CTNNB1\n", + " 1499\n", + " +\n", + " submitted\n", + " \n", + " \n", 
+ " LOC126806658\n", + " 126806658\n", + " +\n", + " submitted\n", + " \n", + " \n", + " \n", + " missense variant\n", + " \n", + " G34E, G27E\n", + " \n", + "\n" + ] + } + ], + "source": [ + "esearch -db clinvar -query 17584 | efetch -format docsum" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "04a62ef6-5c7d-4644-bc8f-26148b470dbf", + "metadata": {}, + "outputs": [], + "source": [ + "grep ClinVar parsed_variants.tsv | cut -f3 > ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "29542800-1697-481f-a0b7-5236dee9752e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 232 ClinVar/ClinVar_id.txt\n" + ] + } + ], + "source": [ + "wc -l ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "e37a7a54-6049-4a86-808d-37a3f64721ac", + "metadata": {}, + "source": [ + "Saved all of the esearch queries to clinvar_esearch.sh . 232 of them" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6c56c9f6-a6f5-487f-b474-a0dcf4b0f763", + "metadata": {}, + "outputs": [], + "source": [ + "chmod +x ClinVar/clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "78286511-609e-4f02-9717-4d094e9feebb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 232 ClinVar/clinvar_esearch.sh\n" + ] + } + ], + "source": [ + "wc -l ClinVar/clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "321b9cb0-0cc4-4442-bcbc-01e0d7a2f50c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ClinVar/./clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d6ae0dde-45c0-4853-b5dc-11ed6d195eec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "376308 is empty\n", + "376242 is empty\n", + "376235 is empty\n", + "376233 is empty\n", + "375895 
# Keep only the ClinVar IDs whose docsum XML exists and is non-empty.
#
# This replaces the hand-written list of one `sed -i '' '/^<id>$/d'` call per
# missing record: the IDs to drop are derived from the files on disk, and no
# in-place sed is needed (the `-i ''` form is specific to BSD/macOS sed).
id_file="ClinVar/ClinVar_id.txt"
kept_ids=$(mktemp)

while IFS= read -r clinvar_id; do
    # -s is false for missing and for empty files; blank lines are dropped
    # too, because "ClinVar/.xml" does not exist.
    if [ -s "ClinVar/${clinvar_id}.xml" ]; then
        printf '%s\n' "$clinvar_id"
    fi
done < "$id_file" > "$kept_ids"

# Replace the list only after the whole file has been read.
mv "$kept_ids" "$id_file"
214 - 2 = 212 which is how many ids we have" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1371ee83-ef5a-48fa-b43d-f880a744a5ae", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d4e6c38d-a397-423a-ac26-02b1240cab25", + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "import os\n", + "\n", + "# Paths\n", + "id_file = \"ClinVar/ClinVar_id.txt\"\n", + "input_folder = \"ClinVar\"\n", + "output_file = \"ClinVar_parsed_output.tsv\"\n", + "\n", + "# Read all IDs from the input file\n", + "with open(id_file, \"r\") as f:\n", + " clinvar_ids = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Prepare output file\n", + "with open(output_file, \"w\") as out:\n", + " # Write header\n", + " out.write(\"ClinVar_ID\\tseq_id\\tposition\\tref\\talt\\n\")\n", + "\n", + " for cid in clinvar_ids:\n", + " xml_path = os.path.join(input_folder, f\"{cid}.xml\")\n", + " if not os.path.exists(xml_path):\n", + " print(f\"⚠️ File not found: {xml_path}\")\n", + " continue\n", + "\n", + " try:\n", + " # Parse XML\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + "\n", + " # Find all canonical_spdi tags\n", + " for spdi in root.iter(\"canonical_spdi\"):\n", + " text = spdi.text\n", + " if text and \":\" in text:\n", + " parts = text.split(\":\")\n", + " if len(parts) == 4:\n", + " seq_id, pos, ref, alt = parts\n", + " out.write(f\"{cid}\\t{seq_id}\\t{pos}\\t{ref}\\t{alt}\\n\")\n", + "\n", + " except ET.ParseError as e:\n", + " print(f\"❌ Parse error in {cid}.xml: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ad902e02-e988-4b69-ac36-435bfd317c9f", + "metadata": {}, + "outputs": [], + "source": [ + "parsed = 
['16928','16929','183391','183393','183395','8823','420108','9409','376307','220711','376310','376305','376303','77637','376384','233484','127526','182409','376306','182423','17577','17576','17580','17587','17588','17579','17583','17582','376231','17584','376232','17589','17578','376228','177620','16609','45263','16613','16339','16359','16332','16333','16342','16348','16276','16273','16272','375972','16274','15933','15934','15935','15936','801','184937','802','12602','12613','35554','180848','160364','376033','219296','9834','9381','39571','39572','14801','13860','13863','13852','12582','12583','12580','12578','16677','16685','16686','16688','186141','13881','13882','13886','13883','376126','13888','13889','13890','162466','162468','162465','375876','13901','13900','373003','39648','73058','375874','177778','162469','162470','5286','225431','225433','225434','225432','31944','13655','13652','13653','13659','91945','12674','164995','13244','13245','13246','13247','13251','13250','13249','409162','418436','7829','427590','187657','7814','7837','7836','7838','7833','189486','428256','186396','404151','375958','7813','189403','185200','189484','7815','92828','189448','9511','9512','13087','428681','13919','13911','38629','37102','13951','8117','8118','13961','375941','12511','213936','217016','12374','12356','12366','12347','12365','43594','12364','12355','127819','376570','12372','2216','43604','93326','2223','417961','14464','6390','41166','41209','4893','4886','4892','161992','161993','161995']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ae292770-d8b9-4d11-a20a-b8d336d2aed3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "185" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(parsed)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "57ccde64-cc40-4c96-b334-455be7752d3f", + "metadata": {}, + "outputs": [], + "source": [ + "remaining 
= [id for id in clinvar_ids if id not in parsed]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "49d32e6a-58f7-4325-b779-592fdb9addc6", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir ClinVar_remaining" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "28893df1-1863-4442-8d3c-3c944cba9244", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Copied: 268075.xml\n", + "✅ Copied: 150740.xml\n", + "✅ Copied: 59680.xml\n", + "✅ Copied: 59682.xml\n", + "✅ Copied: 148363.xml\n", + "✅ Copied: 58696.xml\n", + "✅ Copied: 57282.xml\n", + "✅ Copied: 59782.xml\n", + "✅ Copied: 153718.xml\n", + "✅ Copied: 148679.xml\n", + "✅ Copied: 16270.xml\n", + "✅ Copied: 59715.xml\n", + "✅ Copied: 394884.xml\n", + "✅ Copied: 153231.xml\n", + "✅ Copied: 151754.xml\n", + "✅ Copied: 149554.xml\n", + "✅ Copied: 153441.xml\n", + "✅ Copied: 148269.xml\n", + "✅ Copied: 57074.xml\n", + "✅ Copied: 394609.xml\n", + "✅ Copied: 58030.xml\n", + "✅ Copied: 58029.xml\n", + "✅ Copied: 58028.xml\n", + "✅ Copied: 441904.xml\n", + "✅ Copied: 146814.xml\n", + "✅ Copied: 144406.xml\n", + "✅ Copied: 57042.xml\n" + ] + } + ], + "source": [ + "import os\n", + "import shutil \n", + "# Paths\n", + "source_dir = \"ClinVar\"\n", + "dest_dir = \"ClinVar_remaining\"\n", + "\n", + "# Ensure destination folder exists\n", + "os.makedirs(dest_dir, exist_ok=True)\n", + "\n", + "# Iterate and copy files\n", + "for clinvar_id in remaining:\n", + " src = os.path.join(source_dir, f\"{clinvar_id}.xml\")\n", + " dst = os.path.join(dest_dir, f\"{clinvar_id}.xml\")\n", + "\n", + " if os.path.exists(src):\n", + " shutil.copy(src, dst)\n", + " print(f\"✅ Copied: {clinvar_id}.xml\")\n", + " else:\n", + " print(f\"⚠️ Missing: {clinvar_id}.xml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8cd41b62-cb7b-4380-9d79-de1befc1637c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + " 27\n" + ] + } + ], + "source": [ + "!ls Clinvar_remaining | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "eb58056e-71ad-4602-847f-457366f2b963", + "metadata": {}, + "outputs": [], + "source": [ + "!cat Clinvar_remaining/* > Clinvar_remaining/all_remaining_variants.xml" + ] + }, + { + "cell_type": "markdown", + "id": "2b779d44-a620-42c8-8f0c-5ab96dcec165", + "metadata": {}, + "source": [ + "They are all copy number gain variations. Nothing that I can do for this project. So we will stick with our 185 parsed" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "29e34b3a-8a05-448a-a22f-6f9a0712f221", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 186 ClinVar_parsed_output.tsv\n" + ] + } + ], + "source": [ + "!wc -l ClinVar_parsed_output.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb3f65c6-6c01-4aab-ba6b-b46c7834ff1e", + "metadata": {}, + "outputs": [], + "source": [ + "rm -r Clinvar_remaining" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db6bf93c-404b-43fd-8699-5cd8de1ae03b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a0610722-b83d-4d72-b0cf-75135eaa7141", + "metadata": {}, + "source": [ + "### dbSNP" + ] + }, + { + "cell_type": "markdown", + "id": "7b650b97-61c1-4db3-8e0c-9c4df6d4aac6", + "metadata": {}, + "source": [ + "Have to get the variants from OmimVar" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "470b0539-8f7f-49f0-8f95-e9293e3872d3", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "751381ad-54e6-4063-a1f2-71c4ffb2e617", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " Build250306-1408.1\n", + " \n", + " 1131690863\n", + " 
1131690863\n", + " 0\n", + " uncertain-significance,pathogenic\n", + " \n", + " \n", + " RB1\n", + " 5925\n", + " \n", + " \n", + " LOC112268118\n", + " 112268118\n", + " \n", + " \n", + " NC_000013.11\n", + " 13\n", + " EVA,CSS-BFX,CLINVAR\n", + " NC_000013.11:48362846:C:A,NC_000013.11:48362846:C:G,NC_000013.11:48362846:C:T\n", + " coding_sequence_variant,stop_gained,500B_downstream_variant,synonymous_variant,missense_variant,downstream_transcript_variant\n", + " by-cluster\n", + " HGVS=NC_000013.11:g.48362847C>A,NC_000013.11:g.48362847C>G,NC_000013.11:g.48362847C>T,NC_000013.10:g.48936983C>A,NC_000013.10:g.48936983C>G,NC_000013.10:g.48936983C>T,NG_009009.1:g.64101C>A,NG_009009.1:g.64101C>G,NG_009009.1:g.64101C>T,NM_000321.3:c.751C>A,NM_000321.3:c.751C>G,NM_000321.3:c.751C>T,NM_000321.2:c.751C>A,NM_000321.2:c.751C>G,NM_000321.2:c.751C>T,NM_001407166.1:c.751C>A,NM_001407166.1:c.751C>G,NM_001407166.1:c.751C>T,NM_001407165.1:c.751C>A,NM_001407165.1:c.751C>G,NM_001407165.1:c.751C>T,NP_000312.2:p.Arg251Gly,NP_000312.2:p.Arg251Ter|SEQ=[C/A/G/T]|LEN=1|GENE=RB1:5925,LOC112268118:112268118\n", + " 9606\n", + " 150\n", + " 157\n", + " 2017/07/17 11:16\n", + " 2024/11/03 17:09\n", + " 2137537937,6403986513,8442109874,8936184886\n", + " N\n", + " snv\n", + " 13:48362847\n", + " 13:48936983\n", + " 1131690863\n", + " 1\n", + " 0048362847\n", + " 0\n", + " \n", + "\n" + ] + } + ], + "source": [ + "esearch -db snp -query rs1131690863 | efetch -format docsum" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5cc99d6c-79f3-49fe-8bfa-ddda5821870a", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir dbSNP" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0fe72bad-ed7c-4785-a475-2b39cc31974b", + "metadata": {}, + "outputs": [], + "source": [ + "grep dbSNP parsed_variants.tsv | cut -f3 > dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6f19e412-c93a-498e-8995-e8ec7b2a2398", + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 201 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "f9f1615b-81eb-48a7-adfd-b6f5a0161e79", + "metadata": {}, + "source": [ + "Added from Omim and removed repeats" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bb0fcc0-8f66-4fc2-9b7a-02b321194636", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 761 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e2e729f7-4bec-4f40-986c-fd0103186030", + "metadata": {}, + "outputs": [], + "source": [ + "# Saved the scripts to download all 761\n", + "chmod +x dbSNP/dbSNP_search.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2b271743-e43d-46ef-b689-2f928007cb4f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b(B\u001b[m\u001b[31m\u001b[1m\u001b[7m ERROR: \u001b(B\u001b[m\u001b[31m\u001b[1m Missing -db argument\u001b(B\u001b[m\n", + "\u001b(B\u001b[m\u001b[31m\u001b[1m\u001b[7m ERROR: \u001b(B\u001b[m\u001b[31m\u001b[1m Missing -db argument\u001b(B\u001b[m\n" + ] + } + ], + "source": [ + "dbSNP/./dbSNP_search.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1bf2df42-f562-4a29-85b2-7caea7919d58", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "if ! 
test -f dbSNP/\"$p\".xml; then\n", + " echo \"$p does not exist.\"\n", + "fi\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1b8d5d2d-83b1-4e23-8595-00c9ad01148b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs121908237 is empty\n", + "rs137852480 is empty\n", + "rs13785281 is empty\n" + ] + } + ], + "source": [ + "while read p; do\n", + "[ -s dbSNP/\"$p\".xml ] || echo \"$p is empty\"\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "8a5de403-f6da-402b-a97a-32127db07014", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "esearch -db snp -query rs121908237 | efetch -format docsum > dbSNP/rs121908237.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e3dd144a-4aaa-4d72-84ee-2c43ce48f627", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db snp -query rs137852480 | efetch -format docsum > dbSNP/rs137852480.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "93b02ebc-fe12-47a5-aa35-1c51445fcdcf", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db snp -query rs13785281 | efetch -format docsum > dbSNP/rs13785281.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "5bc843db-1b32-4ec7-82f6-d1dec6589415", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs13785281 is empty\n" + ] + } + ], + "source": [ + "while read p; do\n", + "[ -s dbSNP/\"$p\".xml ] || echo \"$p is empty\"\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "e55ec094-ee54-459d-92b0-2d767c1c428e", + "metadata": {}, + "source": [ + "rs13785281 is not found and is removed from the id file" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0558f5d9-0627-4187-a451-90111fa2b1d9", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + " 760 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "1f09c627-0e45-4632-9918-0ee00e34350b", + "metadata": {}, + "outputs": [], + "source": [ + "while read -r p; do\n", + " if ! grep -q SPDI \"dbSNP/$p.xml\"; then\n", + " echo \"$p\"\n", + " fi\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "1873b4d1-96bd-431a-98ad-b47c361bbefb", + "metadata": {}, + "source": [ + "No output means every file has SPDI. yay" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bfd640d-765a-4cb8-a459-27c6ff897572", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9a487229-2a65-4437-9c58-639774197373", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import xml.etree.ElementTree as ET\n", + "\n", + "# File paths\n", + "input_ids_file = \"dbSNP/dbSNP_id.txt\"\n", + "input_folder = \"dbSNP\"\n", + "output_file = \"dbSNP_output.tsv\"\n", + "\n", + "# Read SNP IDs\n", + "with open(input_ids_file, \"r\") as f:\n", + " dbsnp_ids = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Open TSV output file\n", + "with open(output_file, \"w\") as out:\n", + " out.write(\"dbsnp_id\\tsequence_id\\tposition\\tref\\talt\\n\")\n", + "\n", + " for dbsnp_id in dbsnp_ids:\n", + " xml_path = os.path.join(input_folder, f\"{dbsnp_id}.xml\")\n", + "\n", + " if not os.path.exists(xml_path):\n", + " print(f\"⚠️ Missing: {xml_path}\")\n", + " continue\n", + "\n", + " try:\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + "\n", + " for spdi in root.iter(\"SPDI\"):\n", + " if spdi.text:\n", + " spdi_items = spdi.text.strip().split(\",\")\n", + " for item in spdi_items:\n", + " parts = item.strip().split(\":\")\n", + " if len(parts) == 4:\n", + " seq_id, pos, ref, alt = 
parts\n", + " out.write(f\"{dbsnp_id}\\t{seq_id}\\t{pos}\\t{ref}\\t{alt}\\n\")\n", + " else:\n", + " print(f\"⚠️ Invalid SPDI format in {dbsnp_id}: {item}\")\n", + "\n", + " except ET.ParseError as e:\n", + " print(f\"❌ Parse error in {dbsnp_id}.xml: {e}\")\n", + " except Exception as e:\n", + " print(f\"❌ Unexpected error in {dbsnp_id}.xml: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2cbf7146-7d94-4905-adc0-f7d1e6443074", + "metadata": {}, + "source": [ + "Removed the duplicate lines and am left with 1408 mutations" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7727dcc2-ed83-4f8d-808e-d8e24643a53a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1409 dbSNP_output.tsv\n" + ] + } + ], + "source": [ + "!wc -l dbSNP_output.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9687bc9-97e0-41e9-82a1-5cfb986ae13b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c4bd83bb-deea-41c0-87dc-d0982b0cc00b", + "metadata": {}, + "source": [ + "### COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "65fb0b92-2e7b-4080-935a-e74b58bf0329", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "645e17ee-fe7c-4b48-bece-3c17de3dbd9c", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "d04ca4ce-29ab-446d-8df9-95984f3c403f", + "metadata": {}, + "outputs": [], + "source": [ + "grep COSM parsed_variants.tsv | cut -f 3 > COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "12380e56-6f2a-4f8d-9bcf-f97275e6e39b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 202 COSM/COSM_ids.txt\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_ids.txt" + ] + }, + { + 
"cell_type": "code", + "execution_count": 34, + "id": "3c63290a-2d6d-4cd7-949d-ae3fe77f0136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1677139\n", + "1989836\n", + "12523\n", + "13800\n", + "12475\n", + "12504\n", + "12506\n", + "13281\n", + "12512\n", + "12476\n" + ] + } + ], + "source": [ + "head COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "29147778-754b-4ab6-a4ef-49cd5f314503", + "metadata": {}, + "outputs": [], + "source": [ + "while read id; do\n", + "curl --silent \"https://rest.ensembl.org/variation/human/\"$id\"?content-type=application/json\" > COSM/\"$id\".txt\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "df9c5163-8841-4279-940a-86d74b69f6a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 203\n" + ] + } + ], + "source": [ + "ls COSM/* | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "bcc33132-cb25-49a7-8615-d2dc32278e4d", + "metadata": {}, + "source": [ + "download the COSM database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/completetargetedscreensmutanttsv" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "c95fce22-ab4d-44d7-9321-d10bf1dfb368", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSM/COSM_ids.txt > COSM/COSM_ids_tab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "65045648-4b11-4264-a11a-3b47d58e0bc6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "grep -F -f COSM/COSM_ids_tab.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh38.tsv > COSM_matched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "d2c438e0-9110-4f04-b155-71bf04e399f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
" 372391 COSM_matched.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM_matched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "b7a458ce-3980-4ba8-a74f-e16f4534a6cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 160 COSM_matched_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM_matched.tsv > COSM_matched_id.txt\n", + "sort -u COSM_matched_id.txt > COSM_matched_id_unique.txt\n", + "wc -l COSM_matched_id_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "22e104b8-587c-4b11-bb40-ef05a4fd1899", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13786\n", + "COSM13675\n", + "COSM13224\n", + "COSM13723\n", + "COSM13474\n", + "COSM12505\n", + "COSM785\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM878\n", + "COSM965\n", + "COSM4766182\n" + ] + } + ], + "source": [ + "while read -r p; do\n", + " if ! 
grep -q $p COSM_matched_id_unique.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "22d892ef-d82b-49e7-a870-f909c3b4bce6", + "metadata": {}, + "outputs": [], + "source": [ + "echo 'COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13786\n", + "COSM13675\n", + "COSM13224\n", + "COSM13723\n", + "COSM13474\n", + "COSM12505\n", + "COSM785\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM878\n", + "COSM965\n", + "COSM4766182' > COSM_unmatched_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "de79b3a9-2aae-46c1-b5e9-3d5e6c6ea7ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 19 COSM_unmatched_id.txt\n" + ] + } + ], + "source": [ + "wc -l COSM_unmatched_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "46f5194d-d9db-4896-8d9b-5145e77b95ac", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSM_unmatched_id.txt > COSM_unmatched_tab_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "c1980527-71c4-460f-8798-7b75da61dab4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "grep -F -f COSM_unmatched_tab_id.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh37.tsv > COSM_unmatched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e30fa788-2015-4eb7-bf66-5b37a50531f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 207 COSM_unmatched.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM_unmatched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "e942fe15-a77f-49a1-be18-7f11f8b97cfd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 
5 COSM_unmatched_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM_unmatched.tsv > COSM_unmatched_id_parsed.txt\n", + "sort -u COSM_unmatched_id_parsed.txt > COSM_unmatched_id_unique.txt\n", + "wc -l COSM_unmatched_id_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "ee9c11d0-0ea2-4c3d-b25b-12cffff1d877", + "metadata": {}, + "source": [ + "**Removing the COSM unmatched IDs from the text file**" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "5986b990-3b24-40ac-a8c9-4f2eca0c5203", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "c80b60e7-34e2-46bb-b6a4-7ad4732f7737", + "metadata": {}, + "outputs": [], + "source": [ + "cat COSM/COSM_matched.tsv >> COSM/COSM_total_parsed.tsv\n", + "cat COSM/COSM_unmatched.tsv >> COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "6daae3a8-092c-4652-ac4c-d78ed3c0bdea", + "metadata": {}, + "outputs": [], + "source": [ + "cp COSM/COSM_ids.txt COSM/COSM_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "9f91b602-8beb-4556-9605-893d92117faa", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13675\n", + "COSM13224\n", + "COSM13474\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM5159\n", + "COSM5313\n", + "COSM5154\n", + "COSM5105\n", + "COSM5204\n", + "COSM5141\n", + "COSM5283\n", + "COSM5079\n", + "COSM5046\n", + "COSM86063\n", + "COSM5142\n", + "COSM5322\n", + "COSM23625\n", + "COSM3736941\n", + "COSM5052\n", + "COSM1167954\n", + "COSM5143\n", + "COSM5119\n", + "COSM5148\n", + "COSM861\n", + "COSM878\n", + "COSM859\n", + "COSM860\n", + "COSM862\n", + "COSM864\n", + "COSM965\n", + "COSM1237919\n", + "COSM13152\n", 
+ "COSM33076\n", + "COSM17983\n", + "COSM25676\n", + "COSM17855\n", + "COSM142849\n", + "COSM4387483\n", + "COSM4766182\n" + ] + } + ], + "source": [ + "while read -r p; do\n", + " if ! grep -q $p COSM/COSM_total_parsed.tsv; then\n", + " echo $p\n", + " sed -i '' '/'$p'/d' COSM/COSM_ids_final.txt\n", + " fi\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "98b263d1-0b58-4389-b043-accaf7b300db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 132 COSM/COSM_ids_final.txt\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "d92b5492-3048-46e1-a32a-6a7c0137aa19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 166 COSM/COSM_total_parsed_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM/COSM_total_parsed.tsv > COSM/COSM_total_parsed_id.txt\n", + "sort -u COSM/COSM_total_parsed_id.txt > COSM/COSM_total_parsed_id_unique.txt\n", + "wc -l COSM/COSM_total_parsed_id_unique.txt\n", + "\n", + "rm COSM/COSM_total_parsed_id.txt\n", + "rm COSM/COSM_total_parsed_id_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "6e6a0d3b-bc90-4164-8363-98f0eded180e", + "metadata": {}, + "source": [ + "### Parsing the Matched TSV File" + ] + }, + { + "cell_type": "markdown", + "id": "8febdfca-ae58-4874-a2ef-6446deb91273", + "metadata": {}, + "source": [ + "Got it into excel and deleting columns that don't matter" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "e6a403e1-75be-48b6-b4f0-4d454cba047e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1140 COSM/COSM_total_parsed.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "677cab77-1631-4035-9601-4c59d942a0c0", + "metadata": 
{}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8123bf3-f57c-4a1f-aa7a-25dea97371cb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6bca1dde-ae4d-461f-b340-32807c35f8b3", + "metadata": {}, + "source": [ + "### COSF" + ] + }, + { + "cell_type": "markdown", + "id": "b5ec170c-a7f4-4216-b6ef-f7836232481f", + "metadata": {}, + "source": [ + "download the COSF database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "0b12424c-c36c-4c52-b395-e93edec5d983", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "grep COSF parsed_variants.tsv | cut -f 3 > COSF/cosf_ids_temp.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "7b480d0f-3d6d-4ff5-98bb-1a9f4e570c4f", + "metadata": {}, + "outputs": [], + "source": [ + "sort -u COSF/cosf_ids_temp.txt > COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "09e3e382-150b-40da-93ab-71d6aff06fdf", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "c0b89452-303c-4ec3-8f77-88d9576d8173", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "echo COSF$p >> COSF/cosf_ids.txt\n", + "\n", + "done < COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "7778fc82-53fe-4ebd-a234-47434cc6bb3f", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSF/cosf_ids_temp.txt\n", + "rm COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "e8241f09-30a9-43e8-a4c3-0d1a340aad2f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSF121\n", + "COSF1216\n", + "COSF1220\n", + "COSF1224\n", + "COSF1231\n", + "COSF125\n", + 
"COSF1271\n", + "COSF128\n", + "COSF1319\n", + "COSF1320\n" + ] + } + ], + "source": [ + "head COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "ab74bdbe-65f6-4596-a425-54868ed6859c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 65 COSF/cosf_ids.txt\n" + ] + } + ], + "source": [ + "wc -l COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0fc6fc-41d2-4a64-b214-91d3359f0db0", + "metadata": {}, + "outputs": [], + "source": [ + "cat Cosmic_Fusion_v101_GRCh38.tsv >> COSF/Cosmic_Fusion.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "01d86304-9e9a-4e83-a5cc-e8f221c8c36c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 18M\tCOSF/Cosmic_Fusion.tsv\n" + ] + } + ], + "source": [ + "du -h COSF/Cosmic_Fusion.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "e848ea6f-cec3-482d-8ff4-923dbbf6ce3b", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSF/cosf_ids.txt > COSF/cosf_ids_tab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "d2d8a7b5-6c29-459c-8ff4-94abed7250e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Extracted COSF entries saved to: COSF/kegg_data_cosf.tsv\n" + ] + } + ], + "source": [ + "#!/bin/bash\n", + "\n", + "# Paths (edit these as needed)\n", + "COSF_ID_FILE=\"COSF/cosf_ids_tab.txt\"\n", + "COSMIC_TSV=\"COSF/Cosmic_Fusion.tsv\"\n", + "OUTPUT_TSV=\"COSF/kegg_data_cosf.tsv\"\n", + "\n", + "# Header based on README\n", + 
"HEADER=\"COSMIC_SAMPLE_ID\\tSAMPLE_NAME\\tCOSMIC_PHENOTYPE_ID\\tCOSMIC_FUSION_ID\\tFUSION_SYNTAX\\tFIVE_PRIME_CHROMOSOME\\tFIVE_PRIME_STRAND\\tFIVE_PRIME_TRANSCRIPT_ID\\tFIVE_PRIME_GENE_SYMBOL\\tFIVE_PRIME_LAST_OBSERVE_EXON\\tFIVE_PRIME_GENOME_START_FROM\\tFIVE_PRIME_GENOME_START_TO\\tFIVE_PRIME_GENOME_STOP_FROM\\tFIVE_PRIME_GENOME_STOP_TO\\tTHREE_PRIME_CHROMOSOME\\tTHREE_PRIME_STRAND\\tTHREE_PRIME_TRANSCRIPT_ID\\tTHREE_PRIME_GENE_SYMBOL\\tTHREE_PRIME_FIRST_OBSERVE_EXON\\tTHREE_PRIME_GENOME_START_FROM\\tTHREE_PRIME_GENOME_START_TO\\tTHREE_PRIME_GENOME_STOP_FROM\\tTHREE_PRIME_GENOME_STOP_TO\\tFUSION_TYPE\\tPUBMED_PMID\"\n", + "\n", + "# Write header to output\n", + "echo -e \"$HEADER\" > \"$OUTPUT_TSV\"\n", + "\n", + "grep -F -f $COSF_ID_FILE $COSMIC_TSV >> $OUTPUT_TSV\n", + "\n", + "echo \"✅ Extracted COSF entries saved to: $OUTPUT_TSV\"" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "a1175ebc-4bcb-499d-bd07-8b0b77df9954", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 29 COSF/kegg_data_cosf_parsed_uniq.txt\n" + ] + } + ], + "source": [ + "cut -f 4 COSF/kegg_data_cosf.tsv > COSF/kegg_data_cosf_parsed.txt\n", + "sort -u COSF/kegg_data_cosf_parsed.txt > COSF/kegg_data_cosf_parsed_uniq.txt\n", + "wc -l COSF/kegg_data_cosf_parsed_uniq.txt\n", + "\n", + "rm COSF/kegg_data_cosf_parsed.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "60e6f9d9-b4d9-4a90-9bfd-8523a26f2d85", + "metadata": {}, + "outputs": [], + "source": [ + "cp COSF/cosf_ids.txt COSF/cosf_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "0673d326-99a6-4957-82e4-2151b4a5f2aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSF1220\n", + "COSF1224\n", + "COSF125\n", + "COSF128\n", + "COSF1330\n", + "COSF1490\n", + "COSF154\n", + "COSF155\n", + "COSF166\n", + "COSF168\n", + "COSF1756\n", + "COSF1758\n", + "COSF1805\n", 
+ "COSF187\n", + "COSF189\n", + "COSF1949\n", + "COSF1960\n", + "COSF2067\n", + "COSF2124\n", + "COSF218\n", + "COSF220\n", + "COSF2246\n", + "COSF2248\n", + "COSF248\n", + "COSF300\n", + "COSF302\n", + "COSF355\n", + "COSF356\n", + "COSF394\n", + "COSF396\n", + "COSF463\n", + "COSF501\n", + "COSF504\n", + "COSF528\n", + "COSF806\n", + "COSF808\n" + ] + } + ], + "source": [ + "# Drop every ID from cosf_ids_final.txt that has no exact match among the\n", + "# unique COSMIC fusion IDs extracted from the COSMIC export.\n", + "# -x/-F force a whole-line, fixed-string comparison: a plain substring match\n", + "# would report a short ID as present whenever a longer ID contains it.\n", + "while read -r p; do\n", + "    if ! grep -qxF -- \"$p\" COSF/kegg_data_cosf_parsed_uniq.txt; then\n", + "        echo \"$p\"\n", + "        # Anchor the pattern so only the line equal to this ID is removed\n", + "        # (sed -i '' is the BSD/macOS in-place form used by this notebook).\n", + "        sed -i '' \"/^${p}\\$/d\" COSF/cosf_ids_final.txt\n", + "    fi\n", + "done < COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "e1b77653-09e4-40e6-8ac3-2cf3be1442cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 29 COSF/cosf_ids_final.txt\n" + ] + } + ], + "source": [ + "wc -l COSF/cosf_ids_final.txt" + ] + }, + { + "cell_type": "markdown", + "id": "670cec3f-237f-4789-a725-7d4d5a366815", + "metadata": {}, + "source": [ + "I was looking at the data and they don't give any proper ways to get the exact nt sequence, so I am leaving this out." + ] + }, + { + "cell_type": "markdown", + "id": "e18961b9-735c-47c2-bd68-0b6184c05375", + "metadata": {}, + "source": [ + "# Matching Variant and Nt sequence to each Network/Pathway" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be8b5cfb-3309-4abf-a7ae-250efae122a0", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "429d5f3b-b9e2-4ac4-9992-e6755a578bf6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c8cee289-0c1c-4091-81d8-37121fcd8644", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceID
010133v1OmimVar10133
11019v1ClinVar268075
21019v1ClinVar150740
31019v1dbVarnsv917029
41019v2ClinVar16928
............
7839817v1COSM6196638
784999v2COSM4766182
785999v2COSM1379150
786999v2COSM4766211
787999v2COSM4766271
\n", + "

788 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID\n", + "0 10133v1 OmimVar 10133\n", + "1 1019v1 ClinVar 268075\n", + "2 1019v1 ClinVar 150740\n", + "3 1019v1 dbVar nsv917029\n", + "4 1019v2 ClinVar 16928\n", + ".. ... ... ...\n", + "783 9817v1 COSM 6196638\n", + "784 999v2 COSM 4766182\n", + "785 999v2 COSM 1379150\n", + "786 999v2 COSM 4766211\n", + "787 999v2 COSM 4766271\n", + "\n", + "[788 rows x 3 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parsed_variants = pd.read_csv(\"parsed_variants.tsv\", sep='\\t')\n", + "parsed_variants" + ] + }, + { + "cell_type": "markdown", + "id": "f0329361-f9df-471f-8ff0-a7265ada0ad2", + "metadata": {}, + "source": [ + "### ClinVar" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7003f600-8b18-4f50-b041-3dd71af940e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDseq_idpositionrefalt
016928NC_000012.1257751647GA
116929NC_000012.1257751646CT
2183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3183393NC_000012.1212718044CT
4183395NC_000012.1212718210CTCTCT
..................
1804886NC_000011.1067483197CT
1814892NC_000011.1067490803CA
182161992NC_000015.1050490442TC
183161993NC_000015.1050490443CG
184161995NC_000015.1050490449CG
\n", + "

185 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " ID seq_id position ref \\\n", + "0 16928 NC_000012.12 57751647 G \n", + "1 16929 NC_000012.12 57751646 C \n", + "2 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 183393 NC_000012.12 12718044 C \n", + "4 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... \n", + "180 4886 NC_000011.10 67483197 C \n", + "181 4892 NC_000011.10 67490803 C \n", + "182 161992 NC_000015.10 50490442 T \n", + "183 161993 NC_000015.10 50490443 C \n", + "184 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 5 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_data = pd.read_csv(\"ClinVar_parsed_output.tsv\",sep='\\t')\n", + "clinvar_data = clinvar_data.rename(columns={\"ClinVar_ID\": \"ID\"})\n", + "clinvar_data['ID'] = clinvar_data['ID'].astype('string')\n", + "clinvar_data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f0319623-fcb7-4d14-a4ec-cbb9f117af50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing ClinVar variant is 49\n" + ] + } + ], + "source": [ + "# Ensure ClinVar_ID is treated as string to avoid dtype mismatch\n", + "clinvar_ids = clinvar_data[\"ID\"].astype(str).unique()\n", + "\n", + "missing_num = 0\n", + "\n", + "# Iterate and print missing ClinVar IDs\n", + "for _, row in parsed_variants.iterrows():\n", + " if row[\"Source\"] == \"ClinVar\" and str(row[\"ID\"]) not in clinvar_ids:\n", + " missing_num+=1\n", + "print(f'Number of missing ClinVar variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "0d757a26-2a7d-41a8-9914-4bcbd8f166f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
1809049v1ClinVar4886NC_000011.1067483197CT
1819049v1ClinVar4892NC_000011.1067490803CA
1829101v1ClinVar161992NC_000015.1050490442TC
1839101v1ClinVar161993NC_000015.1050490443CG
1849101v1ClinVar161995NC_000015.1050490449CG
\n", + "

185 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 G \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 C \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 C \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... ... ... \n", + "180 9049v1 ClinVar 4886 NC_000011.10 67483197 C \n", + "181 9049v1 ClinVar 4892 NC_000011.10 67490803 C \n", + "182 9101v1 ClinVar 161992 NC_000015.10 50490442 T \n", + "183 9101v1 ClinVar 161993 NC_000015.10 50490443 C \n", + "184 9101v1 ClinVar 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 7 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_final = parsed_variants.merge(clinvar_data, on='ID')\n", + "clinvar_final" + ] + }, + { + "cell_type": "markdown", + "id": "0942ebc8-dfae-4f96-9762-2b83c01b5e29", + "metadata": {}, + "source": [ + "### dbSNP" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "33f3303c-51a4-46d2-9941-8331aee362c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDdbsnp_idseq_idpositionrefalt
0104311rs661NC_000014.973217224GA
1104311rs661NC_000014.973217224GT
2606463rs364897NC_000001.11155238214TA
3606463rs364897NC_000001.11155238214TC
4606463rs368060NC_000001.11155235216CG
.....................
1403rs672601307rs672601307NC_000015.1050490442TC
1404rs672601308rs672601308NC_000015.1050490443CG
1405rs672601308rs672601308NC_000015.1050490443CT
1406rs672601311rs672601311NC_000015.1050490449CG
1407rs672601311rs672601311NC_000015.1050490449CT
\n", + "

1408 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID dbsnp_id seq_id position ref alt\n", + "0 104311 rs661 NC_000014.9 73217224 G A\n", + "1 104311 rs661 NC_000014.9 73217224 G T\n", + "2 606463 rs364897 NC_000001.11 155238214 T A\n", + "3 606463 rs364897 NC_000001.11 155238214 T C\n", + "4 606463 rs368060 NC_000001.11 155235216 C G\n", + "... ... ... ... ... .. ..\n", + "1403 rs672601307 rs672601307 NC_000015.10 50490442 T C\n", + "1404 rs672601308 rs672601308 NC_000015.10 50490443 C G\n", + "1405 rs672601308 rs672601308 NC_000015.10 50490443 C T\n", + "1406 rs672601311 rs672601311 NC_000015.10 50490449 C G\n", + "1407 rs672601311 rs672601311 NC_000015.10 50490449 C T\n", + "\n", + "[1408 rows x 6 columns]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_data = pd.read_csv(\"dbSNP_output.tsv\",sep='\\t')\n", + "dbsnp_data = dbsnp_data.rename(columns={\"True Id\": \"ID\",\"sequence_id\":'seq_id'})\n", + "dbsnp_data['ID'] = dbsnp_data['ID'].astype('string')\n", + "dbsnp_data" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "0223dcf2-3eba-43a1-847a-709f17b069a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing dbSNP and OmimVar variant is 244\n" + ] + } + ], + "source": [ + "# Treat IDs as strings to avoid a dtype mismatch with the dbSNP lookup table\n", + "dbsnp_data_ids = dbsnp_data[\"ID\"].astype(str).unique()\n", + "\n", + "# Count dbSNP/OmimVar variants whose ID is absent from the dbSNP table.\n", + "# The membership test must use dbsnp_data_ids; an earlier version of this\n", + "# cell tested against clinvar_ids, so re-run it to refresh the stored count.\n", + "is_dbsnp_source = parsed_variants[\"Source\"].isin([\"dbSNP\", \"OmimVar\"])\n", + "has_dbsnp_record = parsed_variants[\"ID\"].astype(str).isin(dbsnp_data_ids)\n", + "missing_num = int((is_dbsnp_source & ~has_dbsnp_record).sum())\n", + "print(f'Number of missing dbSNP and OmimVar variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "b146e274-ef72-42d0-ba06-c859786bda89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDdbsnp_idseq_idpositionrefalt
01019v2dbSNPrs11547328rs11547328NC_000012.1257751647GA
11019v2dbSNPrs11547328rs11547328NC_000012.1257751647GC
21019v2dbSNPrs11547328rs11547328NC_000012.1257751647GT
31019v2dbSNPrs104894340rs104894340NC_000012.1257751646CA
41019v2dbSNPrs104894340rs104894340NC_000012.1257751646CG
...........................
14179101v1dbSNPrs672601311rs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311rs672601311NC_000015.1050490449CT
14199217v1OmimVar605704rs74315431NC_000020.1158418317CT
14209217v1OmimVar605704rs281875284NC_000020.1158418288CG
14219217v1OmimVar605704rs281875284NC_000020.1158418288CT
\n", + "

1422 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID dbsnp_id seq_id position ref \\\n", + "0 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "1 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "2 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "3 1019v2 dbSNP rs104894340 rs104894340 NC_000012.12 57751646 C \n", + "4 1019v2 dbSNP rs104894340 rs104894340 NC_000012.12 57751646 C \n", + "... ... ... ... ... ... ... .. \n", + "1417 9101v1 dbSNP rs672601311 rs672601311 NC_000015.10 50490449 C \n", + "1418 9101v1 dbSNP rs672601311 rs672601311 NC_000015.10 50490449 C \n", + "1419 9217v1 OmimVar 605704 rs74315431 NC_000020.11 58418317 C \n", + "1420 9217v1 OmimVar 605704 rs281875284 NC_000020.11 58418288 C \n", + "1421 9217v1 OmimVar 605704 rs281875284 NC_000020.11 58418288 C \n", + "\n", + " alt \n", + "0 A \n", + "1 C \n", + "2 T \n", + "3 A \n", + "4 G \n", + "... .. \n", + "1417 G \n", + "1418 T \n", + "1419 T \n", + "1420 G \n", + "1421 T \n", + "\n", + "[1422 rows x 8 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_final = parsed_variants.merge(dbsnp_data, on='ID')\n", + "dbsnp_final" + ] + }, + { + "cell_type": "markdown", + "id": "4f41d1e1-6e7e-49c7-81b8-456875e0f40a", + "metadata": {}, + "source": [ + "### COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "99aa0a39-ce5d-4c4e-90a9-7b8f36672843", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GeneTranscriptIDCOSMIDNucChangeAAChangeChrStartEndStrandRefAlleleAltAlleleID
0CTNNB1ENST00000643031.1COSM5692c.134C>Ap.S45Y34122464641224646+CA5692
1CTNNB1ENST00000642248.1COSM5689c.134C>Gp.S45C34122464641224646+CG5689
2CDKN2AENST00000579755.1COSM13508c.375G>Ap.G125=92197102721971027-CT13508
3CTNNB1ENST00000396183.7COSM5681c.95A>Gp.D32G34122460741224607+AG5681
4CDKN2AENST00000530628.2COSM13807c.389G>Tp.G130V92197101321971013-CA13807
.......................................
1134CDKN2AENST00000579755.1COSM13723c.308G>Ap.G103E92197109321971093-CT13723
1135CDKN2AENST00000578845.2COSM13723c.112G>Ap.G38S92197109321971093-CT13723
1136CDKN2AENST00000579122.1COSM12505c.59C>Ap.A20E92197476821974768-GT12505
1137FLT3ENST00000380982.4COSM785c.2503G>Cp.D835H132859264228592642-CG785
1138CDKN2AENST00000579122.1COSM13723c.265G>Ap.G89S92197109321971093-CT13723
\n", + "

1139 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Gene TranscriptID COSMID NucChange AAChange Chr Start \\\n", + "0 CTNNB1 ENST00000643031.1 COSM5692 c.134C>A p.S45Y 3 41224646 \n", + "1 CTNNB1 ENST00000642248.1 COSM5689 c.134C>G p.S45C 3 41224646 \n", + "2 CDKN2A ENST00000579755.1 COSM13508 c.375G>A p.G125= 9 21971027 \n", + "3 CTNNB1 ENST00000396183.7 COSM5681 c.95A>G p.D32G 3 41224607 \n", + "4 CDKN2A ENST00000530628.2 COSM13807 c.389G>T p.G130V 9 21971013 \n", + "... ... ... ... ... ... .. ... \n", + "1134 CDKN2A ENST00000579755.1 COSM13723 c.308G>A p.G103E 9 21971093 \n", + "1135 CDKN2A ENST00000578845.2 COSM13723 c.112G>A p.G38S 9 21971093 \n", + "1136 CDKN2A ENST00000579122.1 COSM12505 c.59C>A p.A20E 9 21974768 \n", + "1137 FLT3 ENST00000380982.4 COSM785 c.2503G>C p.D835H 13 28592642 \n", + "1138 CDKN2A ENST00000579122.1 COSM13723 c.265G>A p.G89S 9 21971093 \n", + "\n", + " End Strand RefAllele AltAllele ID \n", + "0 41224646 + C A 5692 \n", + "1 41224646 + C G 5689 \n", + "2 21971027 - C T 13508 \n", + "3 41224607 + A G 5681 \n", + "4 21971013 - C A 13807 \n", + "... ... ... ... ... ... 
\n", + "1134 21971093 - C T 13723 \n", + "1135 21971093 - C T 13723 \n", + "1136 21974768 - G T 12505 \n", + "1137 28592642 - C G 785 \n", + "1138 21971093 - C T 13723 \n", + "\n", + "[1139 rows x 12 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_data = pd.read_csv(\"COSM/COSM_total_parsed.tsv\",sep='\\t')\n", + "cosm_data['ID'] = cosm_data['COSMID'].str[4:]\n", + "cosm_data" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "cc7b32a9-6742-4cfd-a075-cf61fc098cc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing COSM variant is 202\n" + ] + } + ], + "source": [ + "# Treat IDs as strings to avoid a dtype mismatch with the COSMIC lookup table\n", + "cosm_data_ids = cosm_data[\"ID\"].astype(str).unique()\n", + "\n", + "# Count COSM variants whose ID is absent from the COSMIC table.\n", + "# The membership test must use cosm_data_ids; an earlier version of this\n", + "# cell tested against clinvar_ids, so re-run it to refresh the stored count.\n", + "is_cosm_source = parsed_variants[\"Source\"] == \"COSM\"\n", + "has_cosm_record = parsed_variants[\"ID\"].astype(str).isin(cosm_data_ids)\n", + "missing_num = int((is_cosm_source & ~has_cosm_record).sum())\n", + "print(f'Number of missing COSM variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "322fb3ef-866a-4f5b-8a2d-3ee5d33e6276", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDGeneTranscriptIDCOSMIDNucChangeAAChangeChrStartEndStrandRefAlleleAltAllele
01019v2COSM1677139CDK4ENST00000312990.10COSM1677139c.70C>Tp.R24C125775164857751648-GA
11019v2COSM1677139CDK4ENST00000549606.5COSM1677139c.-158+527C>Tp.?125775164857751648-GA
21019v2COSM1677139CDK4ENST00000257904.10COSM1677139c.70C>Tp.R24C125775164857751648-GA
31019v2COSM1989836CDK4ENST00000312990.10COSM1989836c.71G>Ap.R24H125775164757751647-CT
41019v2COSM1989836CDK4ENST00000549606.5COSM1989836c.-158+528G>Ap.?125775164757751647-CT
.............................................
1134999v2COSM4766271CDH1ENST00000612417.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1135999v2COSM4766271CDH1ENST00000611625.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1136999v2COSM4766271CDH1ENST00000422392.6COSM4766271c.662A>Gp.D221G166880882368808823+AG
1137999v2COSM4766271CDH1ENST00000621016.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1138999v2COSM4766271CDH1ENST00000261769.9COSM4766271c.662A>Gp.D221G166880882368808823+AG
\n", + "

1139 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID Gene TranscriptID COSMID \\\n", + "0 1019v2 COSM 1677139 CDK4 ENST00000312990.10 COSM1677139 \n", + "1 1019v2 COSM 1677139 CDK4 ENST00000549606.5 COSM1677139 \n", + "2 1019v2 COSM 1677139 CDK4 ENST00000257904.10 COSM1677139 \n", + "3 1019v2 COSM 1989836 CDK4 ENST00000312990.10 COSM1989836 \n", + "4 1019v2 COSM 1989836 CDK4 ENST00000549606.5 COSM1989836 \n", + "... ... ... ... ... ... ... \n", + "1134 999v2 COSM 4766271 CDH1 ENST00000612417.4 COSM4766271 \n", + "1135 999v2 COSM 4766271 CDH1 ENST00000611625.4 COSM4766271 \n", + "1136 999v2 COSM 4766271 CDH1 ENST00000422392.6 COSM4766271 \n", + "1137 999v2 COSM 4766271 CDH1 ENST00000621016.4 COSM4766271 \n", + "1138 999v2 COSM 4766271 CDH1 ENST00000261769.9 COSM4766271 \n", + "\n", + " NucChange AAChange Chr Start End Strand RefAllele \\\n", + "0 c.70C>T p.R24C 12 57751648 57751648 - G \n", + "1 c.-158+527C>T p.? 12 57751648 57751648 - G \n", + "2 c.70C>T p.R24C 12 57751648 57751648 - G \n", + "3 c.71G>A p.R24H 12 57751647 57751647 - C \n", + "4 c.-158+528G>A p.? 12 57751647 57751647 - C \n", + "... ... ... .. ... ... ... ... \n", + "1134 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1135 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1136 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1137 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1138 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "\n", + " AltAllele \n", + "0 A \n", + "1 A \n", + "2 A \n", + "3 T \n", + "4 T \n", + "... ... 
\n", + "1134 G \n", + "1135 G \n", + "1136 G \n", + "1137 G \n", + "1138 G \n", + "\n", + "[1139 rows x 14 columns]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_final = parsed_variants.merge(cosm_data, on='ID')\n", + "cosm_final" + ] + }, + { + "cell_type": "markdown", + "id": "83db14d7-1688-4b9d-a47b-0afec2f57a10", + "metadata": {}, + "source": [ + "## Combining them together" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "468461b0-1b06-4993-9950-e5ba26b11aa0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
1809049v1ClinVar4886NC_000011.1067483197CT
1819049v1ClinVar4892NC_000011.1067490803CA
1829101v1ClinVar161992NC_000015.1050490442TC
1839101v1ClinVar161993NC_000015.1050490443CG
1849101v1ClinVar161995NC_000015.1050490449CG
\n", + "

185 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 G \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 C \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 C \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... ... ... \n", + "180 9049v1 ClinVar 4886 NC_000011.10 67483197 C \n", + "181 9049v1 ClinVar 4892 NC_000011.10 67490803 C \n", + "182 9101v1 ClinVar 161992 NC_000015.10 50490442 T \n", + "183 9101v1 ClinVar 161993 NC_000015.10 50490443 C \n", + "184 9101v1 ClinVar 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 7 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_final" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6cda9996-9725-4993-ab9e-ca8b74ced30a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2dbSNPrs11547328NC_000012.1257751647GA
11019v2dbSNPrs11547328NC_000012.1257751647GC
21019v2dbSNPrs11547328NC_000012.1257751647GT
31019v2dbSNPrs104894340NC_000012.1257751646CA
41019v2dbSNPrs104894340NC_000012.1257751646CG
........................
14179101v1dbSNPrs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311NC_000015.1050490449CT
14199217v1OmimVar605704NC_000020.1158418317CT
14209217v1OmimVar605704NC_000020.1158418288CG
14219217v1OmimVar605704NC_000020.1158418288CT
\n", + "

1422 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref alt\n", + "0 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G A\n", + "1 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G C\n", + "2 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G T\n", + "3 1019v2 dbSNP rs104894340 NC_000012.12 57751646 C A\n", + "4 1019v2 dbSNP rs104894340 NC_000012.12 57751646 C G\n", + "... ... ... ... ... ... .. ..\n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 50490449 C G\n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 50490449 C T\n", + "1419 9217v1 OmimVar 605704 NC_000020.11 58418317 C T\n", + "1420 9217v1 OmimVar 605704 NC_000020.11 58418288 C G\n", + "1421 9217v1 OmimVar 605704 NC_000020.11 58418288 C T\n", + "\n", + "[1422 rows x 7 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_final = dbsnp_final.drop(columns=['dbsnp_id'])\n", + "dbsnp_final" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "fac7aacc-79c6-4cef-8d4e-7be27bca34e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
14179101v1dbSNPrs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311NC_000015.1050490449CT
14199217v1OmimVar605704NC_000020.1158418317CT
14209217v1OmimVar605704NC_000020.1158418288CG
14219217v1OmimVar605704NC_000020.1158418288CT
\n", + "

1607 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 \n", + "... ... ... ... ... ... \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 50490449 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 50490449 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 58418317 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 58418288 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 58418288 \n", + "\n", + " ref alt \n", + "0 G A \n", + "1 C T \n", + "2 CAGGCGGAGCACCCCAAGCC CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 C T \n", + "4 CTCT CT \n", + "... ... ... \n", + "1417 C G \n", + "1418 C T \n", + "1419 C T \n", + "1420 C G \n", + "1421 C T \n", + "\n", + "[1607 rows x 7 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_dbsnp = pd.concat([clinvar_final, dbsnp_final])\n", + "clinvar_dbsnp" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "6956d533-6f6e-4526-9350-f4db614e14da", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp = clinvar_dbsnp.rename(columns={\"seq_id\":\"TranscriptID\",\"position\":\"Start\",\"ref\":\"RefAllele\",\"alt\":\"AltAllele\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "343b56f9-f078-47e6-9c88-5ceff0a8b537", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp[\"End\"] = clinvar_dbsnp[\"Start\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "8717fee6-a548-487b-87e9-f1fef6d2429a", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp['Chr'] = clinvar_dbsnp['TranscriptID'].str[7:9].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": 
"238baf78-d5f4-4c2b-80b4-ce2b4df67cde", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp = clinvar_dbsnp[['ENTRY', 'Source', 'ID', 'TranscriptID','Chr', 'Start', 'End','RefAllele','AltAllele']]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "a1482c60-3453-4f16-89bd-9fc73ba6b622", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDChrStartEndRefAlleleAltAllele
01019v2ClinVar16928NC_000012.12125775164757751647GA
11019v2ClinVar16929NC_000012.12125775164657751646CT
21027v3ClinVar183391NC_000012.12121271789612717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.12121271804412718044CT
41027v3ClinVar183395NC_000012.12121271821012718210CTCTCT
..............................
14179101v1dbSNPrs672601311NC_000015.10155049044950490449CG
14189101v1dbSNPrs672601311NC_000015.10155049044950490449CT
14199217v1OmimVar605704NC_000020.11205841831758418317CT
14209217v1OmimVar605704NC_000020.11205841828858418288CG
14219217v1OmimVar605704NC_000020.11205841828858418288CT
\n", + "

1607 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID Chr Start End \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 12 57751647 57751647 \n", + "1 1019v2 ClinVar 16929 NC_000012.12 12 57751646 57751646 \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12 12717896 12717896 \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12 12718044 12718044 \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12 12718210 12718210 \n", + "... ... ... ... ... ... ... ... \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 15 50490449 50490449 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 15 50490449 50490449 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 20 58418317 58418317 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 20 58418288 58418288 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 20 58418288 58418288 \n", + "\n", + " RefAllele AltAllele \n", + "0 G A \n", + "1 C T \n", + "2 CAGGCGGAGCACCCCAAGCC CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 C T \n", + "4 CTCT CT \n", + "... ... ... \n", + "1417 C G \n", + "1418 C T \n", + "1419 C T \n", + "1420 C G \n", + "1421 C T \n", + "\n", + "[1607 rows x 9 columns]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_dbsnp" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "8207bc04-9787-4621-818e-7e9cc17770ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2COSM1677139ENST00000312990.10c.70C>T125775164857751648GA
11019v2COSM1677139ENST00000549606.5c.-158+527C>T125775164857751648GA
21019v2COSM1677139ENST00000257904.10c.70C>T125775164857751648GA
31019v2COSM1989836ENST00000312990.10c.71G>A125775164757751647CT
41019v2COSM1989836ENST00000549606.5c.-158+528G>A125775164757751647CT
.................................
1134999v2COSM4766271ENST00000612417.4c.662A>G166880882368808823AG
1135999v2COSM4766271ENST00000611625.4c.662A>G166880882368808823AG
1136999v2COSM4766271ENST00000422392.6c.662A>G166880882368808823AG
1137999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
1138999v2COSM4766271ENST00000261769.9c.662A>G166880882368808823AG
\n", + "

1139 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr Start \\\n", + "0 1019v2 COSM 1677139 ENST00000312990.10 c.70C>T 12 57751648 \n", + "1 1019v2 COSM 1677139 ENST00000549606.5 c.-158+527C>T 12 57751648 \n", + "2 1019v2 COSM 1677139 ENST00000257904.10 c.70C>T 12 57751648 \n", + "3 1019v2 COSM 1989836 ENST00000312990.10 c.71G>A 12 57751647 \n", + "4 1019v2 COSM 1989836 ENST00000549606.5 c.-158+528G>A 12 57751647 \n", + "... ... ... ... ... ... .. ... \n", + "1134 999v2 COSM 4766271 ENST00000612417.4 c.662A>G 16 68808823 \n", + "1135 999v2 COSM 4766271 ENST00000611625.4 c.662A>G 16 68808823 \n", + "1136 999v2 COSM 4766271 ENST00000422392.6 c.662A>G 16 68808823 \n", + "1137 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 68808823 \n", + "1138 999v2 COSM 4766271 ENST00000261769.9 c.662A>G 16 68808823 \n", + "\n", + " End RefAllele AltAllele \n", + "0 57751648 G A \n", + "1 57751648 G A \n", + "2 57751648 G A \n", + "3 57751647 C T \n", + "4 57751647 C T \n", + "... ... ... ... \n", + "1134 68808823 A G \n", + "1135 68808823 A G \n", + "1136 68808823 A G \n", + "1137 68808823 A G \n", + "1138 68808823 A G \n", + "\n", + "[1139 rows x 10 columns]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_final = cosm_final.drop(columns={\"Gene\",\"COSMID\",\"AAChange\",\"Strand\"})\n", + "cosm_final" + ] + }, + { + "cell_type": "markdown", + "id": "e9c0fd4d-14e1-4018-b848-678717d265f0", + "metadata": {}, + "source": [ + "**Final Concatenation**" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "ced3d93a-e5a8-4283-8925-8257540a5e99", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = pd.concat([cosm_final,clinvar_dbsnp])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "8e2d6e3e-38ca-4aef-bc32-7f7ea3f45126", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2COSM1677139ENST00000312990.10c.70C>T125775164857751648GA
11019v2COSM1677139ENST00000549606.5c.-158+527C>T125775164857751648GA
21019v2COSM1677139ENST00000257904.10c.70C>T125775164857751648GA
31019v2COSM1989836ENST00000312990.10c.71G>A125775164757751647CT
41019v2COSM1989836ENST00000549606.5c.-158+528G>A125775164757751647CT
.................................
14179101v1dbSNPrs672601311NC_000015.10NaN155049044950490449CG
14189101v1dbSNPrs672601311NC_000015.10NaN155049044950490449CT
14199217v1OmimVar605704NC_000020.11NaN205841831758418317CT
14209217v1OmimVar605704NC_000020.11NaN205841828858418288CG
14219217v1OmimVar605704NC_000020.11NaN205841828858418288CT
\n", + "

2746 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr \\\n", + "0 1019v2 COSM 1677139 ENST00000312990.10 c.70C>T 12 \n", + "1 1019v2 COSM 1677139 ENST00000549606.5 c.-158+527C>T 12 \n", + "2 1019v2 COSM 1677139 ENST00000257904.10 c.70C>T 12 \n", + "3 1019v2 COSM 1989836 ENST00000312990.10 c.71G>A 12 \n", + "4 1019v2 COSM 1989836 ENST00000549606.5 c.-158+528G>A 12 \n", + "... ... ... ... ... ... .. \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 NaN 15 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 NaN 15 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "\n", + " Start End RefAllele AltAllele \n", + "0 57751648 57751648 G A \n", + "1 57751648 57751648 G A \n", + "2 57751648 57751648 G A \n", + "3 57751647 57751647 C T \n", + "4 57751647 57751647 C T \n", + "... ... ... ... ... \n", + "1417 50490449 50490449 C G \n", + "1418 50490449 50490449 C T \n", + "1419 58418317 58418317 C T \n", + "1420 58418288 58418288 C G \n", + "1421 58418288 58418288 C T \n", + "\n", + "[2746 rows x 10 columns]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_data" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "3bec1451-dfd8-4597-b7a7-6b1ec5d70b13", + "metadata": {}, + "outputs": [], + "source": [ + "final_data.to_csv(\"all_variant_data.tsv\",sep='\\t',index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a4723ff8-9848-4f96-8461-2175e986a8f2", + "metadata": {}, + "source": [ + "In Excel removed duplicates based on the same Variant ID, Chromosome number, ref allele and alt allele\n", + "\n", + "After removing 1 lines from manual inspection, I am left with 761 variants and their associated variant ids" + ] + }, + { + "cell_type": "markdown", + "id": "1f3c322f-17b8-4ab9-8a8f-a7b60ea73ab0", + "metadata": {}, + "source": [ + "# Variant 
ID to Network" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "48fff44f-22ab-4c30-a24b-2bc029e72463", + "metadata": {}, + "outputs": [], + "source": [ + "gene_variant = pd.read_csv(\"gene_variants.tsv\", sep='\\t', names=['Network','ENTRY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "bba198c9-a63e-466b-871e-b0ee30f84e56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkENTRY
0N0000225v1
1N0000225v2
2N000033815v1
3N000042322v1
4N000042322v2
.........
323N017142760v1
324N018095052v1
325N018737428v3
326N018763084v1
327N018772066v1
\n", + "

328 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Network ENTRY\n", + "0 N00002 25v1\n", + "1 N00002 25v2\n", + "2 N00003 3815v1\n", + "3 N00004 2322v1\n", + "4 N00004 2322v2\n", + ".. ... ...\n", + "323 N01714 2760v1\n", + "324 N01809 5052v1\n", + "325 N01873 7428v3\n", + "326 N01876 3084v1\n", + "327 N01877 2066v1\n", + "\n", + "[328 rows x 2 columns]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_variant" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "8d2f02f0-bd56-4693-88c7-2f0124a12fa4", + "metadata": {}, + "outputs": [], + "source": [ + "all_variant_data = pd.read_csv(\"all_variant_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "707e21dc-85b0-48da-9d2e-b20c1351d035", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2ClinVar16929NC_000012.12NaN125775164657751646CT
11019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CA
21019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CG
31019v2ClinVar16928NC_000012.12NaN125775164757751647GA
41019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GC
.................................
7569817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CA
7579817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TC
758999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
759999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TG
760999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GA
\n", + "

761 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr Start \\\n", + "0 1019v2 ClinVar 16929 NC_000012.12 NaN 12 57751646 \n", + "1 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 57751646 \n", + "2 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 57751646 \n", + "3 1019v2 ClinVar 16928 NC_000012.12 NaN 12 57751647 \n", + "4 1019v2 dbSNP rs11547328 NC_000012.12 NaN 12 57751647 \n", + ".. ... ... ... ... ... ... ... \n", + "756 9817v1 COSM 6196635 ENST00000393623.6 c.706G>T 19 10492196 \n", + "757 9817v1 COSM 6196637 ENST00000393623.6 c.548A>G 19 10499486 \n", + "758 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 68808823 \n", + "759 999v2 COSM 4766211 ENST00000621016.4 c.755T>G 16 68810264 \n", + "760 999v2 COSM 1379150 ENST00000621016.4 c.769G>A 16 68810278 \n", + "\n", + " End RefAllele AltAllele \n", + "0 57751646 C T \n", + "1 57751646 C A \n", + "2 57751646 C G \n", + "3 57751647 G A \n", + "4 57751647 G C \n", + ".. ... ... ... \n", + "756 10492196 C A \n", + "757 10499486 T C \n", + "758 68808823 A G \n", + "759 68810264 T G \n", + "760 68810278 G A \n", + "\n", + "[761 rows x 10 columns]" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "fcc506c3-c957-4e8a-acbd-bdb0c9dc6318", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data_together_wo_nt = all_variant_data.merge(gene_variant, on=\"ENTRY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "e679f511-77da-40c1-9f5e-25162fd7f714", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data_together_wo_nt.to_csv(\"variant_data_together_wo_nt.tsv\", sep='\\t',index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1cf263d2-a41b-422c-b095-4a18184158c6", + "metadata": {}, + "source": [ + "# Parsing Unique Networks and getting Gene Pathway" + ] + }, + { + "cell_type": "code", + 
"execution_count": 1, + "id": "4586fd55-9de0-4d1c-b81f-92bdcce839ec", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d8f896ab-5859-438b-97f9-392c6f7c837b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 182 network_variant_data_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 1 variant_data_together_wo_nt.tsv > network_variant_data.txt\n", + "sort -u network_variant_data.txt > network_variant_data_unique.txt\n", + "sed -i '' '/Network/d' network_variant_data_unique.txt\n", + "wc -l network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a1fa0c9-94a0-40f7-831a-557532512878", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q ENTRY network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3bd437dc-7caa-4b30-9587-84397015be0f", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q NAME network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "94f49e6f-eea7-4eb3-b495-23bc0593633d", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q DEFINITION network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "baf9f804-fc45-4560-8bbc-fe9e43cebb09", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! 
grep -q EXPANDED network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "931168b5-9a73-4dcb-ad78-fcc41a911503", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N00302\n", + "N00303\n", + "N00304\n", + "N00305\n", + "N00600\n", + "N00643\n", + "N00679\n", + "N00789\n", + "N01064\n", + "N01065\n", + "N01419\n", + "N01422\n", + "N01444\n", + "N01714\n" + ] + } + ], + "source": [ + "while read p; do\n", + " if ! grep -q PATHWAY network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dd3b8bb6-0b8d-43f9-8241-f093e6b7a063", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q CLASS network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e2d1d5ad-9662-4c7a-abe2-bf155a6e0257", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N01683\n", + "N01689\n", + "N01697\n", + "N01698\n", + "N01699\n", + "N01700\n", + "N01702\n", + "N01704\n", + "N01714\n" + ] + } + ], + "source": [ + "while read p; do\n", + " if ! grep -q DISEASE network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d8eeded0-fda2-4a90-9dcb-b4cb841d77b9", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! 
grep -q GENE network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bad6343d-6c08-4738-9902-ee17f3832b40", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' '/N01683/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01689/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01697/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01698/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01699/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01700/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01702/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01704/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01714/d' network_variant_data_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "eab3e1bd-3725-4037-839d-ed06e02eff4c", + "metadata": {}, + "source": [ + "Networks without a disease tag and thus without a ground truth paragraph" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3a05997d-80bf-4f89-9630-7adf8b6b2866", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 173 network_variant_data_unique.txt\n" + ] + } + ], + "source": [ + "wc -l network_variant_data_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "30578490-ad11-4bed-b683-80fa41f8c41e", + "metadata": {}, + "source": [ + "**Switch to python**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b58d998-6919-4961-a18e-89a5dfb96d1c", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6a0904c9-366a-48b5-9291-efc09039478f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "85bd0c1f-cc3a-4fed-94f0-e2dd7d5bb598", + "metadata": {}, + "outputs": 
[], + "source": [ + "# Define column structure\n", + "network_info = pd.DataFrame(columns=[\"Entry\", \"Name\", \"Definition\", \"Expanded\", \"Pathway\", \"Class\", \"Disease\", \"Gene\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "f568e7e9-d28c-44b5-8224-fefbd31735bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Read all variant IDs\n", + "with open('network_variant_data_unique.txt', 'r') as f:\n", + " network_var_id = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Function to extract single-line values (handles leading whitespace too)\n", + "def get_single_line_value(lines, key):\n", + " for line in lines:\n", + " if line.lstrip().startswith(key):\n", + " return line.split(key, 1)[-1].strip()\n", + " return \"\"\n", + "\n", + "# Function to extract multiline values that follow a key line (indented lines)\n", + "def get_multiline_values(lines, key):\n", + " values = []\n", + " recording = False\n", + " for i, line in enumerate(lines):\n", + " if line.startswith(key):\n", + " # Capture first line's content after the key\n", + " initial_value = line[len(key):].strip()\n", + " if initial_value:\n", + " values.append(initial_value)\n", + " recording = True\n", + " continue\n", + " if recording:\n", + " if re.match(r'^\\s{2,}', line): # line starts with 2+ spaces\n", + " values.append(line.strip())\n", + " else:\n", + " break # stop when indentation breaks\n", + " return \"| \".join(values)\n", + "\n", + "# Process each network_variant file\n", + "for variant_id in network_var_id:\n", + " file_path = f'network_variant/{variant_id}.txt'\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " row = {\n", + " \"Entry\": variant_id,\n", + " \"Name\": get_single_line_value(lines, \"NAME\"),\n", + " \"Definition\": get_single_line_value(lines, \"DEFINITION\"),\n", + " \"Expanded\": get_single_line_value(lines, \"EXPANDED\"),\n", + " \"Pathway\": 
get_multiline_values(lines, \"PATHWAY\"),\n", + " \"Class\": get_multiline_values(lines, \"CLASS\"),\n", + " \"Disease\": get_multiline_values(lines, \"DISEASE\"),\n", + " \"Gene\": get_multiline_values(lines, \"GENE\")\n", + " }\n", + "\n", + " network_info = pd.concat([network_info, pd.DataFrame([row])], ignore_index=True)\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "ed1fef16-cf95-44ef-bfd7-552c631b725e", + "metadata": {}, + "outputs": [], + "source": [ + "network_info = network_info.set_index('Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "a2188680-0f9d-4e7f-89f1-9dc5aa95094f", + "metadata": {}, + "outputs": [], + "source": [ + "no_pathway = [\"N00302\",\"N00303\",\"N00304\",\"N00305\",\"N00600\",\"N00643\",\"N00679\",\"N00789\",\"N01064\",\"N01065\",\"N01419\",\"N01422\",\"N01444\"]\n", + "for id in no_pathway:\n", + " network_info.at[id, 'Pathway'] = pd.NA" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "194ccc3a-54c5-483b-90d7-b5bcda4bdfe4", + "metadata": {}, + "outputs": [], + "source": [ + "network_info = network_info.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "e3d1de46-c7c8-4615-b9ff-2d4d08e979f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Columns to process\n", + "cols_to_clean = [\"Pathway\", \"Class\", \"Disease\",\"Gene\"]\n", + "\n", + "def extract_data(cell):\n", + " if pd.isna(cell):\n", + " return cell # Leave NaN as is\n", + " gene_dict = {}\n", + " for part in cell.split(\"|\"):\n", + " tokens = part.strip().split()\n", + " if len(tokens) >= 2:\n", + " gene_dict[tokens[0]] = ' '.join(tokens[1:])\n", + " elif len(tokens) == 1:\n", + " gene_dict[tokens[0]] = \"\"\n", + " return gene_dict\n", + "\n", + "# Apply the transformation to each column\n", + "for col in cols_to_clean:\n", + " network_info[col] = 
network_info[col].apply(extract_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "72a6992e-def7-4ada-abc6-080c31cec3fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryNameDefinitionExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'Chronic myeloid leukemia'}{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia'}{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia'}{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma', 'H00021...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'Non-small cell lung cancer'}{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1<NA>{'nt06027': 'Purine salvage pathway'}{'H00194': 'Lesch-Nyhan syndrome'}{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)<NA>{'nt06505': 'WNT signaling'}{'H00485': 'Robinow syndrome'}{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'Methylmalonic aciduria and homocys...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'Renal cell carcinoma', 'H00559': '...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'Amyotrophic lateral sclerosis (ALS)'}{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Entry Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 \n", + "169 \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'Chronic myeloid leukemia'} \n", + "1 {'H00003': 'Acute myeloid leukemia'} \n", + "2 {'H00003': 'Acute myeloid leukemia'} \n", + "3 {'H00048': 'Hepatocellular carcinoma', 'H00021... \n", + "4 {'H00014': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 {'H00194': 'Lesch-Nyhan syndrome'} \n", + "169 {'H00485': 'Robinow syndrome'} \n", + "170 {'H02221': 'Methylmalonic aciduria and homocys... \n", + "171 {'H00021': 'Renal cell carcinoma', 'H00559': '... \n", + "172 {'H00058': 'Amyotrophic lateral sclerosis (ALS)'} \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... \n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... 
\n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "4844c8c8-7efc-4b21-ba7a-bd6eef0a7cf3", + "metadata": {}, + "outputs": [], + "source": [ + "network_info.to_csv(\"network_variant_final_info.tsv\",sep='\\t', header=True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "c432ed92-f45d-4893-8666-a71fa6256076", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['H00003', 'H00004', 'H00013', 'H00014', 'H00018', 'H00019', 'H00020', 'H00021', 'H00022', 'H00024', 'H00026', 'H00031', 'H00032', 'H00033', 'H00034', 'H00038', 'H00039', 'H00042', 'H00048', 'H00056', 'H00057', 'H00058', 'H00059', 'H00061', 'H00063', 'H00126', 'H00135', 'H00194', 'H00195', 'H00246', 'H00247', 'H00251', 'H00260', 'H00423', 'H00485', 'H00559', 'H01032', 'H01102', 'H01398', 'H01431', 'H01522', 'H01603', 'H02049', 'H02221']\n" + ] + } + ], + "source": [ + "all_disease_keys = []\n", + "\n", + "for disease in network_info['Disease']:\n", + " if isinstance(disease, dict):\n", + " all_disease_keys.extend(disease.keys())\n", + "\n", + "unique_disease_keys = sorted(set(all_disease_keys))\n", + "print(unique_disease_keys)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "9b0a7a42-fef6-4300-9272-3973be631880", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "disease_dict = {}\n", + "\n", + "for disease in 
unique_disease_keys:\n", + " try:\n", + " # Run the shell command and capture output\n", + " result = subprocess.run(\n", + " f\"kegg_pull rest get {disease} | grep DESCRIPTION\",\n", + " shell=True,\n", + " capture_output=True,\n", + " text=True\n", + " )\n", + " # Save the stdout (if grep found something)\n", + " if result.stdout:\n", + " disease_dict[disease] = result.stdout.strip()\n", + " else:\n", + " disease_dict[disease] = None # or \"DESCRIPTION not found\"\n", + " except Exception as e:\n", + " disease_dict[disease] = f\"Error: {str(e)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "7ce66864-aa4d-47f3-9843-b2a96d2e188b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'H00003': 'DESCRIPTION Acute myeloid leukemia (AML) is a disease that is characterized by uncontrolled proliferation of clonal neoplastic cells and accumulation in the bone marrow of blasts with an impaired differentiation program. AML accounts for approximately 80% of all adult leukemias and remains the most common cause of leukemia death. Two major types of genetic events have been described that are crucial for leukemic transformation. A proposed necessary first event is disordered cell growth and upregulation of cell survival genes. The most common of these activating events were observed in the RTK Flt3, in N-Ras and K-Ras, in Kit, and sporadically in other RTKs. Alterations in myeloid transcription factors governing hematopoietic differentiation provide second necessary event for leukemogenesis. Transcription factor fusion proteins such as PML-RARalpha (in Acute promyelocytic leukemia, a subtype of AML), AML-ETO or PLZF-RARalpha block myeloid cell differentiation by repressing target genes. In other cases, the transcription factors themselves are mutated.',\n", + " 'H00004': 'DESCRIPTION Chronic myeloid leukemia (CML) is a clonal myeloproliferative disorder of a pluripotent stem cell. 
The natural history of CML has a triphasic clinical course comprising of an initial chronic phase (CP), which is characterized by expansion of functionally normal myeloid cells, followed by an accelerated phase (AP) and finally a more aggressive blast phase (BP), with loss of terminal differentiation capacity. On the cellular level, CML is associated with a specific chromosome abnormality, the t(9; 22) reciprocal translocation that forms the Philadelphia (Ph) chromosome. The Ph chromosome is the result of a molecular rearrangement between the c-ABL proto-oncogene on chromosome 9 and the BCR (breakpoint cluster region) gene on chromosome 22. The BCR/ABL fusion gene encodes p210 BCR/ABL, an oncoprotein, which, unlike the normal p145 c-Abl, has constitutive tyrosine kinase activity and is predominantly localized in the cytoplasm. While fusion of c-ABL and BCR is believed to be the primary cause of the chronic phase of CML, progression to blast crisis requires other molecular changes. Common secondary abnormalities include mutations in TP53, RB, and p16/INK4A, or overexpression of genes such as EVI1. Additional chromosome translocations are also observed,such as t(3;21)(q26;q22), which generates AML1-EVI1.',\n", + " 'H00013': 'DESCRIPTION Lung cancer is a leading cause of cancer death among men and women in industrialized countries. Small cell lung carcinoma (SCLC) is a highly aggressive neoplasm, which accounts for approximately 25% of all lung cancer cases. Molecular mechanisms altered in SCLC include induced expression of oncogene, MYC, and loss of tumorsuppressor genes, such as p53, PTEN, RB, and FHIT. The overexpression of MYC proteins in SCLC is largely a result of gene amplification. Such overexpression leads to more rapid proliferation and loss of terminal differentiation. Mutation or deletion of p53 or PTEN can lead to more rapid proliferation and reduced apoptosis. 
The retinoblastoma gene RB1 encodes a nuclear phosphoprotein that helps to regulate cell-cycle progression. The fragile histidine triad gene FHIT encodes the enzyme diadenosine triphosphate hydrolase, which is thought to have an indirect role in proapoptosis and cell-cycle control.',\n", + " 'H00014': 'DESCRIPTION Lung cancer is a leading cause of cancer death among men and women in industrialized countries. Non-small-cell lung cancer (NSCLC) accounts for approximately 85% of lung cancer and represents a heterogeneous group of cancers, consisting mainly of squamous cell (SCC), adeno (AC) and large-cell carcinoma. Molecular mechanisms altered in NSCLC include activation of oncogenes, such as K-RAS, EGFR and EML4-ALK, and inactivation of tumorsuppressor genes, such as p53, p16INK4a, RAR-beta, and RASSF1. Point mutations within the K-RAS gene inactivate GTPase activity and the p21-RAS protein continuously transmits growth signals to the nucleus. Mutations or overexpression of EGFR leads to a proliferative advantage. EML4-ALK fusion leads to constitutive ALK activation, which causes cell proliferation, invasion, and inhibition of apoptosis. Inactivating mutation of p53 can lead to more rapid proliferation and reduced apoptosis. The protein encoded by the p16INK4a inhibits formation of CDK-cyclin-D complexes by competitive binding of CDK4 and CDK6. Loss of p16INK4a expression is a common feature of NSCLC. RAR-beta is a nuclear receptor that bears vitamin-A-dependent transcriptional activity. RASSF1A is able to form heterodimers with Nore-1, an RAS effector. Therefore loss of RASSF1A might shift the balance of RAS activity towards a growth-promoting effect.',\n", + " 'H00018': \"DESCRIPTION Gastric cancer (GC) is one of the world's most common cancers. According to Lauren's histological classification gastric cancer is divided into two distinct histological groups - the intestinal and diffuse types. Several genetic changes have been identified in intestinal-type GC. 
The intestinal metaplasia is characterized by mutations in p53 gene, reduced expression of retinoic acid receptor beta (RAR-beta) and hTERT expression. Gastric adenomas furthermore display mutations in the APC gene, reduced p27 expression and cyclin E amplification. In addition, amplification and overexpression of c-ErbB2, reduced TGF-beta receptor type I (TGFBRI) expression and complete loss of p27 expression are commonly observed in more advanced GC. The main molecular changes observed in diffuse-type GCs include loss of E-cadherin function by mutations in CDH1and amplification of MET and FGFR2F.\",\n", + " 'H00019': \"DESCRIPTION Infiltrating ductal adenocarcinoma is the most common malignancy of the pancreas. When most investigators use the term 'pancreatic cancer' they are referring to pancreatic ductal adenocarcinoma (PDA). Normal duct epithelium progresses to infiltrating cancer through a series of histologically defined precursors. The overexpression of HER-2/neu and activating point mutations in the K-ras gene occur early, inactivation of the p16 gene at an intermediate stage, and the inactivation of p53, SMAD4, and BRCA2 occur relatively late. Activated K-ras engages multiple effector pathways. Although EGF receptors are conventionally regarded as upstream activators of RAS proteins, they can also act as RAS signal transducers via RAS-induced autocrine activation of the EGFR family ligands. Moreover, PDA shows extensive genomic instability and aneuploidy. Telomere attrition and mutations in p53 and BRCA2 are likely to contribute to these phenotypes. Inactivation of the SMAD4 tumour suppressor gene leads to loss of the inhibitory influence of the transforming growth factor-beta signalling pathway.\",\n", + " 'H00020': 'DESCRIPTION Colorectal cancer (CRC) is the second largest cause of cancer-related deaths in Western countries. 
CRC arises from the colorectal epithelium as a result of the accumulation of genetic alterations in defined oncogenes and tumour suppressor genes (TSG). Two major mechanisms of genomic instability have been identified in sporadic CRC progression. The first, known as chromosomal instability (CIN), results from a series of genetic changes that involve the activation of oncogenes such as K-ras and inactivation of TSG such as p53, DCC/Smad4, and APC. The second, known as microsatellite instability (MSI), results from inactivation of the DNA mismatch repair genes MLH1 and/or MSH2 by hypermethylation of their promoter, and secondary mutation of genes with coding microsatellites, such as transforming growth factor receptor II (TGF-RII) and BAX. Hereditary syndromes have germline mutations in specific genes (mutation in the tumour suppressor gene APC on chromosome 5q in FAP, mutated DNA mismatch repair genes in HNPCC).',\n", + " 'H00021': 'DESCRIPTION Renal cell cancer (RCC) accounts for ~3% of human malignancies and its incidence appears to be rising. Although most cases of RCC seem to occur sporadically, an inherited predisposition to renal cancer accounts for 1-4% of cases. RCC is not a single disease, it has several morphological subtypes. Conventional RCC (clear cell RCC) accounts for ~80% of cases, followed by papillary RCC (10-15%), chromophobe RCC (5%), and collecting duct RCC (<1%). Genes potentially involved in sporadic neoplasms of each particular type are VHL, MET, BHD, and FH respectively. In the absence of VHL, hypoxia-inducible factor alpha (HIF-alpha) accumulates, leading to production of several growth factors, including vascular endothelial growth factor and platelet-derived growth factor. Activated MET mediates a number of biological effects including motility, invasion of extracellular matrix, cellular transformation, prevention of apoptosis and metastasis formation. 
Loss of functional FH leads to accumulation of fumarate in the cell, triggering inhibition of HPH and preventing targeted pVHL-mediated degradation of HIF-alpha. BHD mutations cause the Birt-Hogg-Dube syndrome and its associated chromophobe, hybrid oncocytic, and conventional (clear cell) RCC.',\n", + " 'H00022': 'DESCRIPTION The urothelium covers the luminal surface of almost the entire urinary tract, extending from the renal pelvis, through the ureter and bladder, to the proximal urethra. The majority of urothelial carcinoma are bladder carcinomas, and urothelial carcinomas of the renal pelvis and ureter account for only approximately 7% of the total. Urothelial tumours arise and evolve through divergent phenotypic pathways. Some tumours progress from urothelial hyperplasia to low-grade non-invasive superficial papillary tumours. More aggressive variants arise either from flat, high-grade carcinoma in situ (CIS) and progress to invasive tumours, or they arise de novo as invasive tumours. Low-grade papillary tumors frequently show a constitutive activation of the receptor tyrosine kinase-Ras pathway, exhibiting activating mutations in the HRAS and fibroblast growth factor receptor 3 (FGFR3) genes. In contrast, CIS and invasive tumors frequently show alterations in the TP53 and RB genes and pathways. Invasion and metastases are promoted by several factors that alter the tumour microenvironment, including the aberrant expression of E-cadherins (E-cad), matrix metalloproteinases (MMPs), angiogenic factors such as vascular endothelial growth factor (VEGF).',\n", + " 'H00024': 'DESCRIPTION Prostate cancer constitutes a major health problem in Western countries. It is the most frequently diagnosed cancer among men and the second leading cause of male cancer deaths. 
The identification of key molecular alterations in prostate-cancer cells implicates carcinogen defenses (GSTP1), growth-factor-signaling pathways (NKX3.1, PTEN, and p27), and androgens (AR) as critical determinants of the phenotype of prostate-cancer cells. Glutathione S-transferases (GSTP1) are detoxifying enzymes. Cells of prostatic intraepithelial neoplasia, devoid of GSTP1, undergo genomic damage mediated by carcinogens. NKX3.1, PTEN, and p27 regulate the growth and survival of prostate cells in the normal prostate. Inadequate levels of PTEN and NKX3.1 lead to a reduction in p27 levels and to increased proliferation and decreased apoptosis. Androgen receptor (AR) is a transcription factor that is normally activated by its androgen ligand. During androgen withdrawal therapy, the AR signal transduction pathway also could be activated by amplification of the AR gene, by AR gene mutations, or by altered activity of AR coactivators. Through these mechanisms, tumor cells lead to the emergence of androgen-independent prostate cancer.',\n", + " 'H00026': 'DESCRIPTION Endometrial cancer (EC) is the most common gynaecological malignancy and the fourth most common malignancy in women in the developed world after breast, colorectal and lung cancer. Two types of endometrial carcinoma are distinguished with respect to biology and clinical course. Type-I carcinoma is related to hyperestrogenism by association with endometrial hyperplasia, frequent expression of estrogen and progesterone receptors and younger age, whereas type-II carcinoma is unrelated to estrogen, associated with atrophic endometrium, frequent lack of estrogen and progesterone receptors and older age. 
The morphologic differences in these cancers are mirrored in their molecular genetic profile with type I showing defects in DNA-mismatch repair and mutations in PTEN, K-ras, and beta-catenin, and type II showing aneuploidy, p53 mutations, and her2/neu amplification.',\n", + " 'H00031': 'DESCRIPTION Breast cancer is the leading cause of cancer death among women worldwide. The vast majority of breast cancers are carcinomas that originate from cells lining the milk-forming ducts of the mammary gland. The molecular subtypes of breast cancer, which are based on the presence or absence of hormone receptors (estrogen and progesterone subtypes) and human epidermal growth factor receptor-2 (HER2), include: hormone receptor positive and HER2 negative (luminal A subtype), hormone receptor positive and HER2 positive (luminal B subtype), hormone receptor negative and HER2 positive (HER2 positive), and hormone receptor negative and HER2 negative (basal-like or triple-negative breast cancers (TNBCs)). Hormone receptor positive breast cancers are largely driven by the estrogen/ER pathway. In HER2 positive breast tumours, HER2 activates the PI3K/AKT and the RAS/RAF/MAPK pathways, and stimulate cell growth, survival and differentiation. In patients suffering from TNBC, the deregulation of various signalling pathways (Notch, Wnt/beta-catenin, and EGFR) have been confirmed.',\n", + " 'H00032': 'DESCRIPTION Thyroid cancer is the most common endocrine malignancy and accounts for the majority of endocrine cancer- related deaths each year. More than 95% of thyroid carcinomas are derived from follicular cells. Their behavior varies from the indolent growing, well-differentiated papillary and follicular carcinomas (PTC and FTC, respectively) to the extremely aggressive undifferentiated carcinoma (UC). Somatic rearrangements of RET and TRK are almost exclusively found in PTC and may be found in early stages. 
The most distinctive molecular features of FTC are the prominence of aneuploidy and the high prevalence of RAS mutations and PAX8-PPAR{gamma} rearrangements. p53 seems to play a crucial role in the dedifferentiation process of thyroid carcinoma.',\n", + " 'H00033': 'DESCRIPTION Adrenocortical carcinoma (ACC) is a rare endocrine malignancy defined by a heterogeneous clinical presentation, dismal prognosis, and lack of effective therapeutic regimens. The incidence of ACC ranges from 0.5 to 2 cases per million people per year, accounting for 0.02% of all reported cancers. Unfortunately, most patients present with metastatic disease which reduces the 5 year survival rate to less than 10%. Oncogenes and tumor-suppressor genes involved in adrenal carcinomas include mutations in the p53 tumor-suppressor gene and rearrangements of the chromosomal locus 11p15.5 associated with IGF II hyperexpression. Deletions of the ACTH receptor gene have recently been found in undifferentiated adenomas and in aggressive ACCs.',\n", + " 'H00034': 'DESCRIPTION Carcinoid tumors are relatively uncommon neoplasms that nonetheless comprise up to 85% of neuroendocrine gastrointestinal neoplasms. They most frequently occur in the midgut and develop from neuroendocrine cells that are normally and diffusely present in this location. Most carcinoids are sporadic but epidemiological studies report a familial risk. Moreover, carcinoids can occur within the multiple endocrine neoplasia (MEN) syndrome, a rare familiar tumor syndrome in which mutations in the MEN1 gene are manifested. Recently, it has been shown that a majority (78%) of sporadic carcinoids display loss of heterozygosity for markers around the MEN 1 region, thus suggesting involvement of this gene in the pathogenesis of both familial and sporadic carcinoids.',\n", + " 'H00038': 'DESCRIPTION Melanoma is a form of skin cancer that has a poor prognosis and which is on the rise in Western populations. 
Melanoma arises from the malignant transformation of pigment-producing cells, melanocytes. The only known environmental risk factor is exposure to ultraviolet (UV) light and in people with fair skin the risk is greatly increased. Melanoma pathogenesis is also driven by genetic factors. Oncogenic NRAS mutations activate both effector pathways Raf-MEK-ERK and PI3K-Akt. The Raf-MEK-ERK pathway may also be activated via mutations in the BRAF gene. The PI3K-Akt pathway may be activated through loss or mutation of the inhibitory tumor suppressor gene PTEN. These mutations arise early during melanoma pathogenesis and are preserved throughout tumor progression. Melanoma development has been shown to be strongly associated with inactivation of the p16INK4a/cyclin dependent kinases 4 and 6/retinoblastoma protein (p16INK4a/CDK4,6/pRb) and p14ARF/human double minute 2/p53 (p14ARF/HMD2/p53) tumor suppressor pathways. MITF and TP53 are implicated in further melanoma progression.',\n", + " 'H00039': 'DESCRIPTION Cancer of the skin is the most common cancer in Caucasians and basal cell carcinomas (BCC) account for 90% of all skin cancers. The vast majority of BCC cases are sporadic, though there is a rare familial syndrome basal cell nevus syndrome (BCNS, or Gorlin syndrome) that predisposes to development of BCC. In addition, there is strong epidemiological and genetic evidence that demonstrates UV exposure as a risk factor of prime importance. The development of basal cell carcinoma is associated with constitutive activation of sonic hedgehog signaling. The mutations in SMOH, PTCH1, and SHH in BCCs result in continuous activation of target genes. At a cellular level, sonic hedgehog signaling promotes cell proliferation. Mutations in TP53 are also found with high frequency (>50%) in sporadic BCC.',\n", + " 'H00042': 'DESCRIPTION Gliomas are the most common of the primary brain tumors and account for more than 40% of all central nervous system neoplasms. 
Gliomas include tumours that are composed predominantly of astrocytes (astrocytomas), oligodendrocytes (oligodendrogliomas), mixtures of various glial cells (for example,oligoastrocytomas) and ependymal cells (ependymomas). The most malignant form of infiltrating astrocytoma - glioblastoma multiforme (GBM) - is one of the most aggressive human cancers. GBM may develop de novo (primary glioblastoma) or by progression from low-grade or anaplastic astrocytoma (secondary glioblastoma). Primary glioblastomas develop in older patients and typically show genetic alterations (EGFR amplification, p16/INK4a deletion, and PTEN mutations) at frequencies of 24-34%. Secondary glioblastomas develop in younger patients and frequently show overexpression of PDGF and CDK4 as well as p53 mutations (65%) and loss of Rb playing major roles in such transformations. Loss of PTEN has been implicated in both pathways, although it is much more common in the pathogenesis of primary GBM.',\n", + " 'H00048': 'DESCRIPTION Hepatocellular carcinoma (HCC) is a major type of primary liver cancer and one of the rare human neoplasms etiologically linked to viral factors. It has been shown that, after HBV/HCV infection and alcohol or aflatoxin B1 exposure, genetic and epigenetic changes occur. The recurrent mutated genes were found to be highly enriched in multiple key driver signaling processes, including telomere maintenance, TP53, cell cycle regulation, the Wnt/beta-catenin pathway (CTNNB1 and AXIN1), the phosphatidylinositol-3 kinase (PI3K)/AKT/mammalian target of rapamycin (mTOR) pathway. Recent studies using whole-exome sequencing have revealed recurrent mutations in new driver genes involved in the chromatin remodelling (ARID1A and ARID2) and the oxidative stress (NFE2L2) pathways.',\n", + " 'H00056': 'DESCRIPTION Alzheimer disease (AD) is a chronic disorder that slowly destroys neurons and causes serious cognitive disability. 
AD is associated with senile plaques and neurofibrillary tangles (NFTs). Amyloid-beta (Abeta), a major component of senile plaques, has various pathological effects on cell and organelle function. To date genetic studies have revealed four genes that may be linked to autosomal dominant or familial early onset AD (FAD). These four genes include: amyloid precursor protein (APP), presenilin 1 (PS1), presenilin 2 (PS2), and apolipoprotein E (ApoE). All mutations associated with APP and PS proteins can lead to an increase in the production of Abeta peptides, specifically the more amyloidogenic form, Abeta42. It was proposed that Abeta forms Ca2+ permeable pores and binds to and modulates multiple synaptic proteins, including NMDAR, mGluR5, and VGCC, leading to the overfilling of neurons with calcium ions. Consequently, cellular Ca2+ disruptions will lead to neuronal apoptosis, autophagy deficits, mitochondrial abnormality, defective neurotransmission, impaired synaptic plasticity, and neurodegeneration in AD. FAD-linked PS1 mutation downregulates the unfolded protein response and leads to vulnerability to ER stress.',\n", + " 'H00057': 'DESCRIPTION Parkinson disease (PD) is a progressive neurodegenerative movement disorder that results primarily from the death of dopaminergic (DA) neurons in the substantia nigra pars compacta (SNc). Both environmental factors and mutations in familial PD-linked genes such as SNCA, Parkin, DJ-1, PINK1 and LRRK2 are associated with PD pathogenesis. These pathogenic mutations and environmental factors are known to cause disease due to oxidative stress, intracellular Ca2+ homeostasis impairment, mitochondrial dysfunctions and altered protein handling compromising key roles of DA neuronal function and survival. 
The demise of DA neurons located in the SNc leads to a drop in the dopaminergic input to the striatum, which is hypothesized to impede movement by inducing hypo and hyper activity in striatal spiny projection neurons (SPNs) of the direct (dSPNs) and indirect (iSPNs) pathways in the basal ganglia, respectively.',\n", + " 'H00058': 'DESCRIPTION Amyotrophic lateral sclerosis (ALS) is a neurodegenerative disorder characterized by a progressive degeneration of motor neurons in the brain and spinal cord. In 90% of patients, ALS is sporadic, with no clear genetic linkage. On the other hand, the remaining 10% of cases show familial inheritance, with mutations in SOD1, TDP43(TARDBP), FUS, or C9orf72 genes being the most frequent causes. In spite of such difference, familial ALS and sporadic ALS have similarities in their pathological features. Proposed disease mechanisms contributing to motor neuron degeneration in ALS are: impaired proteostasis, aberrant RNA processing, mitochondrial disfunction and oxidative stress, microglia activation, and axonal dysfunction.',\n", + " 'H00059': 'DESCRIPTION Huntington disease (HD) is an autosomal-dominant neurodegenerative disorder that primarily affects medium spiny striatal neurons (MSN). The symptoms are choreiform, involuntary movements, personality changes and dementia. HD is caused by a CAG repeat expansion in the IT15 gene, which results in a long stretch of polyglutamine (polyQ) close to the amino-terminus of the HD protein huntingtin (Htt). Mutant Htt (mHtt) has effects both in the cytoplasm and in the nucleus. Full-length Htt is cleaved by proteases in the cytoplasm, leading to the formation of cytoplasmic and neuritic aggregates. mHtt also alters vesicular transport and recycling, causes cytosolic and mitochondrial Ca2+ overload, triggers endoplasmic reticulum stress through proteasomal dysfunction, and impairs autophagy function, increasing neuronal death susceptibility. 
N-terminal fragments containing the polyQ stretch translocate to the nucleus where they impair transcription and induce neuronal death.',\n", + " 'H00061': 'DESCRIPTION Prion diseases, also termed transmissible spongiform encephalopathies (TSEs), are a group of fatal neurodegenerative diseases that affect humans and a number of other animal species. The etiology of these diseases is thought to be associated with the conversion of a normal protein, PrPC, into an infectious, pathogenic form, PrPSc. The conversion is induced by prion infections (for example, variant Creutzfeldt-Jakob disease (vCJD), iatrogenic CJD, Kuru), mutations (familial CJD, Gerstmann-Straussler-Scheinker syndrome, fatal familial insomnia (FFI)) or unknown factors (sporadic CJD (sCJD)), and is thought to occur after PrPC has reached the plasma membrane or is re-internalized for degradation. The PrPSc form shows greater protease resistance than PrPC and accumulates in affected individuals, often in the form of extracellular plaques. Pathways that may lead to neuronal death comprise oxidative stress, regulated activation of complement, ubiquitin-proteasome and endosomal-lysosomal systems, synaptic alterations and dendritic atrophy, corticosteroid response, and endoplasmic reticulum stress. In addition, the conformational transition could lead to the lost of a beneficial activity of the natively folded protein, PrPC.',\n", + " 'H00063': 'DESCRIPTION The autosomal dominant spinocerebellar ataxias (SCAs) are a group of progressive neurodegenerative diseases characterised by loss of balance and motor coordination due to the primary dysfunction of the cerebellum. 
Compelling evidence points to major aetiological roles for transcriptional dysregulation, protein aggregation and clearance, autophagy, the ubiquitin-proteasome system, alterations of calcium homeostasis, mitochondria defects, toxic RNA gain-of-function mechanisms and eventual cell death with apoptotic features of neurons during SCA disease progression.',\n", + " 'H00126': 'DESCRIPTION Gaucher disease is an autosomal recessive lysosomal storage disorder caused by deficient beta-glucocerebrosidase (glucosylceramidase) activity or saposin C which is an activator of beta-glucocerebrosidase in sphingolipid metabolism. The enzymatic defects lead to the accumulation of glucosylceramide (GC) in lysosomes of affected cells. Despite the fact that Gaucher Disease consists of a phenotype, with varying degrees of severity, it has been sub-divided in three subtypes according to the presence or absence of neurological involvement. The sub-types are Type 1, 2 and 3.',\n", + " 'H00135': 'DESCRIPTION Krabbe disease is an autosomal recessive disorder caused by deficient activity of galactosylceramidase.',\n", + " 'H00194': 'DESCRIPTION Deficiency of hypoxanthine-guanine phosphoribosyltransferase activity is an inborn error of purine metabolism characterized by hyperuricemia with hyperuricosuria and a continuum spectrum of neurological manifestations.',\n", + " 'H00195': 'DESCRIPTION Adenine phosphoribosyltransferase deficiency (APRTD) is an autosomal recessive disorder of purine metabolism and causes urolithiasis due to accumulation of the insoluble purine 2,8-dihydroxyadenine.',\n", + " 'H00246': 'DESCRIPTION Familial hyperparathyroidism (HRPT) is characterized by parathyroid adenoma and hyperplasia with hypersecretion of parathyroid hormone and hypercalcaemia. It is caused by mutation in the HRPT2 (CDC73 or Parafibromin) gene that also causes the hyperparathyroidism-jaw tumor syndrome. 
Sporadic cases are also known to occur with somatic mutations within the MEN1 gene.',\n", + " 'H00247': \"DESCRIPTION Multiple endocrine neoplasias (MEN) are autosomal dominant syndrome which is characterized by the occurrence of tumors involving two or more endocrine glands. Four major forms of MEN are recognized, namely MEN1, MEN2A, MEN2B and MEN4. MEN1, which is also referred as Wermer's syndrome, is characterized by parathyroid adenoma, gastrinoma, and pituitary adenoma. Gastrinomas are the most common type, leading to the Zollinger-Ellison Syndrome (see H01522). MEN2 is characterized by medullary thyroid cancer (MTC) and includes three subtypes: MEN2A (Sipple's syndrome), MEN2B (MEN3) and familial MTC. Patients with MEN2A develop MTC in association with phaeochromocytoma and parathyroid tumors. Patients with MEN2B develop MTC in association with marfanoid habitus, mucosal neuromas, medullated corneal fibers and intestinal autonomic ganglion dysfunction, leading to megacolon. MEN4, also referred to as MENX, appears to have signs and symptoms similar to those of type 1. However MEN4 patients have mutations in other genes. 
The mutations in their responsible genes are found in Each MEN syndrome.\",\n", + " 'H00251': 'DESCRIPTION Thyroid dyshormonogenesis is a genetically heterogeneous group of inherited disorders in the enzymatic cascade of thyroid hormone synthesis that result in congenital hypothyroidism due to genetic defects in the synthesis of thyroid hormones.',\n", + " 'H00260': \"DESCRIPTION Primary pigmented micronodular adrenocortical disease (PPNAD) is a form of ACTH-independent adrenal hyperplasia resulting in endogenous Cushing's syndrome.\",\n", + " 'H00423': 'DESCRIPTION The sphingolipidoses are a group of monogenic inherited diseases caused by defects in the system of lysosomal sphingolipid degradation, with subsequent accumulation of non-degradable storage material in one or more organs.',\n", + " 'H00485': 'DESCRIPTION Robinow syndrome (RS) is a rare genetically heterogeneous condition characterized by hypertelorism, nasal features (large nasal bridge, short upturned nose, and anteverted nares), midface hypoplasia, mesomelic limb shortening, brachydactyly, clinodactyly, micropenis, and short stature. Both autosomal recessive and autosomal dominant inheritance have been described. The phenotypic presentation in both types of RS overlaps; however, subtle variances in the severity of craniofacial, musculoskeletal, cardiovascular, and urogenital characteristics may be present. In general, autosomal recessive RS (RRS) patients have more severe dysmorphology than autosomal dominant RS (DRS), especially in the musculoskeletal system.',\n", + " 'H00559': 'DESCRIPTION von Hippel-Lindau syndrome is an autosomal dominant disorder associated with tumors in the central nervous system and other organs. The most frequent tumors are cerebellar and retinal haemangioblastomas, pancreatic neuroendocrine tumors, renal cell carcinoma, phaeochromocytoma in the adrenal gland, epididymal cystadenoma, and endolymphatic sac tumors. 
Germline inactivation of VHL tumor suppressor protein leads to the upregulation of HIF and promotes to carcinogenesis.',\n", + " 'H01032': 'DESCRIPTION N-acetylglutamate synthase (NAGS) deficiency is a rare inborn error of metabolism affecting ammonia detoxification in the urea cycle. The N-acetylglutamate is the absolutely required allosteric activator of the first urea cycle enzyme carbamoylphosphate synthetase 1 (CPS1). In defects of NAGS, the urea cycle function can be severely affected resulting in fatal hyperammonemia in neonatal patients or at any later stage in life. Clinical features of NAGS deficiency include poor feeding, vomiting, altered level of consciousness, seizures, and coma.',\n", + " 'H01102': 'DESCRIPTION Pituitary adenomas are an important and frequently occurring form of intracranial tumor. They are usually benign but can give rise to severe clinical syndromes due to hormonal excess, or to visual/cranial disturbances due to mass effect. The tumor can be clinically nonfunctioning or hormone secreting. Among the latter, prolactin (PRL) and growth hormone (GH)-secreting adenomas are the most common. The majority of pituitary adenomas arise sporadically, although a subset occurs as component tumors of well-characterized familial cancer syndromes, such as multiple endocrine neoplasia (MEN) [DS:H00247], and Carney complex (CNC) [DS:H01820].',\n", + " 'H01398': 'DESCRIPTION Hyperammonemia is a metabolic condition characterized by elevated levels of ammonia in the blood, and may result in irreversible brain damage if not treated early and thoroughly. Hyperammonemia can be classified into primary or secondary hyperammonemia depending on the underlying pathophysiology. Detoxification of ammonia is mainly accomplished by the urea cycle in periportal hepatocytes. 
If the urea cycle is directly affected by a defect of any of the involved enzymes or transporters, this results in primary hyperammonemia.',\n", + " 'H01431': \"DESCRIPTION Cushing syndrome (CS) is a rare disorder resulting from prolonged exposure to excess glucocorticoids via exogenous and endogenous sources. The typical clinical features of CS are related to hypercortisolism and include accumulation of central fat, moon facies, neuromuscular weakness, osteoporosis or bone fractures, metabolic complications, and mood changes. Traditionally, endogenous CS is classified as adrenocorticotropic hormone (ACTH)-dependent (about 80%) or ACTH- independent (about 20%). Among ACTH-dependent forms, pituitary corticotroph adenoma (Cushing's disease) is most common. Most pituitary tumors are sporadic, resulting from monoclonal expansion of a single mutated cell. Recently recurrent activating somatic driver mutations in the ubiquitin-specific protease 8 gene (USP8) were identified in almost half of corticotroph adenoma. Germline mutations in MEN1 (encoding menin), AIP (encoding aryl-hydrocarbon receptor-interacting protein), PRKAR1A (encoding cAMP-dependent protein kinase type I alpha regulatory subunit) and CDKN1B (encoding cyclin-dependent kinase inhibitor 1B; also known as p27 Kip1) have been identified in familial forms of pituitary adenomas. However, the frequency of familial pituitary adenomas is less than 5% in patients with pituitary adenomas. Among ACTH-independent CS, adrenal adenoma is most common. Rare adrenal causes of CS include primary bilateral macronodular adrenal hyperplasia (BMAH) or primary pigmented nodular adrenocortical disease (PPNAD).\",\n", + " 'H01522': 'DESCRIPTION Zollinger-Ellison syndrome (ZES) is a rare endocrinopathy caused by tumors of the pancreas and duodenum. These tumors, called gastrinomas, release gastrin to produce large amounts of acid that result in severe gastroesophageal peptic ulcer disease and diarrhea. 
Most ZES cases are sporadic, but about over 20 percent are caused by an inherited genetic disorder called multiple endocrine neoplasia type 1 (MEN1) [DS:H00247]. The clinical presentation is not specific for this disease and there is overlap of symptoms similar to those of a peptic ulcer. The most common symptoms include abdominal pain and diarrhea, sometimes accompanied by heartburn, nausea, and weight loss. Peptic ulceration complicated by bleeding is present in 25% of patients, and is more frequently in patients with sporadic ZES than in those with MEN1. In addition, the gastrinomas may be cancerous. The cancer can be spread to other parts of the body, most commonly to regional lymph nodes and the liver. The treatment of the ZES includes surgical removal and medical management of gastric acid hypersecretion for the prevention of malignant transformation and the genesis of complications.',\n", + " 'H01603': 'DESCRIPTION Primary aldosteronism is a clinical syndrome characterized by excess secretion of aldosterone from the adrenal gland. It is manifested by hypertension and hyporeninemia. In the past, hypokalemia was thought to be a mandatory finding in primary aldosteronism. However, later studies confirmed that most patients with primary aldosteronism are normokalemic. The prevalence of primary aldosteronism among nonselected hypertensive persons is between 5% and 13%, and it is now recognized to be the most common form of secondary hypertension. There are the seven subtypes of primary aldosteronism. Aldosterone-producing adenoma (APA) and bilateral idiopathic hyperaldosteronism (IHA) are the most common subtypes of primary aldosteronism. Unilateral adrenal hyperplasia, aldosterone-producing adrenocortical carcinoma, ectopic aldosterone-producing adenoma, and familial hyperaldosteronism (type I and typeII) are unusual subtypes. Somatic mutations in KCNJ5, ATP1A1, ATP2B3, and CACNA1D have been described in APAs. 
Usually, adenomas are managed surgically and bilateral hyperplasia, medically.',\n", + " 'H02049': \"DESCRIPTION Bilateral macronodular adrenal hyperplasia (BMAH) is an adrenal disorder characterized by bilateral benign adrenocortical nodules associated with variable levels of cortisol excess. BMAH is an adrenal cause of Cushing's syndrome (CS). An increased activity of the cAMP/PKA pathway is found in the various forms of BMAH. Actors of the cAMP/PKA signaling pathway or genes causing a hereditary familial tumor syndrome including adenomatous polyposis coli gene (APC), menin (MEN1) and fumarate hydratase (FH) can favor or be responsible for the development of BMAH. Recently, a new gene, ARMC5, was identified as a frequent cause of sporadic or familial BMAH.\",\n", + " 'H02221': 'DESCRIPTION Methylmalonic aciduria and homocystinuria (MAHC) is caused by defects of intracellular cobalamin (vitamin B12) metabolism. Derivatives of cobalamin are essential cofactors for enzymes required in intermediary metabolism, and its defects lead to the accumulation of methylmalonic acid and/or homocysteine in blood and urine. 
Affected persons present with multisystem clinical abnormalities, including developmental, hematologic, neurologic, and metabolic findings.'}" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "disease_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "97d091ec-097c-4028-bcaa-5c3e01ff0d01", + "metadata": {}, + "outputs": [], + "source": [ + "# Columns to process\n", + "cols_to_edit = [\"Disease\"]\n", + "\n", + "def put_disease_data(cell):\n", + " if pd.isna(cell):\n", + " return cell # Leave NaN as is\n", + " gene_dict = {}\n", + " for key in cell.keys():\n", + " gene_dict[key] = disease_dict[key]\n", + " return gene_dict\n", + "\n", + "# Apply the transformation to each column\n", + "for col in cols_to_edit:\n", + " network_info[col] = network_info[col].apply(put_disease_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "05257651-5f54-4d05-aa23-b04c1a3f85f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryNameDefinitionExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'DESCRIPTION Chronic myeloid leukem...{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'DESCRIPTION Acute myeloid leukemia...{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'DESCRIPTION Acute myeloid leukemia...{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'DESCRIPTION Hepatocellular carcino...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'DESCRIPTION Lung cancer is a leadi...{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1<NA>{'nt06027': 'Purine salvage pathway'}{'H00194': 'DESCRIPTION Deficiency of hypoxant...{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)<NA>{'nt06505': 'WNT signaling'}{'H00485': 'DESCRIPTION Robinow syndrome (RS) ...{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'DESCRIPTION Methylmalonic aciduria...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'DESCRIPTION Renal cell cancer (RCC...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'DESCRIPTION Amyotrophic lateral sc...{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Entry Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 \n", + "169 \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'DESCRIPTION Chronic myeloid leukem... \n", + "1 {'H00003': 'DESCRIPTION Acute myeloid leukemia... \n", + "2 {'H00003': 'DESCRIPTION Acute myeloid leukemia... \n", + "3 {'H00048': 'DESCRIPTION Hepatocellular carcino... \n", + "4 {'H00014': 'DESCRIPTION Lung cancer is a leadi... \n", + ".. ... \n", + "168 {'H00194': 'DESCRIPTION Deficiency of hypoxant... \n", + "169 {'H00485': 'DESCRIPTION Robinow syndrome (RS) ... \n", + "170 {'H02221': 'DESCRIPTION Methylmalonic aciduria... \n", + "171 {'H00021': 'DESCRIPTION Renal cell cancer (RCC... \n", + "172 {'H00058': 'DESCRIPTION Amyotrophic lateral sc... \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... 
\n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... \n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "0cad7b6f-d863-49f9-b0a2-644da8beb947", + "metadata": {}, + "outputs": [], + "source": [ + "network_info.to_csv(\"network_variant_final_info.tsv\",sep='\\t', header=True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "3a556f82-3468-44eb-be31-5e9bedf59c70", + "metadata": {}, + "outputs": [], + "source": [ + "!sed -i '' 's/DESCRIPTION //g' network_variant_final_info.tsv" + ] + }, + { + "cell_type": "markdown", + "id": "a34eb400-5a7d-41c2-b2be-5bb9a3febf57", + "metadata": {}, + "source": [ + "# Final Merge of Variant Data with Network Data" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "83d484dd-69d7-4e50-9454-9369223f1dd2", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data = pd.read_csv(\"variant_data_together_wo_nt.tsv\", sep='\\t')\n", + "network_info = pd.read_csv(\"network_variant_final_info.tsv\",sep='\\t')\n", + "network_info = network_info.rename(columns={\"Entry\":\"Network\", \"Definition\":\"Network Definition\",\"Expanded\":\"Network Expanded\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "63f214f1-e32a-4275-a037-554fd89409aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
0N000731019v2ClinVar16929NC_000012.12NaN125775164657751646CT
1N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CA
2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CG
3N000731019v2ClinVar16928NC_000012.12NaN125775164757751647GA
4N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GC
....................................
1506N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CA
1507N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TC
1508N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
1509N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TG
1510N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GA
\n", + "

1511 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Network ENTRY Source ID TranscriptID NucChange Chr \\\n", + "0 N00073 1019v2 ClinVar 16929 NC_000012.12 NaN 12 \n", + "1 N00073 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 \n", + "2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 \n", + "3 N00073 1019v2 ClinVar 16928 NC_000012.12 NaN 12 \n", + "4 N00073 1019v2 dbSNP rs11547328 NC_000012.12 NaN 12 \n", + "... ... ... ... ... ... ... ... \n", + "1506 N00244 9817v1 COSM 6196635 ENST00000393623.6 c.706G>T 19 \n", + "1507 N00244 9817v1 COSM 6196637 ENST00000393623.6 c.548A>G 19 \n", + "1508 N00258 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 \n", + "1509 N00258 999v2 COSM 4766211 ENST00000621016.4 c.755T>G 16 \n", + "1510 N00258 999v2 COSM 1379150 ENST00000621016.4 c.769G>A 16 \n", + "\n", + " Start End RefAllele AltAllele \n", + "0 57751646 57751646 C T \n", + "1 57751646 57751646 C A \n", + "2 57751646 57751646 C G \n", + "3 57751647 57751647 G A \n", + "4 57751647 57751647 G C \n", + "... ... ... ... ... \n", + "1506 10492196 10492196 C A \n", + "1507 10499486 10499486 T C \n", + "1508 68808823 68808823 A G \n", + "1509 68810264 68810264 T G \n", + "1510 68810278 68810278 G A \n", + "\n", + "[1511 rows x 11 columns]" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "a681f1fb-b921-4ec3-b9cb-43df32fe9ef8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkNameNetwork DefinitionNetwork ExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'Chronic myeloid leukemia (CML) is ...{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia (AML) is a ...{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia (AML) is a ...{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'Lung cancer is a leading cause of ...{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1NaN{'nt06027': 'Purine salvage pathway'}{'H00194': 'Deficiency of hypoxanthine-guanine...{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)NaN{'nt06505': 'WNT signaling'}{'H00485': 'Robinow syndrome (RS) is a rare ge...{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'Methylmalonic aciduria and homocys...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'Renal cell cancer (RCC) accounts f...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'Amyotrophic lateral sclerosis (ALS...{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Network Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Network Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Network Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 NaN \n", + "169 NaN \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'Chronic myeloid leukemia (CML) is ... \n", + "1 {'H00003': 'Acute myeloid leukemia (AML) is a ... \n", + "2 {'H00003': 'Acute myeloid leukemia (AML) is a ... \n", + "3 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "4 {'H00014': 'Lung cancer is a leading cause of ... \n", + ".. ... \n", + "168 {'H00194': 'Deficiency of hypoxanthine-guanine... \n", + "169 {'H00485': 'Robinow syndrome (RS) is a rare ge... \n", + "170 {'H02221': 'Methylmalonic aciduria and homocys... \n", + "171 {'H00021': 'Renal cell cancer (RCC) accounts f... \n", + "172 {'H00058': 'Amyotrophic lateral sclerosis (ALS... \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... 
\n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... \n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "ff9b9542-754c-414a-82c9-4eb8409b19b5", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = variant_data.merge(network_info, on='Network')" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "5f1e15c8-49f3-4be8-9f4e-e3f73e30f01c", + "metadata": {}, + "outputs": [], + "source": [ + "final_data.to_csv(\"final_network_with_variant.tsv\",sep='\\t',header=True, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "99191ac9-875f-4e5c-89d6-6382b29a9564", + "metadata": {}, + "source": [ + "# Extracting Human Chromosomes" + ] + }, + { + "cell_type": "markdown", + "id": "fb75a991-a908-48f1-8615-3099ab06ac66", + "metadata": {}, + "source": [ + "Downloaded the human genome from here https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.26/" + ] + }, + { + "cell_type": "markdown", + "id": "048f0501-7a19-4a8f-8a21-5e975f26135b", + "metadata": {}, + "source": [ + "Got all the chromosomes and their ids that we have variants for\n", + "\n", + "NC_000001.11\n", + "NC_000002.12\n", + "NC_000003.12\n", + "NC_000004.12\n", + "NC_000005.10\n", + "NC_000006.12\n", + "NC_000007.14\n", + "NC_000009.12\n", + "NC_000010.11\n", + "NC_000011.10\n", + "NC_000012.12\n", + "NC_000013.11\n", + "NC_000014.9\n", + "NC_000015.10\n", + "NC_000016.10\n", + "NC_000017.11\n", + 
"NC_000018.10\n", + "NC_000019.10\n", + "NC_000020.11\n", + "NC_000021.9\n", + "NC_000023.11\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3d870ed0-b14d-42a8-8d55-b58dc49367f1", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc0c7ced-c649-4695-bc11-9a7bfb87e128", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]\u001b[0m 21 patterns loaded from file\n" + ] + } + ], + "source": [ + "seqkit grep -r -n -f chromosomes.txt /ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna -o chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ef62d80c-f572-4f4e-9443-e3653a178327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "file format type num_seqs sum_len min_len avg_len max_len\n", + "chromosomes.fasta FASTA DNA 21 2,835,085,313 46,709,983 135,004,062.5 248,956,422\n" + ] + } + ], + "source": [ + "seqkit stats chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2452900e-93d3-4707-b2a3-0b94224de2cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.7G\tchromosomes.fasta\n" + ] + } + ], + "source": [ + "du -h chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "460d091d-6f98-4c9e-95a3-137601780652", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NC_000001.11 Homo sapiens chromosome 1, GRCh38 Primary Assembly\n", + "NC_000002.12 Homo sapiens chromosome 2, GRCh38 Primary Assembly\n", + "NC_000003.12 Homo sapiens chromosome 3, GRCh38 Primary Assembly\n", + "NC_000004.12 Homo sapiens chromosome 4, GRCh38 Primary Assembly\n", + "NC_000005.10 Homo sapiens chromosome 5, GRCh38 Primary Assembly\n", + "NC_000006.12 Homo sapiens chromosome 6, 
GRCh38 Primary Assembly\n", + "NC_000007.14 Homo sapiens chromosome 7, GRCh38 Primary Assembly\n", + "NC_000009.12 Homo sapiens chromosome 9, GRCh38 Primary Assembly\n", + "NC_000010.11 Homo sapiens chromosome 10, GRCh38 Primary Assembly\n", + "NC_000011.10 Homo sapiens chromosome 11, GRCh38 Primary Assembly\n", + "NC_000012.12 Homo sapiens chromosome 12, GRCh38 Primary Assembly\n", + "NC_000013.11 Homo sapiens chromosome 13, GRCh38 Primary Assembly\n", + "NC_000014.9 Homo sapiens chromosome 14, GRCh38 Primary Assembly\n", + "NC_000015.10 Homo sapiens chromosome 15, GRCh38 Primary Assembly\n", + "NC_000016.10 Homo sapiens chromosome 16, GRCh38 Primary Assembly\n", + "NC_000017.11 Homo sapiens chromosome 17, GRCh38 Primary Assembly\n", + "NC_000018.10 Homo sapiens chromosome 18, GRCh38 Primary Assembly\n", + "NC_000019.10 Homo sapiens chromosome 19, GRCh38 Primary Assembly\n", + "NC_000020.11 Homo sapiens chromosome 20, GRCh38 Primary Assembly\n", + "NC_000021.9 Homo sapiens chromosome 21, GRCh38 Primary Assembly\n", + "NC_000023.11 Homo sapiens chromosome X, GRCh38 Primary Assembly\n" + ] + } + ], + "source": [ + "seqkit fx2tab chromosomes.fasta | cut -f1" + ] + }, + { + "cell_type": "markdown", + "id": "c923727a-9eae-407f-a85f-fb9317ccd3ce", + "metadata": {}, + "source": [ + "# Creating the Nt Variant Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06b00f03-f71e-4575-80b2-9960be48dba8", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "04b7f027-f8d2-451f-9d1e-b784708079cf", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1ca8532d-3e55-494f-8712-a1ea56c2b96d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAlleleNameNetwork DefinitionNetwork ExpandedPathwayClassDiseaseGene
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646CTMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CAMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CGMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647GAMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GCMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
............................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CAMutation-inactivated KEAP1 to KEAP1-NRF2 signa...KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TCMutation-inactivated KEAP1 to KEAP1-NRF2 signa...KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AGMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TGMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GAMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
\n", + "

1449 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network ENTRY Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End RefAllele AltAllele \\\n", + "0 NaN 12 57751646 57751646 C T \n", + "1 NaN 12 57751646 57751646 C A \n", + "2 NaN 12 57751646 57751646 C G \n", + "3 NaN 12 57751647 57751647 G A \n", + "4 NaN 12 57751647 57751647 G C \n", + "... ... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 C A \n", + "1445 c.548A>G 19 10499486 10499486 T C \n", + "1446 c.662A>G 16 68808823 68808823 A G \n", + "1447 c.755T>G 16 68810264 68810264 T G \n", + "1448 c.769G>A 16 68810278 68810278 G A \n", + "\n", + " Name \\\n", + "0 Mutation-activated CDK4 to cell cycle G1/S \n", + "1 Mutation-activated CDK4 to cell cycle G1/S \n", + "2 Mutation-activated CDK4 to cell cycle G1/S \n", + "3 Mutation-activated CDK4 to cell cycle G1/S \n", + "4 Mutation-activated CDK4 to cell cycle G1/S \n", + "... ... \n", + "1444 Mutation-inactivated KEAP1 to KEAP1-NRF2 signa... \n", + "1445 Mutation-inactivated KEAP1 to KEAP1-NRF2 signa... \n", + "1446 Mutation-inactivated CDH1 to beta-catenin sign... \n", + "1447 Mutation-inactivated CDH1 to beta-catenin sign... \n", + "1448 Mutation-inactivated CDH1 to beta-catenin sign... 
\n", + "\n", + " Network Definition \\\n", + "0 (CCND+CDK4*) -> RB1 // E2F \n", + "1 (CCND+CDK4*) -> RB1 // E2F \n", + "2 (CCND+CDK4*) -> RB1 // E2F \n", + "3 (CCND+CDK4*) -> RB1 // E2F \n", + "4 (CCND+CDK4*) -> RB1 // E2F \n", + "... ... \n", + "1444 KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1) \n", + "1445 KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1) \n", + "1446 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "1447 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "1448 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... 
\n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene \n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... 
\n", + "\n", + "[1449 rows x 19 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data = pd.read_csv(\"final_network_with_variant.tsv\", sep='\\t')\n", + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ae73bfae-91a9-40a9-bfdb-c14b1d3e14ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6e515f9d-b9a6-4a24-bde6-2496a823b9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "488f8ed2-2a5b-4831-a5b7-90f3e049614f", + "metadata": {}, + "outputs": [], + "source": [ + "fasta_file = \"chromosomes.fasta\"\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6c04e6aa-d700-427c-a4ce-cba8225e3024", + "metadata": {}, + "outputs": [], + "source": [ + "chromosome_dictionary = {\n", + " \"1\": \"NC_000001.11\",\n", + " \"2\": \"NC_000002.12\",\n", + " \"3\": \"NC_000003.12\",\n", + " \"4\": \"NC_000004.12\",\n", + " \"5\": \"NC_000005.10\",\n", + " \"6\": \"NC_000006.12\",\n", + " \"7\": \"NC_000007.14\",\n", + " \"9\": \"NC_000009.12\",\n", + " \"10\": \"NC_000010.11\",\n", + " \"11\": \"NC_000011.10\",\n", + " \"12\": \"NC_000012.12\",\n", + " \"13\": \"NC_000013.11\",\n", + " \"14\": \"NC_000014.9\",\n", + " \"15\": \"NC_000015.10\",\n", + " \"16\": \"NC_000016.10\",\n", + " \"17\": \"NC_000017.11\",\n", + " \"18\": \"NC_000018.10\",\n", + " \"19\": \"NC_000019.10\",\n", + " \"20\": \"NC_000020.11\",\n", + " 
\"21\": \"NC_000021.9\",\n", + " \"23\": \"NC_000023.11\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7a3550d7-04a4-44f3-a7d5-b61d30890ef0", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b593ef66-65e3-411a-ac95-a33c9d37667a", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"verification.txt\", \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Adjust for 0-based indexing in Python\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "02044565-4f9c-45f9-b59a-63590b571dd1", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir nt_seq" + ] + }, + { + "cell_type": "markdown", + "id": "361619c9-7b49-45dd-901a-625cf1642535", + "metadata": {}, + "source": [ + "### Performing the mutation and saving the reference and variant allele with a 1000 nt window" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "96392c0b-c3fd-49ee-a2c1-97cef4127617", + "metadata": {}, + "outputs": [], + "source": [ + "for i 
in range(len(variant_data)):\n", + " with open(f\"nt_seq/{variant_data.iloc[i]['Var_ID']}.txt\", \"w\") as f:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " window = 1000\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = chrom_seq[region_start:region_end]\n", + " \n", + " if (variant_allele == \"deletion\"):\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + len(reference_allele):]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + ref_seq[window + del_len:]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e06b86fd-2d31-486e-82ed-80dbb7f3b627", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason-main/data/KEGG_Data_2.ipynb b/BioReason-main/data/KEGG_Data_2.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6a09e6a13f84e999adbc66fbe917bd98e28b553d --- /dev/null +++ b/BioReason-main/data/KEGG_Data_2.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0103d07d", + "metadata": {}, + "source": [ + "# KEGG Data Processing Pipeline - Part 2: Variant Information Parsing and Sequence Generation\n", + "\n", + "## Overview\n", + "\n", + "This notebook is the second part of the KEGG data processing pipeline. It focuses on parsing variant information from KEGG data, generating nucleotide sequences with mutations, and creating disease mapping databases.\n", + "\n", + "## What This Notebook Does\n", + "\n", + "1. **Variant Information Parsing**: Extracts detailed information from KEGG variant files\n", + "2. **Sequence Generation**: Creates reference and variant nucleotide sequences with genomic context\n", + "3. **Disease Mapping**: Downloads and processes KEGG disease information\n", + "4. **Data Integration**: Merges variant data with genomic sequences and disease annotations\n", + "5. 
**Quality Control**: Validates reference sequences against the genome\n", + "\n", + "## Prerequisites\n", + "\n", + "**Required from Part 1 (KEGG_Data_1.ipynb):**\n", + "- `gene_variants.txt` - List of variant identifiers\n", + "- `variant_info/` directory - Individual variant information files\n", + "- `final_network_with_variant.tsv` - Network and variant mapping data\n", + "\n", + "**Additional Requirements:**\n", + "- Reference genome FASTA file (GRCh38)\n", + "- BioPython for sequence processing\n", + "- KEGG_pull for disease information retrieval\n", + "\n", + "## Required Packages\n", + "\n", + "```bash\n", + "pip install biopython pandas kegg-pull\n", + "```\n", + "\n", + "## Input Files Expected\n", + "\n", + "- `gene_variants.txt` - Variant identifiers from Part 1\n", + "- `variant_info/*.txt` - Individual variant information files\n", + "- `chromosomes.fasta` - Reference genome sequences\n", + "- `final_network_with_variant.tsv` - Network-variant mapping\n", + "\n", + "## Output Files Generated\n", + "\n", + "- `nt_seq/` - Directory containing reference and variant sequences\n", + "- `verification.txt` - Quality control results\n", + "- `diseases.txt` - List of disease identifiers\n", + "- `disease_info/` - Disease information files\n", + "- Updated `final_network_with_variant.tsv` with disease names\n", + "\n", + "## Important Notes\n", + "\n", + "- **Memory Usage**: Processing large genomic sequences requires significant RAM\n", + "- **Storage**: Generated sequence files can be several GB in size\n", + "- **Processing Time**: Full pipeline may take several hours depending on dataset size\n", + "- **Dependencies**: Requires successful completion of KEGG_Data_1.ipynb\n", + "\n", + "## Next Steps\n", + "\n", + "After completing this notebook, run `KEGG_Data_3.ipynb` for final dataset creation and sequence integration." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ccc3ca96", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set up paths and parameters for variant processing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28d2629e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Navigate to kegg_data directory\n", + "data_dir = Path('kegg_data')\n", + "if not data_dir.exists():\n", + " print(\"❌ kegg_data directory not found. Please run KEGG_Data_1.ipynb first.\")\n", + " raise FileNotFoundError(\"kegg_data directory missing\")\n", + "\n", + "os.chdir(data_dir)\n", + "\n", + "# Configuration parameters\n", + "CONFIG = {\n", + " # Input files (should exist from Part 1)\n", + " 'gene_variants_file': 'gene_variants.txt',\n", + " 'variant_info_dir': 'variant_info',\n", + " 'network_data_file': 'final_network_with_variant.tsv',\n", + " \n", + " # Reference genome (update path as needed)\n", + " 'reference_fasta': 'chromosomes.fasta', # Update to your reference genome path\n", + " \n", + " # Output directories\n", + " 'nt_seq_dir': 'nt_seq',\n", + " 'disease_info_dir': 'disease_info',\n", + " \n", + " # Processing parameters\n", + " 'sequence_window': 2000, # Nucleotides around variant\n", + " 'verification_file': 'verification.txt',\n", + " 'diseases_file': 'diseases.txt'\n", + "}\n", + "\n", + "# Verify required input files\n", + "required_files = ['gene_variants.txt', 'final_network_with_variant.tsv']\n", + "missing_files = []\n", + "for file in required_files:\n", + " if not os.path.exists(file):\n", + " missing_files.append(file)\n", + "\n", + "if missing_files:\n", + " print(f\"❌ Missing required files: {missing_files}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate these files.\")\n", + "else:\n", + " print(\"✅ All required input files found\")\n", + "\n", + "# Create output directories\n", 
+ "for dir_name in [CONFIG['nt_seq_dir'], CONFIG['disease_info_dir']]:\n", + " Path(dir_name).mkdir(exist_ok=True)\n", + "\n", + "print(f\"Working directory: {os.getcwd()}\")\n", + "print(\"\\n📝 Update CONFIG['reference_fasta'] with path to your reference genome file\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05a4d10-03de-42ae-89c1-5ddbe77043a7", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set in configuration section above\n", + "print(f\"Current working directory: {os.getcwd()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "96662dbb-ee2c-4a74-8e45-ab58a3496976", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' 's/:/_/g' gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "db4f4cf2-cd95-4df8-99b6-cc112857502f", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q NAME variant_info/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "11959296-d5cb-4fb4-9914-83596dd41c86", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q GENE variant_info/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < gene_variants.txt" + ] + }, + { + "cell_type": "markdown", + "id": "784d0394-1a14-471a-9def-f4877b4bbd4e", + "metadata": {}, + "source": [ + "# Pulling Info from the Variant File\n", + "\n", + "# Variant Information Parsing\n", + "\n", + "This section processes individual variant files to extract structured information including variant names, genes, and types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62b4167a-6d5a-4120-99fe-5678227db6cc", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with variant information parsing\n", + "print(f\"Processing variant files from: {CONFIG['variant_info_dir']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed32b62-e3a6-4cff-b4ab-a80f04725a1c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "# Read all file names from gene_variants.txt\n", + "gene_variants_file = CONFIG['gene_variants_file']\n", + "if not os.path.exists(gene_variants_file):\n", + " print(f\"❌ Gene variants file not found: {gene_variants_file}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate this file\")\n", + " raise FileNotFoundError(f\"Gene variants file not found: {gene_variants_file}\")\n", + "\n", + "with open(gene_variants_file, 'r') as f:\n", + " variant_files = [line.strip() for line in f if line.strip()]\n", + "\n", + "print(f\"Processing {len(variant_files)} variant files\")\n", + "\n", + "# Initialize an empty DataFrame to collect the results\n", + "variant_info = pd.DataFrame(columns=[\"Entry\", \"Variant_Name\", \"Variant_Gene\", \"Variant_Gene Info\", \"Variant_Type\"])\n", + "\n", + "# Function to extract the value after a keyword (single line, rest of the line)\n", + "def extract_value(line, key):\n", + " return line.split(key, 1)[-1].strip()\n", + "\n", + "# Process each variant file\n", + "variant_info_dir = Path(CONFIG['variant_info_dir'])\n", + "processed_count = 0\n", + "not_found_count = 0\n", + "\n", + "for file_name in variant_files:\n", + " file_path = variant_info_dir / f\"{file_name}.txt\"\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " name = \"\"\n", + " gene = \"\"\n", + " gene_info = \"\"\n", + " type_info 
= \"\"\n", + "\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line.startswith(\"NAME\"):\n", + " name = extract_value(line, \"NAME\")\n", + " elif line.startswith(\"GENE\"):\n", + " gene_data = extract_value(line, \"GENE\")\n", + " if gene_data:\n", + " parts = gene_data.split(maxsplit=1)\n", + " gene = parts[0]\n", + " gene_info = parts[1] if len(parts) > 1 else \"\"\n", + " elif line.startswith(\"TYPE\"):\n", + " type_info = extract_value(line, \"TYPE\")\n", + "\n", + " row = {\n", + " \"Entry\": file_name,\n", + " \"Variant_Name\": name,\n", + " \"Variant_Gene\": gene,\n", + " \"Variant_Gene Info\": gene_info,\n", + " \"Variant_Type\": type_info\n", + " }\n", + "\n", + " variant_info = pd.concat([variant_info, pd.DataFrame([row])], ignore_index=True)\n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Processed {processed_count}/{len(variant_files)} files...\")\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")\n", + " not_found_count += 1\n", + "\n", + "print(f\"✅ Processing complete: {processed_count} files processed, {not_found_count} files not found\")\n", + "print(f\"Extracted information for {len(variant_info)} variants\")\n", + "\n", + "# Optional: Save the final table\n", + "# variant_info.to_csv(\"parsed_variant_info.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "85e94a07-740d-44cd-a1c3-2330a30b99b1", + "metadata": {}, + "outputs": [], + "source": [ + "variant_info[\"Entry\"] = variant_info[\"Entry\"].str.replace(\"hsa_var_\", \"\", regex=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4fc8fa00-2a28-4bd9-9aed-5c4602969cca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryVariant_NameVariant_GeneVariant_Gene InfoVariant_Type
01019v2CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]
11027v3CDKN1B mutationCDKN1Bcyclin dependent kinase inhibitor 1B [KO:K06624]
210280v1SIGMAR1 mutationSIGMAR1sigma non-opioid intracellular receptor 1 [KO:...
31029v2CDKN2A mutationCDKN2Acyclin dependent kinase inhibitor 2A [KO:K06621]
411315v1PARK7 mutationPARK7Parkinsonism associated deglycase [KO:K05687]
..................
909049v1AIP mutationAIPAHR interacting HSP90 co-chaperone [KO:K17767]
919101v1USP8 mutationUSP8ubiquitin specific peptidase 8 [KO:K11839]
929217v1VAPB mutationVAPBVAMP associated protein B and C [KO:K10707]
939817v1KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]
94999v2CDH1 mutationCDH1cadherin 1 [KO:K05689]
\n", + "

95 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Entry Variant_Name Variant_Gene \\\n", + "0 1019v2 CDK4 mutation CDK4 \n", + "1 1027v3 CDKN1B mutation CDKN1B \n", + "2 10280v1 SIGMAR1 mutation SIGMAR1 \n", + "3 1029v2 CDKN2A mutation CDKN2A \n", + "4 11315v1 PARK7 mutation PARK7 \n", + ".. ... ... ... \n", + "90 9049v1 AIP mutation AIP \n", + "91 9101v1 USP8 mutation USP8 \n", + "92 9217v1 VAPB mutation VAPB \n", + "93 9817v1 KEAP1 mutation KEAP1 \n", + "94 999v2 CDH1 mutation CDH1 \n", + "\n", + " Variant_Gene Info Variant_Type \n", + "0 cyclin dependent kinase 4 [KO:K02089] \n", + "1 cyclin dependent kinase inhibitor 1B [KO:K06624] \n", + "2 sigma non-opioid intracellular receptor 1 [KO:... \n", + "3 cyclin dependent kinase inhibitor 2A [KO:K06621] \n", + "4 Parkinsonism associated deglycase [KO:K05687] \n", + ".. ... ... \n", + "90 AHR interacting HSP90 co-chaperone [KO:K17767] \n", + "91 ubiquitin specific peptidase 8 [KO:K11839] \n", + "92 VAMP associated protein B and C [KO:K10707] \n", + "93 kelch like ECH associated protein 1 [KO:K10456] \n", + "94 cadherin 1 [KO:K05689] \n", + "\n", + "[95 rows x 5 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_info" + ] + }, + { + "cell_type": "markdown", + "id": "485ddbd6", + "metadata": {}, + "source": [ + "# Creating the Nt Variant Database\n", + "\n", + "# Nucleotide Sequence Database Creation\n", + "\n", + "This section creates nucleotide sequences with genomic context around each variant, generating both reference and mutated sequences for downstream analysis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8dba21b", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with nucleotide sequence processing\n", + "print(\"Starting nucleotide variant database creation...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8cf9f795", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bc18349", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "# Load network and variant data\n", + "network_file = CONFIG['network_data_file']\n", + "if not os.path.exists(network_file):\n", + " print(f\"❌ Network data file not found: {network_file}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate this file\")\n", + " raise FileNotFoundError(f\"Network data not found: {network_file}\")\n", + "\n", + "variant_data = pd.read_csv(network_file, sep='\\t')\n", + "print(f\"✅ Loaded variant data: {len(variant_data)} entries\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "65dde804", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c042831c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e4699c", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import os\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = 
{'reference_fasta': 'path_to_your_fasta_file'}\n", + "\n", + "# Load reference genome sequences\n", + "fasta_file = CONFIG['reference_fasta']\n", + "if not os.path.exists(fasta_file):\n", + " print(f\"❌ Reference genome file not found: {fasta_file}\")\n", + " print(\"Please update CONFIG['reference_fasta'] with the correct path to your reference genome\")\n", + " print(\"Download from: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\")\n", + " raise FileNotFoundError(f\"Reference genome not found: {fasta_file}\")\n", + "\n", + "print(f\"Loading reference genome from: {fasta_file}\")\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))\n", + "print(f\"✅ Loaded {len(record_dict)} chromosome sequences\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c2efa951", + "metadata": {}, + "outputs": [], + "source": [ + "chromosome_dictionary = {\n", + " \"1\": \"NC_000001.11\",\n", + " \"2\": \"NC_000002.12\",\n", + " \"3\": \"NC_000003.12\",\n", + " \"4\": \"NC_000004.12\",\n", + " \"5\": \"NC_000005.10\",\n", + " \"6\": \"NC_000006.12\",\n", + " \"7\": \"NC_000007.14\",\n", + " \"9\": \"NC_000009.12\",\n", + " \"10\": \"NC_000010.11\",\n", + " \"11\": \"NC_000011.10\",\n", + " \"12\": \"NC_000012.12\",\n", + " \"13\": \"NC_000013.11\",\n", + " \"14\": \"NC_000014.9\",\n", + " \"15\": \"NC_000015.10\",\n", + " \"16\": \"NC_000016.10\",\n", + " \"17\": \"NC_000017.11\",\n", + " \"18\": \"NC_000018.10\",\n", + " \"19\": \"NC_000019.10\",\n", + " \"20\": \"NC_000020.11\",\n", + " \"21\": \"NC_000021.9\",\n", + " \"23\": \"NC_000023.11\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "a1323f95", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0ec0979", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify reference sequences against 
genome\n", + "verification_file = CONFIG['verification_file']\n", + "print(f\"Starting sequence verification - results will be saved to: {verification_file}\")\n", + "\n", + "with open(verification_file, \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Adjust for 0-based indexing in Python\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\\n\")\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Verified {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Verification complete. 
Results saved to: {verification_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39174efe", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# Assuming CONFIG is defined somewhere above in the code\n", + "# CONFIG = {'nt_seq_dir': 'desired/path/to/nt_seq'}\n", + "\n", + "# Create nucleotide sequence directory\n", + "nt_seq_dir = Path(CONFIG['nt_seq_dir'])\n", + "nt_seq_dir.mkdir(exist_ok=True)\n", + "print(f\"Created nucleotide sequence directory: {nt_seq_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3065cf9d", + "metadata": {}, + "source": [ + "### Performing the mutation and saving the reference and variant alleles with a 2000 nt flanking window on each side (CONFIG['sequence_window'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6121945f", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate nucleotide sequences with mutations\n", + "nt_seq_dir = CONFIG['nt_seq_dir']\n", + "window = CONFIG['sequence_window']\n", + "\n", + "print(f\"Generating nucleotide sequences with {window}bp windows...\")\n", + "print(f\"Output directory: {nt_seq_dir}\")\n", + "\n", + "for i in range(len(variant_data)):\n", + " output_file = f\"{nt_seq_dir}/{variant_data.iloc[i]['Var_ID']}.txt\"\n", + " \n", + " with open(output_file, \"w\") as f:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = 
chrom_seq[region_start:region_end]\n", + " \n", + " if (variant_allele == \"deletion\"):\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + ref_seq[window + len(reference_allele):]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + del_len:]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Generated sequences for {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Sequence generation complete. {len(variant_data)} sequence files created in {nt_seq_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a83e9272-b34f-40f3-aedf-3aca0795944f", + "metadata": {}, + "source": [ + "# Adding in more Variant Data\n", + "\n", + "# Data Integration\n", + "\n", + "This section merges variant information with the main dataset to create a comprehensive database with all relevant annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9222e45a-7f9a-4762-8dd8-2cccc654ad3e", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = variant_data.merge(variant_info, on='Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae6d44d0-d1f2-4d41-b59d-f8c5888b4914", + "metadata": {}, + "outputs": [], + "source": [ + "final_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ab406cd-e9be-4885-811a-f3e2526efe8a", + "metadata": {}, + "outputs": [], + "source": [ + "# Save merged variant data\n", + "output_file = CONFIG['network_data_file']\n", + "final_data.to_csv(output_file, sep='\\t', header=True, index=False)\n", + "print(f\"✅ Final variant data with merged information saved to: {output_file}\")\n", + "print(f\"Dataset contains {len(final_data)} variants with complete information\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ecb5318-ab15-4625-b556-50f8ff39cff3", + "metadata": {}, + "source": [ + "# Pulling Disease info\n", + "\n", + "# Disease Information Processing\n", + "\n", + "This section extracts disease identifiers from the variant data and downloads corresponding disease information from KEGG to create human-readable disease names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b266aa61-7a7f-49c7-a737-578b51b95f32", + "metadata": {}, + "outputs": [], + "source": [ + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a7aa0417-b1c2-40c9-ad67-f2077d1f1d3e", + "metadata": {}, + "outputs": [], + "source": [ + "diseases = []" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "a0865917-9074-43f4-98a1-74bdb456b2e5", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(final_data)):\n", + " diseases.extend(list(ast.literal_eval(final_data['Disease'][i]).keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "8b469aee-d8fb-439d-a8bc-e8cb113ddc8f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "disease = set(diseases)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e461b5d7-2200-4dbb-b640-ffd6bf2e3ac2", + "metadata": {}, + "outputs": [], + "source": [ + "# Save disease identifiers to file\n", + "diseases_file = CONFIG['diseases_file']\n", + "with open(diseases_file, 'w') as f:\n", + " for disease_id in disease:\n", + " f.write(f\"{disease_id}\\n\")\n", + " \n", + "print(f\"✅ Saved {len(disease)} unique disease identifiers to: {diseases_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d079c88f-9e8b-4f80-bf6c-5d9a49155b86", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with disease information retrieval\n", + "print(\"Starting disease information processing...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10d814f3-66ec-4580-866e-2cc2fda34109", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "disease KEGG Disease Database\n", + "ds Release 114.0+/04-28, Apr 25\n", + " Kanehisa Laboratories\n", + " 2,912 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " ko\n", + " hsa\n", 
+ " genome\n", + " network\n", + " variant\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info disease" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f095524-d58f-4869-9d1b-5459de85329d", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull --full-help" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ed80556-4df8-4f0b-8c3e-2a6458c6dd6d", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = {'disease_info_dir': 'desired/path/to/disease_info'}\n", + "\n", + "# Create disease information directory\n", + "disease_dir = Path(CONFIG['disease_info_dir'])\n", + "disease_dir.mkdir(exist_ok=True)\n", + "print(f\"Created disease information directory: {disease_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96851b67-0689-4aa0-9208-a0cdabf95425", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 44/44 [00:06<00:00, 6.56it/s]\n" + ] + } + ], + "source": [ + "# Download disease information using kegg_pull\n", + "diseases_file = CONFIG['diseases_file']\n", + "disease_output_dir = CONFIG['disease_info_dir']\n", + "\n", + "if not os.path.exists(diseases_file):\n", + " print(f\"❌ Diseases file not found: {diseases_file}\")\n", + " print(\"Please run the previous cells to generate the diseases list\")\n", + "else:\n", + " print(f\"Downloading disease information for entries in: {diseases_file}\")\n", + " print(f\"Output directory: {disease_output_dir}\")\n", + " # Run the command to download disease information\n", + " !cat {diseases_file} | kegg_pull pull entry-ids - --output={disease_output_dir}\n", + " print(\"✅ Disease information download complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"4c01f97c-6376-4266-97e9-1d29ef207a51", + "metadata": {}, + "outputs": [], + "source": [ + "# Processing disease information files\n", + "print(\"Parsing disease information from KEGG files...\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ea01eac-ee3c-4a5e-9863-2fb061291b45", + "metadata": {}, + "outputs": [], + "source": [ + "# Parse disease information from downloaded files\n", + "diseases_file = CONFIG['diseases_file']\n", + "disease_info_dir = Path(CONFIG['disease_info_dir'])\n", + "\n", + "# Read all disease identifiers from diseases.txt\n", + "with open(diseases_file, 'r') as f:\n", + " disease_files = [line.strip() for line in f if line.strip()]\n", + "\n", + "print(f\"Processing {len(disease_files)} disease information files...\")\n", + "\n", + "# Initialize an empty dictionary\n", + "disease_info = {}\n", + "\n", + "# Function to extract the value after a keyword\n", + "def extract_value(line, key):\n", + " return line.split(key, 1)[-1].strip()\n", + "\n", + "# Process each disease file\n", + "processed_count = 0\n", + "not_found_count = 0\n", + "\n", + "for disease_id in disease_files:\n", + " file_path = disease_info_dir / f'{disease_id}.txt'\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " name = \"\"\n", + "\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line.startswith(\"NAME\"):\n", + " name = extract_value(line, \"NAME\")\n", + " break # No need to check other lines once NAME is found\n", + "\n", + " # Save into dictionary: key = disease_id, value = name\n", + " disease_info[disease_id] = name\n", + " processed_count += 1\n", + " \n", + " if processed_count % 50 == 0:\n", + " print(f\"Processed {processed_count}/{len(disease_files)} disease files...\")\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")\n", + " not_found_count += 1\n", + "\n", + "print(f\"✅ Disease processing complete: 
{processed_count} processed, {not_found_count} not found\")\n", + "print(f\"Extracted disease information for {len(disease_info)} diseases\")\n", + "\n", + "# Optional: Save the dictionary to a file (like JSON)\n", + "# import json\n", + "# with open('disease_info.json', 'w') as f:\n", + "# json.dump(disease_info, f, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4dfb4f25-776e-45c6-9eda-457b13cd77bf", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'H00135': 'Krabbe disease;',\n", + " 'H01398': 'Primary hyperammonemia (Urea cycle disorders)',\n", + " 'H00032': 'Thyroid cancer',\n", + " 'H00559': 'von Hippel-Lindau syndrome',\n", + " 'H00260': 'Pigmented micronodular adrenocortical disease',\n", + " 'H00038': 'Melanoma',\n", + " 'H00485': 'Robinow syndrome',\n", + " 'H00251': 'Thyroid dyshormonogenesis;',\n", + " 'H00194': 'Lesch-Nyhan syndrome;',\n", + " 'H00026': 'Endometrial cancer',\n", + " 'H00020': 'Colorectal cancer',\n", + " 'H00031': 'Breast cancer',\n", + " 'H02049': 'Bilateral macronodular adrenal hyperplasia',\n", + " 'H00042': 'Glioma',\n", + " 'H00063': 'Spinocerebellar ataxia (SCA)',\n", + " 'H00195': 'Adenine phosphoribosyltransferase deficiency;',\n", + " 'H00033': 'Adrenal carcinoma',\n", + " 'H00048': 'Hepatocellular carcinoma;',\n", + " 'H01522': 'Zollinger-Ellison syndrome',\n", + " 'H00019': 'Pancreatic cancer',\n", + " 'H00004': 'Chronic myeloid leukemia',\n", + " 'H00058': 'Amyotrophic lateral sclerosis (ALS);',\n", + " 'H00022': 'Bladder cancer',\n", + " 'H00056': 'Alzheimer disease;',\n", + " 'H01032': 'N-acetylglutamate synthase deficiency',\n", + " 'H00247': 'Multiple endocrine neoplasia syndrome;',\n", + " 'H00246': 'Primary hyperparathyroidism;',\n", + " 'H00039': 'Basal cell carcinoma',\n", + " 'H00021': 'Renal cell carcinoma',\n", + " 'H00013': 'Small cell lung cancer',\n", + " 'H00003': 'Acute myeloid leukemia',\n", + " 'H00018': 'Gastric cancer',\n", + " 
'H01603': 'Primary aldosteronism',\n", + " 'H00061': 'Prion disease',\n", + " 'H00014': 'Non-small cell lung cancer',\n", + " 'H00423': 'Sphingolipidosis',\n", + " 'H00024': 'Prostate cancer',\n", + " 'H01102': 'Pituitary adenomas',\n", + " 'H00034': 'Carcinoid',\n", + " 'H00059': 'Huntington disease',\n", + " 'H01431': 'Cushing syndrome',\n", + " 'H00057': 'Parkinson disease',\n", + " 'H00126': 'Gaucher disease',\n", + " 'H02221': 'Methylmalonic aciduria and homocystinuria'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "disease_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "458ca725-03e8-4b2a-98e7-f418f40190fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Reload variant data for disease processing\n", + "variant_data = pd.read_csv(CONFIG['network_data_file'], sep='\\t')\n", + "print(f\"Processing disease information for {len(variant_data)} variants\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e86ddd65-cbde-42d3-be6f-cbc54e2dda06", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# Assume disease_info is already a dictionary {\"D001\": \"Cancer\", \"D002\": \"Diabetes\", ...}\n", + "\n", + "# Create a new column to store disease dictionaries\n", + "variant_data[\"Disease_Names\"] = \"\"\n", + "\n", + "# Process each row\n", + "for idx, row in variant_data.iterrows():\n", + " try:\n", + " # Convert the string dictionary into a real dictionary\n", + " disease_dict = ast.literal_eval(row[\"Disease\"])\n", + "\n", + " # Get the disease IDs (keys)\n", + " disease_ids = disease_dict.keys()\n", + "\n", + " # Build a new dictionary: {disease_id: disease_name}\n", + " disease_names_dict = {did: disease_info.get(did, \"\") for did in disease_ids}\n", + "\n", + " # Save it into the Disease_Names column\n", + " variant_data.at[idx, \"Disease_Names\"] = disease_names_dict\n", + "\n", + " except (ValueError, 
SyntaxError):\n", + " print(f\"[Warning] Couldn't parse disease info at row {idx}\")\n", + " variant_data.at[idx, \"Disease_Names\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06a29f96-56b2-46b2-897e-d7006dd0ae52", + "metadata": {}, + "outputs": [], + "source": [ + "# Save updated variant data with disease names\n", + "output_file = CONFIG['network_data_file']\n", + "variant_data.to_csv(output_file, sep='\\t', header=True, index=False)\n", + "print(f\"✅ Updated variant data saved to: {output_file}\")\n", + "print(f\"Dataset now includes disease names for {len(variant_data)} variants\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "674b4a4a-93ab-4fdd-af73-cf0351381fe6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason-main/data/KEGG_Data_3.ipynb b/BioReason-main/data/KEGG_Data_3.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91c3e078f20c827622ef9b3db0d24b4579d68c78 --- /dev/null +++ b/BioReason-main/data/KEGG_Data_3.ipynb @@ -0,0 +1,2739 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "744b9f11-6ef8-4409-a388-fe860480c9de", + "metadata": {}, + "source": [ + "# Processing the Reasoning Trace Data and Adding in Nucleotides" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8950d38a-dfa9-4dbd-b388-941dec69b3ee", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a1c3d972-c52e-4d73-9816-e970fca3e1bb", + "metadata": 
{}, + "outputs": [], + "source": [ + "import json\n", + "from Bio import SeqIO" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c80d7741-7aaa-4c28-a93a-ad955f3da6bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: processed_variants 1450 with seqs: File exists\n" + ] + } + ], + "source": [ + "!mkdir 'processed_variants 1450 with seqs'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4021560-9130-4fdf-a640-15b5da6935a0", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(1,1450):\n", + " # Open the processed-variant JSON file\n", + " with open(f'processed_variants first 700/KEGG_{i}_processed.json', 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Parse the nucleotide FASTA file (record 0 = reference, record 1 = variant)\n", + " fasta_file = f\"nt_seq/KEGG_{i}.txt\"\n", + " sequence_list = list(SeqIO.parse(fasta_file, \"fasta\"))\n", + " ref_seq = sequence_list[0].seq\n", + " var_seq = sequence_list[1].seq\n", + "\n", + " # Add sequences to the JSON data\n", + " data[\"reference_sequence\"] = str(ref_seq)\n", + " data[\"variant_sequence\"] = str(var_seq)\n", + "\n", + " # Save the updated JSON to a new file\n", + " with open(f'processed_variants 1450 with seqs/KEGG_{i}_with_seqs.json', 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)" + ] + }, + { + "cell_type": "markdown", + "id": "4db8af16-a11f-4987-b1a6-db552c6714fb", + "metadata": {}, + "source": [ + "# Creating the Final KEGG SFT and RL Dataset\n", + "\n", + "# Final KEGG Dataset Creation\n", + "\n", + "This section creates the final machine learning dataset by combining variant data with sequences and generating structured question-answer pairs for biological reasoning tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9517d40-74e3-4ddb-bd16-95f9ab7927aa", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "53c5948f-4bde-432d-b35c-34c733eb9ad1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60c66a0d-359b-4d2a-8427-53f4d18d1047", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", + "

1449 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network Entry Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End ... \\\n", + "0 NaN 12 57751646 57751646 ... \n", + "1 NaN 12 57751646 57751646 ... \n", + "2 NaN 12 57751646 57751646 ... \n", + "3 NaN 12 57751647 57751647 ... \n", + "4 NaN 12 57751647 57751647 ... \n", + "... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 ... \n", + "1445 c.548A>G 19 10499486 10499486 ... \n", + "1446 c.662A>G 16 68808823 68808823 ... \n", + "1447 c.755T>G 16 68810264 68810264 ... \n", + "1448 c.769G>A 16 68810278 68810278 ... \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... \n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene Variant_Name \\\n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "... ... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "\n", + " Variant_Gene Variant_Gene Info \\\n", + "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "... ... ... \n", + "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1446 CDH1 cadherin 1 [KO:K05689] \n", + "1447 CDH1 cadherin 1 [KO:K05689] \n", + "1448 CDH1 cadherin 1 [KO:K05689] \n", + "\n", + " Variant_Type Disease_Names \n", + "0 NaN {'H00038': 'Melanoma'} \n", + "1 NaN {'H00038': 'Melanoma'} \n", + "2 NaN {'H00038': 'Melanoma'} \n", + "3 NaN {'H00038': 'Melanoma'} \n", + "4 NaN {'H00038': 'Melanoma'} \n", + "... ... ... 
\n", + "1444 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1445 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1446 NaN {'H00018': 'Gastric cancer'} \n", + "1447 NaN {'H00018': 'Gastric cancer'} \n", + "1448 NaN {'H00018': 'Gastric cancer'} \n", + "\n", + "[1449 rows x 24 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data = pd.read_csv(\"final_network_with_variant.tsv\", sep='\\t')\n", + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "51609538-9f96-4097-ac60-2a4a08a6e01c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'KEGG_2'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1]['Var_ID']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "846b6ee3-1e4d-44bc-ad59-4074b4ff39bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: final_data: File exists\n" + ] + } + ], + "source": [ + "!mkdir final_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56449f64-85ae-4804-8a01-3ce2afe1e6da", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import ast\n", + "from CONFIG import CONFIG\n", + "\n", + "# Create final dataset with question-answer pairs\n", + "variants_with_seqs_dir = CONFIG['variants_with_seqs_dir']\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(f\"Creating final dataset with Q&A pairs...\")\n", + "print(f\"Input: {variants_with_seqs_dir}\")\n", + "print(f\"Output: {final_data_dir}\")\n", + "print(f\"Processing range: {start_idx} to {end_idx}\")\n", + "\n", + "processed_count = 0\n", + "error_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " # Load the JSON file with sequences\n", + " 
input_file = f'{variants_with_seqs_dir}/KEGG_{i}_with_seqs.json'\n", + " if not os.path.exists(input_file):\n", + " error_count += 1\n", + " continue\n", + " \n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Build the question with fallback for inconsistent key casing\n", + " try:\n", + " chromosome = data['raw_data']['chromosome']\n", + " network = data['raw_data']['network']\n", + " except KeyError:\n", + " try:\n", + " chromosome = data['raw_data']['Chromosome']\n", + " network = data['raw_data']['Network']\n", + " except KeyError:\n", + " print(f\"[Warning] Missing chromosome/network data in {input_file}\")\n", + " error_count += 1\n", + " continue\n", + "\n", + " # Extract gene information\n", + " try:\n", + " gene_list = list(ast.literal_eval(variant_data.iloc[i-1]['Gene']).values())\n", + " gene_list_joined = ' | '.join(gene_list)\n", + " variant_gene = variant_data.iloc[i-1]['Variant_Gene']\n", + " except (KeyError, IndexError, ValueError) as e:\n", + " print(f\"[Warning] Gene information error for {input_file}: {e}\")\n", + " error_count += 1\n", + " continue\n", + "\n", + " question = (\n", + " f\"Chromosome Number: {chromosome}\\n\"\n", + " f\"Network Definition of the pathway: {network}\\n\"\n", + " f\"Genes in the pathway: {gene_list_joined}\\n\\n\"\n", + " f\"Given this context, what is the biological effect of this \"\n", + " f\"{variant_gene} allele, specifically what disease does this contribute to?\"\n", + " )\n", + "\n", + " # Append the record's existing 'answer' text (empty string if missing) as the final reasoning step\n", + " if 'reasoning' in data and 'reasoning_steps' in data['reasoning']:\n", + " data['reasoning']['reasoning_steps'].append(data.get('answer', ''))\n", + "\n", + " # Extract answer\n", + " try:\n", + " answer = data['reasoning']['labels']['disease'][0]\n", + " except (KeyError, IndexError):\n", + " print(f\"[Warning] Missing disease label in {input_file}\")\n", + " error_count += 1\n", + " 
continue\n", + "\n", + " data['question'] = question\n", + " 
data['answer'] = answer \n", + "\n", + " # Clean up unnecessary fields\n", + " if 'reasoning' in data:\n", + " for key in ['variant_id', 'hgvs', 'labels']:\n", + " data['reasoning'].pop(key, None)\n", + " data.pop('raw_data', None)\n", + "\n", + " # Save to final data directory\n", + " output_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " with open(output_file, 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)\n", + " \n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Created {processed_count} Q&A pairs...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to process variant {i}: {str(e)}\")\n", + " error_count += 1\n", + "\n", + "print(f\"✅ Final dataset creation complete:\")\n", + "print(f\" Successfully processed: {processed_count}\")\n", + "print(f\" Errors encountered: {error_count}\")\n", + "print(f\" Output directory: {final_data_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "11b3769e-33e5-4ab8-bc9d-f736913a2034", + "metadata": {}, + "source": [ + "# Fixing Disease Labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cfa4eca-c11e-4e52-ad6b-2fa7b43be2a4", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e36bc3f-07af-4b3d-bc84-d449ced55e24", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd316862-e6c7-4dd9-a06c-33f3454355b0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "# CONFIG parameters\n", + "CONFIG = {\n", + " 'final_data_dir': 'final_data',\n", + " 'variant_range': (1, 1450)\n", + "}\n", + "\n", + "# Extract disease labels from final dataset for standardization\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Extracting disease 
labels for standardization...\")\n", + "\n", + "disease = []\n", + "processed_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if os.path.exists(input_file):\n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + " \n", + " if 'answer' in data:\n", + " disease.append(data['answer'])\n", + " processed_count += 1\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not process {input_file}: {str(e)}\")\n", + "\n", + "print(f\"✅ Extracted {len(disease)} disease labels from {processed_count} files\")\n", + "print(f\"Unique diseases: {len(set(disease))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cca4846c-aec9-49f3-b919-760cb9fa4bc7", + "metadata": {}, + "outputs": [], + "source": [ + "new_disease = {'Acute Myeloid Leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", + " 'Acute myeloid leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", + " 'Adenine Phosphoribosyltransferase Deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", + " 'Adenine phosphoribosyltransferase deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", + " \"Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Alzheimer's disease (AD)\" : \"Alzheimer's disease\",\n", + " 'Amyotrophic Lateral Sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", + " 'Amyotrophic lateral sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", + " 'Basal Cell Carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", + " 'Basal cell carcinoma' : \"Basal Cell Carcinoma\",\n", + " 'Basal cell carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", + " 'Chronic Myeloid Leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", + " 'Chronic myeloid leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", + " 'Clear cell Renal Cell Carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", + " 'Clear cell renal cell carcinoma' : 
\"Clear cell Renal Cell Carcinoma\",\n", + " 'Clear cell renal cell carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", + " 'Colorectal cancer' : \"Colorectal cancer\",\n", + " 'Colorectal cancer (CRC)' : \"Colorectal cancer\",\n", + " 'Cushing syndrome' : \"Cushing syndrome\",\n", + " \"Early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Early-onset familial Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Early-onset familial Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", + " 'Familial Creutzfeldt-Jakob Disease' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob Disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob disease' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", + " \"Familial Early-Onset Alzheimer's Disease\" : \"Alzheimer's disease\",\n", + " 'Familial Isolated Pituitary Adenoma (FIPA)' : \"Pituitary Adenoma\",\n", + " \"Familial early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Familial early-onset Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", + " 'Familial isolated pituitary adenoma (FIPA)' : \"Pituitary Adenoma\",\n", + " 'Gastric cancer' : \"Gastric cancer\",\n", + " 'Gaucher disease' : \"Gaucher disease\",\n", + " 'Glioblastoma multiforme' : \"Glioblastoma multiforme\",\n", + " 'Glioblastoma multiforme (GBM)' : \"Glioblastoma multiforme\",\n", + " 'Hepatocellular carcinoma' : \"Hepatocellular carcinoma\",\n", + " 'Hepatocellular carcinoma (HCC)' : \"Hepatocellular carcinoma\",\n", + " 'Huntington disease' : \"Huntington's disease\",\n", + " 'Huntington disease (HD)' : \"Huntington's disease\",\n", + " \"Huntington's disease\" : \"Huntington's disease\",\n", + " \"Huntington's disease (HD)\" : \"Huntington's disease\",\n", + " 'Lesch-Nyhan syndrome' : \"Lesch-Nyhan syndrome\",\n", + " 'Melanoma' : \"Melanoma\",\n", + " 'Melanoma (H00038)' : 
\"Melanoma\",\n", + " 'Methylmalonic aciduria and homocystinuria (MAHC)' : \"Methylmalonic aciduria and homocystinuria\",\n", + " 'Multiple Endocrine Neoplasia type 1 (MEN1)' : \"Multiple Endocrine Neoplasia type 1\",\n", + " 'N-acetylglutamate synthase (NAGS) deficiency' : \"N-acetylglutamate synthase deficiency\",\n", + " 'Non-small cell lung cancer' : \"Non-small cell lung cancer\",\n", + " 'Non-small cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n", + " 'Non-small-cell lung cancer' : \"Non-small cell lung cancer\",\n", + " 'Non-small-cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n", + " 'Pancreatic ductal adenocarcinoma' : \"Pancreatic ductal adenocarcinoma\",\n", + " 'Papillary Renal Cell Carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", + " 'Papillary renal cell carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", + " 'Papillary thyroid carcinoma' : \"Papillary thyroid carcinoma\",\n", + " 'Papillary thyroid carcinoma (PTC)' : \"Papillary thyroid carcinoma\",\n", + " \"Parkinson's Disease\" : \"Parkinson's Disease\",\n", + " \"Parkinson's disease\" : \"Parkinson's Disease\",\n", + " \"Parkinson's disease (PD)\" : \"Parkinson's Disease\",\n", + " 'Pituitary adenoma' : \"Pituitary Adenoma\",\n", + " 'Primary Aldosteronism' : \"Primary Aldosteronism\",\n", + " 'Primary aldosteronism' : \"Primary Aldosteronism\",\n", + " 'Prion disease' : \"Prion disease\",\n", + " 'Prion diseases' : \"Prion disease\",\n", + " 'Prostate cancer' : \"Prostate cancer\",\n", + " 'Renal cell cancer (RCC)' : \"Renal cell carcinoma\",\n", + " 'Renal cell carcinoma' : \"Renal cell carcinoma\",\n", + " 'Renal cell carcinoma (RCC)' : \"Renal cell carcinoma\",\n", + " 'Robinow syndrome' : \"Robinow syndrome\",\n", + " 'Sphingolipidoses' : \"Sphingolipidoses\",\n", + " 'Sphingolipidosis' : \"Sphingolipidoses\",\n", + " 'Spinocerebellar Ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 
'Spinocerebellar Ataxia Type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 14 (SCA14)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 15 (SCA15)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 3' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 19 (SCA19)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 19/22 (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 3' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", + " 'Thyroid cancer' : \"Thyroid cancer\",\n", + " 'Thyroid dyshormonogenesis' : \"Thyroid dyshormonogenesis\",\n", + " 'Urothelial carcinoma' : \"Urothelial carcinoma\",\n", + " 'von Hippel-Lindau syndrome' : \"von Hippel-Lindau syndrome\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2451ebb1-a9d8-494c-9f7e-4f800cd158e8", + "metadata": {}, + "outputs": [], + "source": [ + 
"!mkdir final_data_fix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71719e5-5215-4559-a47d-dfc160779260", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "# CONFIG parameters\n", + "CONFIG = {\n", + " 'final_data_dir': 'final_data',\n", + " 'final_data_fix_dir': 'final_data_fix',\n", + " 'variant_range': (1, 1450)\n", + "}\n", + "\n", + "# NOTE(review): placeholder new_disease mapping for demonstration only -- this assignment overwrites the full mapping built in the earlier cell; remove it before a real run\n", + "new_disease = {\n", + " \"disease_A\": \"new_disease_A\",\n", + " \"disease_B\": \"new_disease_B\"\n", + " # Add more mappings as needed\n", + "}\n", + "\n", + "# Standardize disease labels using the mapping dictionary\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Applying disease label standardization...\")\n", + "print(f\"Input: {final_data_dir}\")\n", + "print(f\"Output: {final_data_fix_dir}\")\n", + "\n", + "processed_count = 0\n", + "error_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if not os.path.exists(input_file):\n", + " continue\n", + " \n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Get original answer\n", + " temp = data.get('answer', '')\n", + " \n", + " # Apply standardization if mapping exists\n", + " if temp in new_disease:\n", + " data['answer'] = new_disease[temp]\n", + " else:\n", + " print(f\"[Warning] No mapping found for disease: {temp}\")\n", + " \n", + " # Save to standardized directory\n", + " output_file = f'{final_data_fix_dir}/KEGG_{i}_with_seqs.json'\n", + " with open(output_file, 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)\n", + " \n", + " processed_count += 1\n", + 
" \n", + " if processed_count % 100 == 0:\n", + " print(f\"Standardized {processed_count} disease labels...\")\n", 
+ " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to process {input_file}: {str(e)}\")\n", + " error_count += 1\n", + "\n", + "print(f\"✅ Disease label standardization complete:\")\n", + "print(f\" Successfully processed: {processed_count}\")\n", + "print(f\" Errors encountered: {error_count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a12df3e-9ceb-4a51-acaf-e2931792a844", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove original final_data directory and replace with standardized version\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "\n", + "import shutil\n", + "import os\n", + "\n", + "if os.path.exists(final_data_dir):\n", + " shutil.rmtree(final_data_dir)\n", + " print(f\"Removed original directory: {final_data_dir}\")\n", + "else:\n", + " print(f\"Directory not found: {final_data_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbba2c19-08f6-4769-b38d-a64d8643e142", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from your_config_module import CONFIG # Adjust the import based on your project structure\n", + "\n", + "# Rename standardized directory to final_data\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "\n", + "if os.path.exists(final_data_fix_dir):\n", + " os.rename(final_data_fix_dir, final_data_dir)\n", + " print(f\"Renamed {final_data_fix_dir} to {final_data_dir}\")\n", + " print(\"✅ Final dataset with standardized disease labels is ready\")\n", + "else:\n", + " print(f\"Directory not found: {final_data_fix_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c87a0df-09c8-4fb6-baca-21a9cdd65b85", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = {\n", + "# 
'final_data_dir': 'path_to_final_data_dir',\n", + "# 'variant_range': (1, 1450)\n", + "# }\n", + "\n", + "# Verify standardized disease labels\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Verifying standardized disease labels...\")\n", + "\n", + "disease = []\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if os.path.exists(input_file):\n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + " \n", + " if 'answer' in data:\n", + " disease.append(data['answer'])\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not verify {input_file}: {str(e)}\")\n", + "\n", + "print(f\"✅ Verification complete:\")\n", + "print(f\" Total disease labels: {len(disease)}\")\n", + "print(f\" Unique diseases: {len(set(disease))}\")\n", + "print(f\" Top 10 diseases: {list(set(disease))[:10]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "60f75d92-e2f2-495f-ba8f-cb423410f1f4", + "metadata": {}, + "source": [ + "# Saving the KEGG Task to the WangLab Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a069a67-b410-4adf-ab75-62eca67ab259", + "metadata": {}, + "outputs": [], + "source": [ + "cd ../../bioR_tasks" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10e9f0fb-4943-41bf-bef3-9fcd64796ddf", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir kegg_variant" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cced244e-9d03-47be-8fa1-864f2736fe01", + "metadata": {}, + "outputs": [], + "source": [ + "cp ../BioReason/data/kegg_data/final_data/* kegg_variant/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff9ce06-2cd8-4675-a23f-080027770bdb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "4c56a919", + "metadata": {}, + "source": 
[ + "# Creating the Nt Variant Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c28bc9f", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7618faf2", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd\n", + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Optional: Uncomment if you want to use HuggingFace datasets\n", + "# from datasets import load_dataset, Dataset, DatasetDict\n", + "\n", + "print(\"Imports loaded for nucleotide database creation\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b8cac05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", + "

1449 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network Entry Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End ... \\\n", + "0 NaN 12 57751646 57751646 ... \n", + "1 NaN 12 57751646 57751646 ... \n", + "2 NaN 12 57751646 57751646 ... \n", + "3 NaN 12 57751647 57751647 ... \n", + "4 NaN 12 57751647 57751647 ... \n", + "... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 ... \n", + "1445 c.548A>G 19 10499486 10499486 ... \n", + "1446 c.662A>G 16 68808823 68808823 ... \n", + "1447 c.755T>G 16 68810264 68810264 ... \n", + "1448 c.769G>A 16 68810278 68810278 ... \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... \n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene Variant_Name \\\n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "... ... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "\n", + " Variant_Gene Variant_Gene Info \\\n", + "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "... ... ... \n", + "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1446 CDH1 cadherin 1 [KO:K05689] \n", + "1447 CDH1 cadherin 1 [KO:K05689] \n", + "1448 CDH1 cadherin 1 [KO:K05689] \n", + "\n", + " Variant_Type Disease_Names \n", + "0 NaN {'H00038': 'Melanoma'} \n", + "1 NaN {'H00038': 'Melanoma'} \n", + "2 NaN {'H00038': 'Melanoma'} \n", + "3 NaN {'H00038': 'Melanoma'} \n", + "4 NaN {'H00038': 'Melanoma'} \n", + "... ... ... 
\n", + "1444 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1445 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1446 NaN {'H00018': 'Gastric cancer'} \n", + "1447 NaN {'H00018': 'Gastric cancer'} \n", + "1448 NaN {'H00018': 'Gastric cancer'} \n", + "\n", + "[1449 rows x 24 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load variant data for nucleotide database creation\n", + "network_file = CONFIG['network_data_file']\n", + "variant_data = pd.read_csv(network_file, sep='\\t')\n", + "print(f\"✅ Loaded variant data: {len(variant_data)} entries\")\n", + "variant_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a7d31451", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc9baca9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928146a6", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import os\n", + "\n", + "# Load reference genome sequences\n", + "fasta_file = CONFIG['reference_fasta']\n", + "if not os.path.exists(fasta_file):\n", + " print(f\"❌ Reference genome file not found: {fasta_file}\")\n", + " print(\"Please update CONFIG['reference_fasta'] with correct path\")\n", + " raise FileNotFoundError(f\"Reference genome not found: {fasta_file}\")\n", + "\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))\n", + "print(f\"✅ Loaded reference genome: {len(record_dict)} sequences\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "e3184e72", + "metadata": {}, + "outputs": [], + "source": [ + "# Use chromosome dictionary from configuration\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "print(f\"✅ Chromosome mapping loaded: {len(chromosome_dictionary)} chromosomes\")\n", + "print(\"Available chromosomes:\", list(chromosome_dictionary.keys()))" + ] + }, + { + "cell_type": "markdown", + "id": "1cd34cc2", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70cc6625", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify reference sequences (alternative implementation)\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "verification_file = \"verification_alt.txt\"\n", + "\n", + "print(f\"Starting alternative sequence verification...\")\n", + "print(f\"Results will be saved to: {verification_file}\")\n", + "\n", + "with open(verification_file, \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " try:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Adjust for 0-based indexing in Python\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → 
'{reference_allele}' matches genome\\n\")\n", + " \n", + " except Exception as e:\n", + " f.write(f\"❌ Error verifying variant {i}: {str(e)}\\n\")\n", + " \n", + " if (i + 1) % 200 == 0:\n", + " print(f\"Verified {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Alternative verification complete. Results: {verification_file}\")" + ] + }, + { + "cell_type": "markdown", + "id": "83c0dcce-81b3-4162-a683-3ba86d065eb7", + "metadata": {}, + "source": [ + "## Read in Final_data JSON files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9745a67d-3b2a-4679-92c3-92fc199a8763", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswerreference_sequencevariant_sequencereasoning.reasoning_stepsIDtemp_ID
0Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is an insertion in the PR...KEGG_854854
1Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is a deletion of 47 nucle...KEGG_841841
2Chromosome Number: 21\\nNetwork Definition of t...Alzheimer's diseaseGCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...[Step 1: The TC>GA mutation in the APP gene on...KEGG_468468
3Chromosome Number: 1\\nNetwork Definition of th...Primary AldosteronismAATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...[Step 1: The variant KEGG_635 is a 15-nucleoti...KEGG_635635
4Chromosome Number: 14\\nNetwork Definition of t...Spinocerebellar AtaxiaTCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...[Step 1: The variant is a trinucleotide repeat...KEGG_620620
........................
1444Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiagaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...[Step 1: The variant KEGG_286 is an A>G substi...KEGG_286286
1445Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiaTTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...[Step 1: The variant is a single cytosine (C) ...KEGG_293293
1446Chromosome Number: 12\\nNetwork Definition of t...Pituitary AdenomaGTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...[Step 1: The variant is a 20-nucleotide duplic...KEGG_77
1447Chromosome Number: 11\\nNetwork Definition of t...Spinocerebellar AtaxiaATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...[Step 1: The variant KEGG_1285 is an A>G subst...KEGG_12851285
1448Chromosome Number: 7\\nNetwork Definition of th...MelanomatataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...[Step 1: The variant involves a nucleotide cha...KEGG_12901290
\n", + "

1449 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 Chromosome Number: 20\\nNetwork Definition of t... \n", + "1 Chromosome Number: 20\\nNetwork Definition of t... \n", + "2 Chromosome Number: 21\\nNetwork Definition of t... \n", + "3 Chromosome Number: 1\\nNetwork Definition of th... \n", + "4 Chromosome Number: 14\\nNetwork Definition of t... \n", + "... ... \n", + "1444 Chromosome Number: 6\\nNetwork Definition of th... \n", + "1445 Chromosome Number: 6\\nNetwork Definition of th... \n", + "1446 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1447 Chromosome Number: 11\\nNetwork Definition of t... \n", + "1448 Chromosome Number: 7\\nNetwork Definition of th... \n", + "\n", + " answer \\\n", + "0 Creutzfeldt-Jakob Disease \n", + "1 Creutzfeldt-Jakob Disease \n", + "2 Alzheimer's disease \n", + "3 Primary Aldosteronism \n", + "4 Spinocerebellar Ataxia \n", + "... ... \n", + "1444 Spinocerebellar Ataxia \n", + "1445 Spinocerebellar Ataxia \n", + "1446 Pituitary Adenoma \n", + "1447 Spinocerebellar Ataxia \n", + "1448 Melanoma \n", + "\n", + " reference_sequence \\\n", + "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", + "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... \n", + "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", + "... ... \n", + "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", + "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", + "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", + "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", + "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", + "\n", + " variant_sequence \\\n", + "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", + "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... 
\n", + "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", + "... ... \n", + "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", + "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", + "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", + "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", + "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", + "\n", + " reasoning.reasoning_steps ID temp_ID \n", + "0 [Step 1: The variant is an insertion in the PR... KEGG_854 854 \n", + "1 [Step 1: The variant is a deletion of 47 nucle... KEGG_841 841 \n", + "2 [Step 1: The TC>GA mutation in the APP gene on... KEGG_468 468 \n", + "3 [Step 1: The variant KEGG_635 is a 15-nucleoti... KEGG_635 635 \n", + "4 [Step 1: The variant is a trinucleotide repeat... KEGG_620 620 \n", + "... ... ... ... \n", + "1444 [Step 1: The variant KEGG_286 is an A>G substi... KEGG_286 286 \n", + "1445 [Step 1: The variant is a single cytosine (C) ... KEGG_293 293 \n", + "1446 [Step 1: The variant is a 20-nucleotide duplic... KEGG_7 7 \n", + "1447 [Step 1: The variant KEGG_1285 is an A>G subst... KEGG_1285 1285 \n", + "1448 [Step 1: The variant involves a nucleotide cha... 
KEGG_1290 1290 \n", + "\n", + "[1449 rows x 7 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import os\n", + "import json\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "# Read final dataset JSON files and create combined DataFrame\n", + "\n", + "# Path to the directory containing JSON files\n", + "json_dir = CONFIG['final_data_dir']\n", + "if not os.path.exists(json_dir):\n", + " print(f\"❌ JSON directory not found: {json_dir}\")\n", + " print(\"Please ensure previous processing steps completed successfully\")\n", + " raise FileNotFoundError(f\"Directory not found: {json_dir}\")\n", + "\n", + "print(f\"Processing JSON files from: {json_dir}\")\n", + "\n", + "# Initialize a list to hold DataFrames\n", + "df_list = []\n", + "processed_count = 0\n", + "\n", + "# Loop through all files in the directory\n", + "for filename in os.listdir(json_dir):\n", + " if filename.endswith(\".json\"):\n", + " match = re.search(r\"(KEGG_\\d+)_with_seqs\", filename)\n", + " if match:\n", + " kegg_id = match.group(1) # Extract 'KEGG_'\n", + " file_path = os.path.join(json_dir, filename)\n", + " \n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " data = json.load(f)\n", + " \n", + " df = pd.json_normalize(data)\n", + " df['ID'] = kegg_id # Add the full KEGG ID string\n", + " df['temp_ID'] = int(kegg_id[5:]) # Extract numeric ID for sorting\n", + " df_list.append(df)\n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Processed {processed_count} JSON files...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not process {filename}: {str(e)}\")\n", + "\n", + "# Concatenate all DataFrames into one\n", + "if df_list:\n", + " combined_df = pd.concat(df_list, ignore_index=True)\n", + " print(f\"✅ Combined {len(df_list)} JSON files into DataFrame\")\n", + " print(f\"Total samples: 
{len(combined_df)}\")\n", + "else:\n", + " print(\"❌ No JSON files found or processed successfully\")\n", + " combined_df = pd.DataFrame()\n", + "\n", + "# Display the result\n", + "combined_df.head() if not combined_df.empty else print(\"No data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a81e8836-9618-4e62-b192-ee397a063ce7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "46c1083a-d499-428e-9180-2b62e83f1751", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = combined_df.sort_values(by=['temp_ID'])\n", + "combined_df = combined_df.rename(columns={\"reasoning.reasoning_steps\" : \"reasoning\"})\n", + "combined_df = combined_df.drop(columns=['temp_ID'])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "1c3e8a2e-444e-4d48-b4c1-c8b5dea5753e", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = combined_df[['ID','question','answer','reference_sequence','variant_sequence','reasoning']]\n", + "combined_df = combined_df.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4200c786-4365-407e-96d4-f5cabfc7b3b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagagctgagatcatgccactgcactccaacctgggcaacagagcgag...gagctgagatcatgccactgcactccaacctgggcaacagagcgag...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomaTGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerCAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", + "

1449 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID question \\\n", + "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", + "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", + "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", + "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", + "... ... ... \n", + "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", + "\n", + " answer \\\n", + "0 Melanoma \n", + "1 Melanoma \n", + "2 Melanoma \n", + "3 Melanoma \n", + "4 Melanoma \n", + "... ... \n", + "1444 Hepatocellular carcinoma \n", + "1445 Hepatocellular carcinoma \n", + "1446 Gastric cancer \n", + "1447 Gastric cancer \n", + "1448 Gastric cancer \n", + "\n", + " reference_sequence \\\n", + "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "... ... \n", + "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", + "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", + "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", + "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", + "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", + "\n", + " variant_sequence \\\n", + "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... 
\n", + "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "... ... \n", + "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", + "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", + "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", + "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", + "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", + "\n", + " reasoning \n", + "0 [Step 1: The C>T mutation at position 57751646... \n", + "1 [Step 1: The C>A mutation at position 57751646... \n", + "2 [Step 1: The C>G mutation at position 57751646... \n", + "3 [Step 1: The G>A mutation at position 57751647... \n", + "4 [Step 1: The G>C mutation at position 57751647... \n", + "... ... \n", + "1444 [Step 1: The variant is a C>A substitution at ... \n", + "1445 [Step 1: The variant is a T>C substitution at ... \n", + "1446 [Step 1: The variant KEGG_1447 represents an A... \n", + "1447 [Step 1: The variant KEGG_1448 is a T>G substi... \n", + "1448 [Step 1: The variant KEGG_1449 is a G>A substi... 
\n", + "\n", + "[1449 rows x 6 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "markdown", + "id": "f5cd7e22", + "metadata": {}, + "source": [ + "### Performing the mutation and saving the reference and variant allele with a 1000 nt window" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "8c89d455-598d-45e3-821b-6e37075b3a77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4001" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(combined_df.iloc[0]['reference_sequence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a1dd3ed8-18ca-4468-9ab9-98ebf4713260", + "metadata": {}, + "outputs": [], + "source": [ + "KEGG_2000 = combined_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "688a7d0b-4a31-484d-9835-eb66d674b5de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'KEGG_2'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "KEGG_2000.at[1,'ID']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fc35c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate sequences with updated window size\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "window = CONFIG['sequence_window']\n", + "\n", + "print(f\"Generating sequences with {window}bp windows...\")\n", + "KEGG_2000 = combined_df.copy()\n", + "\n", + "for i in range(len(KEGG_2000)):\n", + " try:\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + 
" variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = chrom_seq[region_start:region_end]\n", + "\n", + " if (variant_allele == \"deletion\"):\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + ref_seq[window + len(reference_allele):]\n", + "\n", + " KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n", + " KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n", + " \n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + del_len:]\n", + "\n", + " KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n", + " KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Generated sequences for {i + 1}/{len(KEGG_2000)} variants...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to generate sequence for variant {i}: {str(e)}\")\n", + "\n", + "print(f\"✅ Sequence generation complete for {window}bp windows\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "e2a50c08-ccae-45ca-98e1-0c3d3e7d4647", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagcactccagcctgggcaacagagcaagagagacagggtcttactct...gcactccagcctgggcaacagagcaagagagacagggtcttactct...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomactcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertttgagatagggtttcactctgtcacccaggctggaaccacaacct...tttgagatagggtttcactctgtcacccaggctggaaccacaacct...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertcactctgtcacccaggctggaaccacaacctccacttcccgggtt...tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", + "

1449 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID question \\\n", + "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", + "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", + "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", + "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", + "... ... ... \n", + "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", + "\n", + " answer \\\n", + "0 Melanoma \n", + "1 Melanoma \n", + "2 Melanoma \n", + "3 Melanoma \n", + "4 Melanoma \n", + "... ... \n", + "1444 Hepatocellular carcinoma \n", + "1445 Hepatocellular carcinoma \n", + "1446 Gastric cancer \n", + "1447 Gastric cancer \n", + "1448 Gastric cancer \n", + "\n", + " reference_sequence \\\n", + "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "... ... \n", + "1444 gcactccagcctgggcaacagagcaagagagacagggtcttactct... \n", + "1445 ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... \n", + "1446 ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... \n", + "1447 tttgagatagggtttcactctgtcacccaggctggaaccacaacct... \n", + "1448 tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... \n", + "\n", + " variant_sequence \\\n", + "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... 
\n", + "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "... ... \n", + "1444 gcactccagcctgggcaacagagcaagagagacagggtcttactct... \n", + "1445 ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... \n", + "1446 ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... \n", + "1447 tttgagatagggtttcactctgtcacccaggctggaaccacaacct... \n", + "1448 tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... \n", + "\n", + " reasoning \n", + "0 [Step 1: The C>T mutation at position 57751646... \n", + "1 [Step 1: The C>A mutation at position 57751646... \n", + "2 [Step 1: The C>G mutation at position 57751646... \n", + "3 [Step 1: The G>A mutation at position 57751647... \n", + "4 [Step 1: The G>C mutation at position 57751647... \n", + "... ... \n", + "1444 [Step 1: The variant is a C>A substitution at ... \n", + "1445 [Step 1: The variant is a T>C substitution at ... \n", + "1446 [Step 1: The variant KEGG_1447 represents an A... \n", + "1447 [Step 1: The variant KEGG_1448 is a T>G substi... \n", + "1448 [Step 1: The variant KEGG_1449 is a G>A substi... 
\n", + "\n", + "[1449 rows x 6 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "KEGG_2000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c939b5-0768-4565-873a-10cba7396d99", + "metadata": {}, + "outputs": [], + "source": [ + "# Create dataset structure (HuggingFace datasets optional)\n", + "try:\n", + " from datasets import Dataset, DatasetDict\n", + " \n", + " # Create Hugging Face Datasets\n", + " train_dataset = Dataset.from_pandas(KEGG_2000)\n", + " \n", + " # Combine into a DatasetDict\n", + " dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " })\n", + " \n", + " print(\"✅ HuggingFace dataset created\")\n", + " use_hf_datasets = True\n", + " \n", + "except ImportError:\n", + " print(\"⚠️ HuggingFace datasets not available, using pandas only\")\n", + " dataset = KEGG_2000\n", + " train_dataset = KEGG_2000\n", + " use_hf_datasets = False\n", + "\n", + "print(f\"Final dataset contains {len(train_dataset)} samples\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "afa07e17-e86a-41d8-9db3-5df6d77443f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning'],\n", + " num_rows: 1449\n", + " })\n", + "})" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "affe2720-e748-45d2-97d0-0baf1d6530ce", + "metadata": {}, + "outputs": [], + "source": [ + "# Save final dataset locally instead of uploading to HuggingFace\n", + "# Users can upload to their own repositories if needed\n", + "\n", + "output_file = \"kegg_variant_dataset_final.parquet\"\n", + "dataset_info_file = \"dataset_info.json\"\n", + "\n", + "# Save dataset as Parquet for efficient 
storage\n", + "train_dataset.to_parquet(output_file)\n", + "print(f\"✅ Dataset saved to: {output_file}\")\n", + "\n", + "# Save dataset information\n", + "dataset_info = {\n", + " \"name\": \"KEGG Variant Dataset\",\n", + " \"description\": \"Genetic variants with biological reasoning for disease association\",\n", + " \"total_samples\": len(train_dataset),\n", + " \"sequence_length\": f\"~{CONFIG['sequence_window']*2}bp\",\n", + " \"features\": list(train_dataset.column_names),\n", + " \"diseases\": len(set(disease)) if 'disease' in locals() else \"Unknown\",\n", + " \"created_by\": \"KEGG Data Processing Pipeline\",\n", + " \"version\": \"1.0\"\n", + "}\n", + "\n", + "with open(dataset_info_file, 'w') as f:\n", + " json.dump(dataset_info, f, indent=2)\n", + " \n", + "print(f\"✅ Dataset information saved to: {dataset_info_file}\")\n", + "print(f\"\\nDataset ready for use:\")\n", + "print(f\" - Main dataset: {output_file}\")\n", + "print(f\" - Information: {dataset_info_file}\")\n", + "print(f\" - Samples: {len(train_dataset)}\")\n", + "print(f\" - Features: {train_dataset.column_names}\")\n", + "\n", + "print(\"\\n📝 To upload to HuggingFace Hub:\")\n", + "print(\"dataset.push_to_hub('your-username/your-dataset-name')\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b448bd7-e256-4fad-ae95-dbe299d380f0", + "metadata": {}, + "source": [ + "# KEGG Dataset with Alternative Window Size\n", + "\n", + "This section demonstrates creating the dataset with different sequence window parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd609ca-6276-4425-997f-0589fe03f1ea", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason-main/pyproject.toml b/BioReason-main/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..1a022fc575a30aec818f2629b4f307b0d85d662a --- /dev/null +++ b/BioReason-main/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "bioreason" +version = "0.1.0" +description = "Bio-related Reasoning with Language Models" +readme = "README.md" +requires-python = ">=3.11" +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = [ + "torch", + "torchvision", + "transformers", + "accelerate", + "qwen-vl-utils", + "jupyter", + "datasets", + "peft", + "pytorch_lightning", + "wandb", + "trl[vllm]", + "bitsandbytes", + "deepspeed", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "black", + "isort", + "mypy", +] + +[tool.setuptools] +packages = ["bioreason"] + +[tool.black] +line-length = 88 +target-version = ["py311"] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true \ No newline at end of file diff --git a/BioReason-main/reason.py 
b/BioReason-main/reason.py new file mode 100644 index 0000000000000000000000000000000000000000..d6c48ed16166f8cf010cf065aa9bfcf2a9f9c8ed --- /dev/null +++ b/BioReason-main/reason.py @@ -0,0 +1,610 @@ +import os +import re + +import pathlib +from argparse import ArgumentParser +from typing import List, Dict, Optional +from dataclasses import dataclass, field + +import torch +from torch import nn +import torch.nn.functional as F +from torch.optim import AdamW +from torch.utils.data import DataLoader, Dataset +from transformers import get_cosine_schedule_with_warmup, AutoTokenizer + +from transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoProcessor, +) + +from datasets import load_dataset, DatasetDict + +from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training +from transformers import BitsAndBytesConfig + +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from pytorch_lightning.loggers import WandbLogger + +from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config +#from unsloth import FastLanguageModel, is_bfloat16_supported + +from bioreason.models.dna_llm import DNALLMModel +from bioreason.dna_modules import NucleotideDNAModule +from bioreason.models.dl.processing_dl import DLProcessor +from bioreason.trainer import DNALLMGRPOTrainer, DNALLMGRPOConfig +from bioreason.models.evo2_tokenizer import Evo2Tokenizer, register_evo2_tokenizer +register_evo2_tokenizer() + +# Custom TrainerCallback to override the saving mechanism +from transformers import TrainerCallback, TrainerState, TrainerControl +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + +class SaveWithPyTorchCallback(TrainerCallback): + """Custom callback to save models with PyTorch's native save mechanism instead of safetensors""" + def on_save(self, args, state, control, **kwargs): + # Get the checkpoint folder + checkpoint_folder = 
os.path.join( + args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" + ) + os.makedirs(checkpoint_folder, exist_ok=True) + + # Save with PyTorch instead of safetensors + checkpoint_path = os.path.join(checkpoint_folder, "pytorch_model.bin") + model = kwargs.get("model") + + # Get model unwrapped from accelerator etc. + unwrapped_model = model.module if hasattr(model, "module") else model + + # Save using PyTorch directly + torch.save(unwrapped_model.state_dict(), checkpoint_path) + + # DNALLMModel doesn't have a direct config attribute, so we need to save + # the configs of its sub-models + if hasattr(unwrapped_model, "text_model"): + if hasattr(unwrapped_model.text_model, "config"): + unwrapped_model.text_model.config.save_pretrained(checkpoint_folder) + # Handle PEFT models which might have base_model + elif hasattr(unwrapped_model.text_model, "base_model") and hasattr(unwrapped_model.text_model.base_model, "config"): + unwrapped_model.text_model.base_model.config.save_pretrained(checkpoint_folder) + + # Print info about what's being saved + print(f"Saved model checkpoint to {checkpoint_folder}") + lora_params = [k for k in unwrapped_model.state_dict().keys() if "lora" in k] + print(f"Checkpoint contains {len(lora_params)} LoRA parameters") + + # Signal that we've saved + control.should_save = False + return control + +def _get_target_modules(model: DNALLMModel): + # Apply LoRA to all linear layers in the text model + target_modules = [] + + # Get all unique linear layer names + seen_names = set() + for name, module in model.text.named_modules(): + if isinstance(module, torch.nn.Linear): + names = name.split(".") + target_name = names[-1] # Use the last part of the name + + # Skip output head but include all other linear layers + if target_name != "lm_head" and target_name not in seen_names: + target_modules.append(target_name) + seen_names.add(target_name) + + # Add attention-specific layers + attention_patterns = [ + "q_proj", + "k_proj", + "v_proj", 
+ "out_proj", + "query", + "key", + "value", + ] + for pattern in attention_patterns: + if pattern not in seen_names: + target_modules.append(pattern) + + # Return all unique layer names to apply LoRA to all layers + return list(target_modules) + + +def extract_xml_answer(text: str) -> str: + # answer = text.split("")[-1] + # answer = answer.split("")[0] + answer = text.split("")[-1] + return answer.strip() + +def extract_hash_answer(text: str) -> str | None: + if "####" not in text: + return None + return text.split("####")[1].strip() + +def get_kegg_questions() -> Dataset: + data = load_dataset('wanglab/kegg', 'default') # type: ignore + example_dna_sequences = ["ATCTACATGCAT", "CAGCAGCTACAG", "CATCACATCGACATCGAC"] + num_dna_sequences = 2 # TODO: Change to 2! + + data = data.map(lambda x: { # type: ignore + 'prompt': [ + + { + 'role': 'user', + 'content': [ + *({'type': 'dna', 'text': None} for _ in range(num_dna_sequences)), + {'type': 'text', 'text': x['question']}, + ], + }, + ], + 'dna_sequences': [x['reference_sequence'], x['variant_sequence']], + 'answer': x['answer'], + }) # type: ignore + + return data + +# uncomment middle messages for 1-shot prompting +def get_gsm8k_questions(question_prompt: str) -> Dataset: + data = load_dataset('openai/gsm8k', 'main') # type: ignore + + example_dna_sequences = ["ATCTACATGCAT", "CAGCAGCTACAG", "CATCACATCGACATCGAC"] + data = data.map(lambda x: { # type: ignore + 'prompt': [ + + { + 'role': 'user', + 'content': [ + *({'type': 'dna', 'text': None} for _ in range(len(example_dna_sequences))), + {'type': 'text', 'text': 'Give me a short introduction to large language model.'} + ] + }, + ], + 'dna_sequences': [dna for dna in example_dna_sequences], + 'answer': extract_hash_answer(x['answer']), + }) # type: ignore + + return data # type: ignore + +def get_gsm8k_questions_old(question_prompt: str) -> Dataset: + data = load_dataset('openai/gsm8k', 'main') # type: ignore + + example_dna_sequences = ["ATCTACATGCAT", 
"CAGCAGCTACAG", "CATCACATCGACATCGAC"] + data = data.map(lambda x: { # type: ignore + 'prompt': [ + { + 'role': 'user', + 'content': [ + *({'type': 'dna', 'text': None} for _ in range(len(example_dna_sequences))), + {'type': 'text', 'text': question_prompt.format(Question=x['question'])} + ] + }, + ], + 'dna_sequences': [dna for dna in example_dna_sequences], + 'answer': extract_hash_answer(x['answer']), + }) # type: ignore + + return data # type: ignore + +# Reward functions +def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]: + responses = [completion[0]['content'] for completion in completions] + q = prompts[0][-1]['content'] + extracted_responses = [extract_xml_answer(r) for r in responses] + # extracted_responses = [r.lower().replace("answer:", "").strip() for r in extracted_responses] + print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}") + return [2.0 if a.lower() in r.lower() else 0.0 for r, a in zip(extracted_responses, answer[0])] + +def less_than_4_reward_func(completions, **kwargs) -> list[float]: + responses = [completion[0]['content'] for completion in completions] + extracted_responses = [extract_xml_answer(r) for r in responses] + return [0.5 if len(r.split(' ')) <= 4 else 0.0 for r in extracted_responses] + +def strict_format_reward_func(completions, **kwargs) -> list[float]: + """Reward function that checks if the completion has a specific format.""" + pattern = r"^\n.*?\n\n.*?\n$" + responses = [completion[0]["content"] for completion in completions] + matches = [re.match(pattern, r) for r in responses] + return [0.5 if match else 0.0 for match in matches] + +def soft_format_reward_func(completions, **kwargs) -> list[float]: + """Reward function that checks if the completion has a specific format.""" + pattern = r".*?\s*.*?" 
+ responses = [completion[0]["content"] for completion in completions] + matches = [re.match(pattern, r) for r in responses] + return [0.5 if match else 0.0 for match in matches] + +def count_xml(text) -> float: + count = 0.0 + if text.count("\n") == 1: + count += 0.125 + if text.count("\n\n") == 1: + count += 0.125 + return count + +def xmlcount_reward_func(completions, **kwargs) -> list[float]: + contents = [completion[0]["content"] for completion in completions] + return [count_xml(c) for c in contents] + +# Format into conversation +def make_conversation(example): + return { + "prompt": [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": example["problem"]}, + ], + } + +def make_conversation_image(example): + return { + "prompt": [ + { + "role": "user", + "content": [ + {"type": "image"}, + ], + }, + ], + } + +@dataclass +class GRPOModelConfig(ModelConfig): + + # "HuggingFaceTB/SmolLM-135M-Instruct" + # "Qwen/Qwen2.5-0.5B-Instruct" + model_name_or_path: str = field(default="Qwen/Qwen3-0.6B", metadata={"help": "Model checkpoint for weights initialization."}) + dna_model_name_or_path: str = field(default="InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", metadata={"help": "Model checkpoint for weights initialization."}) + cache_dir: str = field(default=None, metadata={"help": "Path to model cache directory."}) + max_length_text: int = field(default=800, metadata={"help": "Maximum length of text sequences."}) + max_length_dna: int = field(default=800, metadata={"help": "Maximum length of DNA sequences, in groups of 6 nucleotides."}) + sft_checkpoint: str = field(default=None, metadata={"help": "Path to the checkpoint for SFT."}) + lora_r: int = field(default=32, metadata={"help": "LoRA R value."}) + lora_alpha: int = field(default=64, metadata={"help": "LoRA alpha."}) + lora_dropout: float = field(default=0.05, metadata={"help": "LoRA dropout."}) + lora_modules_to_save: Optional[list[str]] = field( + default="embed_tokens", + 
metadata={"help": "Model layers to unfreeze & train."}, + ) + freeze_dna_modules: bool = False + +@dataclass +class GRPOScriptArguments(ScriptArguments): + """ + Script arguments for the GRPO training script. + """ + dataset_name: str = field(default="wanglab/kegg", metadata={"help": "Dataset name with default."}) + data_file_paths: str = field( + default=None, + metadata={"help": "Paths to data files, separated by ':'"}, + ) + arrow_cache_dir: str = field( + default=None, + metadata={"help": "Path to arrow cache directory"}, + ) + val_split_ratio: float = field( + default=0.0, + metadata={"help": "Ratio of validation split, default 0.0"}, + ) + reward_funcs: list[str] = field( + #default_factory=lambda: ["accuracy", "format"], + default_factory=lambda: ["xmlcount", "soft_format", "strict_format", "less_than_4", "correctness"], + #metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"}, + metadata={"help": "List of reward functions. Possible values: 'accuracy', 'xmlcount', 'soft_format', 'strict_format', 'less_than_4', 'correctness'"}, + ) + # max_pixels: Optional[int] = field( + # default=12845056, + # metadata={"help": "Maximum number of pixels for the image (for QwenVL)"}, + # ) + # min_pixels: Optional[int] = field( + # default=3136, + # metadata={"help": "Minimum number of pixels for the image (for QwenVL)"}, + # ) + # task_type: Optional[str] = field( + # default=None, + # metadata={"help": "Choose task type: 'default', 'gui', ..."}, + # ) + + + +reward_funcs_registry = { + # "accuracy": accuracy_reward, + # "format": format_reward, + "xmlcount": xmlcount_reward_func, + "soft_format": soft_format_reward_func, + "strict_format": strict_format_reward_func, + "less_than_4": less_than_4_reward_func, + "correctness": correctness_reward_func, +} + +def get_vlm_module(model_name_or_path): + if any(mini_name in model_name_or_path.lower() for mini_name in ["qwen", "smol"]): + return NucleotideDNAModule + else: + raise 
ValueError(f"Unsupported model: {model_name_or_path}") + +def _get_target_modules(model): + # Apply LoRA to all linear layers in the text model + target_modules = [] + + # Get all unique linear layer names + seen_names = set() + for name, module in model.text_model.named_modules(): + if isinstance(module, torch.nn.Linear): + names = name.split(".") + target_name = names[-1] # Use the last part of the name + + # Skip output head but include all other linear layers + if target_name != "lm_head" and target_name not in seen_names: + target_modules.append(target_name) + seen_names.add(target_name) + + # Add attention-specific layers + attention_patterns = [ + "q_proj", + "k_proj", + "v_proj", + "out_proj", + "query", + "key", + "value", + ] + for pattern in attention_patterns: + if pattern not in seen_names: + target_modules.append(pattern) + + # Return all unique layer names to apply LoRA to all layers + return list(target_modules) + + +def _prep_for_training(model, training_args, dna_model_finetune: bool = False) -> LoraConfig: + """ + Load and configure the DNALLMModel. 
def _strip_wrapper_prefix(key):
    """Remove a leading ``model.`` or ``_forward_module.`` from a state-dict key."""
    # The original check looked for "=model." (stray "="), which never matches
    # a real key; its 6-character slice shows that "model." was intended.
    for prefix in ("model.", "_forward_module."):
        if key.startswith(prefix):
            return key[len(prefix):]
    return key


def _extract_state_dict(checkpoint, checkpoint_path):
    """Return the flat key -> tensor mapping of *checkpoint* with wrapper prefixes stripped."""
    if "state_dict" in checkpoint:
        raw = checkpoint["state_dict"]
    elif "module" in checkpoint:
        raw = checkpoint["module"]
    elif isinstance(checkpoint, dict) and all(isinstance(k, str) for k in checkpoint):
        # Direct state dict - the checkpoint itself is the state dict
        print("Detected direct state dict format")
        raw = checkpoint
    else:
        raise ValueError(f"Unsupported checkpoint format: {checkpoint_path}")
    return {_strip_wrapper_prefix(k): v for k, v in raw.items()}


def _remap_lora_keys(state, model_keys):
    """Rename checkpoint keys that only differ from model keys by the PEFT ``base_model.model`` infix."""
    remapped = {}
    for key, value in state.items():
        target = key
        if key not in model_keys:
            candidates = []
            if "base_model.model" in key:
                candidates.append(key.replace("text_model.base_model.model", "text_model"))
            if key.startswith("text_model."):
                candidates.append("text_model.base_model.model." + key[len("text_model."):])
            # First candidate that exists in the model wins; otherwise keep the key.
            target = next((c for c in candidates if c in model_keys), key)
        remapped[target] = value
    return remapped


def _load_sft_checkpoint(model, model_args, dna_model_finetune):
    """Load ``model_args.sft_checkpoint`` into *model* and set up LoRA where the original flow did."""
    checkpoint_path = model_args.sft_checkpoint
    print(f"Loading SFT checkpoint from {checkpoint_path}")

    if os.path.isdir(checkpoint_path):
        # Directory => PEFT adapter checkpoint: load it, then merge into the base weights.
        from peft import PeftModel

        print("Loading as PEFT checkpoint directory")
        model.text_model = PeftModel.from_pretrained(
            model.text_model,
            checkpoint_path,
            is_trainable=True,
        )
        print("Loaded LoRA adapters:", model.text_model.active_adapter)
        print("Merging SFT LoRA weights into base model...")
        model.text_model = model.text_model.merge_and_unload()
        print("Successfully merged SFT knowledge into base model")
        return

    print("Loading as PyTorch state dict file")
    # Load on CPU so every rank does not materialise the checkpoint on the
    # device it was saved from; load_state_dict copies into the model's device.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    state = _extract_state_dict(checkpoint, checkpoint_path)

    if any("lora" in key for key in state):
        print("Detected LoRA weights in state dict")
        # LoRA modules must exist before their weights can be loaded.
        _prep_for_training(model, model_args, dna_model_finetune=dna_model_finetune)

        model_keys = set(model.state_dict().keys())
        print(f"Model has {len(model_keys)} keys")
        print(f"Checkpoint has {len(state)} keys")

        state = _remap_lora_keys(state, model_keys)
        print(f"After key mapping: {len(state)} keys")

        result = model.load_state_dict(state, strict=False)
        if len(result.unexpected_keys) > 0:
            print(f"Sample unexpected keys: {result.unexpected_keys[:5]}")
        if len(result.missing_keys) > 0:
            print(f"Sample missing keys: {result.missing_keys[:5]}")
        print(f"Loaded checkpoint with {len(result.missing_keys)} missing keys and {len(result.unexpected_keys)} unexpected keys")
        return

    print("Standard weights detected - remapping keys")
    # NOTE(review): keys are renamed to the PEFT-wrapped layout but loaded
    # before _prep_for_training wraps the text model - confirm they match.
    # lm_head.weight is cloned to avoid the shared-memory tensor issue.
    state = {
        k.replace("text_model", "text_model.base_model.model"): (v.clone() if "lm_head.weight" in k else v)
        for k, v in state.items()
    }
    result = model.load_state_dict(state, strict=False)
    print(f"Loaded checkpoint with {len(result.missing_keys)} missing keys and {len(result.unexpected_keys)} unexpected keys")

    # Now prepare for LoRA training
    _prep_for_training(model, model_args, dna_model_finetune=dna_model_finetune)


def main(script_args, training_args, model_args):
    """Build the DNA-LLM model, optionally restore an SFT checkpoint, and run GRPO training.

    Args:
        script_args: Parsed script arguments; ``reward_funcs`` is read.
        training_args: Trainer configuration passed to ``DNALLMGRPOTrainer``.
        model_args: Model configuration (model names, cache dir, max lengths,
            ``freeze_dna_modules``, ``sft_checkpoint``, LoRA settings, ...).

    Raises:
        ValueError: If a requested reward function is not registered, or the
            checkpoint file has an unsupported layout.
    """
    print(training_args.output_dir)
    torch.cuda.empty_cache()
    torch.set_float32_matmul_precision("medium")

    # One source of truth for the flag. _prep_for_training used to receive
    # `freeze_dna_modules` directly, i.e. the inverse of what DNALLMModel gets.
    dna_model_finetune = not model_args.freeze_dna_modules

    model = DNALLMModel(
        text_model_name=model_args.model_name_or_path,
        dna_model_name=model_args.dna_model_name_or_path,
        cache_dir=model_args.cache_dir,
        max_length_text=model_args.max_length_text,
        max_length_dna=model_args.max_length_dna,
        text_model_finetune=True,
        dna_model_finetune=dna_model_finetune,
        debug=False,
    )

    if model_args.sft_checkpoint is not None:
        _load_sft_checkpoint(model, model_args, dna_model_finetune)
    else:
        # No checkpoint, just prepare for training
        _prep_for_training(model, model_args, dna_model_finetune=dna_model_finetune)

    # Resolve reward functions, failing with a readable message on bad names.
    unknown = [name for name in script_args.reward_funcs if name not in reward_funcs_registry]
    if unknown:
        raise ValueError(
            f"Unknown reward function(s): {unknown}. Available: {sorted(reward_funcs_registry)}"
        )
    reward_funcs = [reward_funcs_registry[name] for name in script_args.reward_funcs]
    print("reward_funcs:", reward_funcs)

    vlm_module_cls = get_vlm_module(model_args.model_name_or_path)
    print("using vlm module:", vlm_module_cls.__name__)

    dataset = get_kegg_questions()
    print(dataset)

    # Save in PyTorch format instead of safetensors; set before the trainer is
    # built so the trainer never observes the old value.
    training_args.save_safetensors = False

    trainer = DNALLMGRPOTrainer(
        model=model,
        reward_funcs=reward_funcs,
        args=training_args,
        dna_module=vlm_module_cls(),
        train_dataset=dataset['train'],
        eval_dataset=dataset['val'] if training_args.eval_strategy != "no" else None,
        peft_config=get_peft_config(model_args),
        attn_implementation=model_args.attn_implementation,
        torch_dtype=model_args.torch_dtype,
        callbacks=[SaveWithPyTorchCallback()],  # custom PyTorch-native saving
    )

    trainer.train()
#!/bin/bash
#SBATCH --job-name=Qwen3_1.7B_SFT_RL  # Name of the job
#SBATCH --gres=gpu:4  # Number of GPUs
#SBATCH -p a100  # Partition
#SBATCH -c 12  # Number of cores
#SBATCH --time=12:00:00  # Time limit
#SBATCH --mem=128gb  # Memory limit
#SBATCH --output=Qwen3_1.7B_SFT_RL_a100-%j.out  # Output file
#SBATCH --error=Qwen3_1.7B_SFT_RL_a100-%j.err  # Error file

## Environment Setup
echo "CUDA_HOME: $CUDA_HOME"
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "which python: $(which python)"

## Configuration Variables
# Change these to match your setup
SFT_CHECKPOINT=SFT_CHECKPOINT  # Change to the checkpoint of the SFT model
CACHE_DIR=CACHE_DIR  # Change to the directory where the model weights are cached
OUTPUT_DIR=OUTPUT_DIR  # Change to the directory where the model will be saved
CONDA_ENV=CONDA_ENV  # Change to the conda environment

## Setup Environment
# `conda activate` relies on the conda shell hook, which a batch shell has not
# necessarily loaded; source it explicitly before activating.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "$CONDA_ENV"
# Abort instead of launching the job from the wrong directory.
cd .../BioReason/ || { echo "Could not enter the BioReason directory" >&2; exit 1; }  # Change to the directory containing the script
nvidia-smi  # Check GPU status

## Dependencies
# You might need to install this on a gpu session
# pip install 'trl[vllm]'

## =============================================================================
## Reinforcement Learning Training with DeepSpeed
## =============================================================================

# Run with DeepSpeed ZeRO Stage 2 (paths quoted so spaces do not split them)
srun deepspeed --num_gpus=4 --num_nodes=1 \
    reason.py \
    --deepspeed grpo_trainer_lora_model/ds_config_stage2.json \
    --num_generations 4 \
    --per_device_train_batch_size 2 \
    --bf16 true \
    --ddp_find_unused_parameters false \
    --sft_checkpoint "$SFT_CHECKPOINT" \
    --model_name_or_path Qwen/Qwen3-1.7B \
    --dna_model_name_or_path InstaDeepAI/nucleotide-transformer-v2-500m-multi-species \
    --cache_dir "$CACHE_DIR" \
    --output_dir "$OUTPUT_DIR" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 2 \
    --use_vllm true \
    --temperature 0.6 \
    --top_p 0.95 \
    --top_k 20 \
    --num_train_epochs 1
+CONDA_ENV=CONDA_ENV # Change to your conda environment name +CACHE_DIR=CACHE_DIR # Change to your HuggingFace cache directory +WANDB_PROJECT=WANDB_PROJECT # Change to your W&B project name + +## Setup Environment +conda activate $CONDA_ENV # Change to your conda environment +cd .../BioReason/ # Change to the directory containing the script +nvidia-smi # Check GPU status + + +## ============================================================================= +## KEGG Dataset Training (DNA-only models) +## ============================================================================= + +# NT-500M on KEGG +stdbuf -oL -eL srun python train_dna_only.py \ + --cache_dir $CACHE_DIR \ + --wandb_project $WANDB_PROJECT \ + --dna_model_name InstaDeepAI/nucleotide-transformer-v2-500m-multi-species \ + --strategy ddp \ + --max_epochs 5 \ + --num_gpus 1 \ + --batch_size 1 \ + --max_length_dna 2048 \ + --truncate_dna_per_side 1024 \ + --train_just_classifier True \ + --learning_rate 3e-4 \ + --dataset_type kegg \ + --merge_val_test_set True + +# EVO2-1B on KEGG +stdbuf -oL -eL srun python train_dna_only.py \ + --cache_dir $CACHE_DIR \ + --wandb_project $WANDB_PROJECT \ + --dna_model_name evo2_1b_base \ + --strategy ddp \ + --max_epochs 5 \ + --num_gpus 1 \ + --batch_size 1 \ + --max_length_dna 2048 \ + --truncate_dna_per_side 1024 \ + --train_just_classifier True \ + --dna_is_evo2 True \ + --dna_embedding_layer blocks.20.mlp.l3 \ + --learning_rate 3e-4 \ + --dataset_type kegg \ + --merge_val_test_set True + +## ============================================================================= +## Variant Effect Prediction (VEP) Training +## ============================================================================= + +# NT-500M on VEP +stdbuf -oL -eL srun python train_dna_only.py \ + --cache_dir $CACHE_DIR \ + --wandb_project $WANDB_PROJECT \ + --dna_model_name InstaDeepAI/nucleotide-transformer-v2-500m-multi-species \ + --strategy ddp \ + --max_epochs 3 \ + --num_gpus 1 \ + 
#!/bin/bash
#SBATCH --job-name=train_dna_qwen  # Name of the job
#SBATCH --time=12:00:00  # Time limit
#SBATCH --partition=gpu_batch  # Partition
#SBATCH --gpus=1  # Number of GPUs
#SBATCH --ntasks=1  # Number of tasks
#SBATCH --cpus-per-task=8  # Number of cores
#SBATCH --mem=128gb  # Memory limit
#SBATCH --output=train_dna_qwen_%j_%x.out  # Output file
#SBATCH --error=train_dna_qwen_%j_%x.err  # Error file

## Environment Setup
echo "CUDA_HOME: $CUDA_HOME"
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "which python: $(which python)"

## Configuration Variables
# Change these to match your setup
CONDA_ENV=CONDA_ENV  # Change to your conda environment name
CACHE_DIR=CACHE_DIR  # Change to your HuggingFace cache directory
OUTPUT_DIR=OUTPUT_DIR  # Change to your output/log directory
WANDB_PROJECT=WANDB_PROJECT  # Change to your W&B project name

## Setup Environment
# `conda activate` relies on the conda shell hook; source it explicitly.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "$CONDA_ENV"
# Abort instead of running every job from the wrong directory.
cd .../BioReason/ || { echo "Could not enter the BioReason directory" >&2; exit 1; }  # Change to the directory containing the script
nvidia-smi  # Check GPU status

## Shared settings
NT_500M=InstaDeepAI/nucleotide-transformer-v2-500m-multi-species
EVO2_1B=evo2_1b_base

# Flags passed to every run.
COMMON_ARGS=(
    --cache_dir "$CACHE_DIR"
    --wandb_project "$WANDB_PROJECT"
    --strategy deepspeed_stage_2
    --num_gpus 1
    --return_answer_in_batch True
)

# Extra flags used whenever the DNA encoder is Evo2.
EVO2_ARGS=(
    --dna_model_name "$EVO2_1B"
    --max_length_dna 2048
    --truncate_dna_per_side 1024
    --dna_is_evo2 True
    --dna_embedding_layer blocks.20.mlp.l3
)

# Run one training job: shared flags first, then the per-run flags.
run_train() {
    stdbuf -oL -eL srun python train_dna_qwen.py "${COMMON_ARGS[@]}" "$@"
}

# Flags for an LLM-only run; $1 is the maximum text length.
llm_only_args() {
    echo --dna_model_name "$NT_500M" --model_type llm \
        --max_length_dna 4 --max_length_text "$1" --truncate_dna_per_side 1024
}

## =============================================================================
## KEGG Dataset Training
## =============================================================================

# NT-500M + Qwen3-1.7B on KEGG
run_train --text_model_name Qwen/Qwen3-1.7B --dna_model_name "$NT_500M" \
    --max_epochs 5 --batch_size 1 --model_type dna-llm \
    --dataset_type kegg --merge_val_test_set True

# EVO2-1B + Qwen3-1.7B on KEGG
run_train --text_model_name Qwen/Qwen3-1.7B "${EVO2_ARGS[@]}" \
    --max_epochs 5 --batch_size 1 --model_type dna-llm \
    --dataset_type kegg --merge_val_test_set True

# Qwen3-4B on KEGG (LLM-only)
run_train --text_model_name Qwen/Qwen3-4B $(llm_only_args 8192) \
    --max_epochs 5 --batch_size 1 \
    --dataset_type kegg --merge_val_test_set True

## =============================================================================
## Variant Effect Prediction (VEP) Training
## =============================================================================

# NT-500M + Qwen3-4B on VEP
run_train --text_model_name Qwen/Qwen3-4B --dna_model_name "$NT_500M" \
    --max_epochs 3 --batch_size 2 --model_type dna-llm \
    --dataset_type variant_effect_coding

# EVO2-1B + Qwen3-1.7B on VEP
run_train --text_model_name Qwen/Qwen3-1.7B "${EVO2_ARGS[@]}" \
    --max_epochs 3 --batch_size 2 --model_type dna-llm \
    --dataset_type variant_effect_coding

# Qwen3-4B on VEP (LLM-only) - Testing max length text
run_train --text_model_name Qwen/Qwen3-4B $(llm_only_args 4096) \
    --max_epochs 3 --batch_size 2 \
    --dataset_type variant_effect_coding

## =============================================================================
## Variant Effect Prediction Non-SNV Training
## =============================================================================

# NT-500M + Qwen3-4B on VEP Non-SNV
run_train --text_model_name Qwen/Qwen3-4B --dna_model_name "$NT_500M" \
    --max_epochs 1 --batch_size 2 --model_type dna-llm \
    --dataset_type variant_effect_non_snv

# EVO2-1B + Qwen3-4B on VEP Non-SNV
run_train --text_model_name Qwen/Qwen3-4B "${EVO2_ARGS[@]}" \
    --max_epochs 3 --batch_size 2 --model_type dna-llm \
    --dataset_type variant_effect_non_snv

# Qwen3-4B on VEP Non-SNV (LLM-only) - Testing max length text
run_train --text_model_name Qwen/Qwen3-4B $(llm_only_args 4096) \
    --max_epochs 1 --batch_size 2 \
    --dataset_type variant_effect_non_snv
+ + Args: + args: Command line arguments + """ + super().__init__() + self.save_hyperparameters(args) + + # Load dataset and labels + self.dataset, self.labels = self.load_dataset() + self.label2id = {label: i for i, label in enumerate(self.labels)} + + # Load model + self.dna_model = DNAClassifierModel( + dna_model_name=self.hparams.dna_model_name, + cache_dir=self.hparams.cache_dir, + max_length_dna=self.hparams.max_length_dna, + num_classes=len(self.labels), + dna_is_evo2=self.hparams.dna_is_evo2, + dna_embedding_layer=self.hparams.dna_embedding_layer, + train_just_classifier=self.hparams.train_just_classifier, + ) + self.dna_tokenizer = self.dna_model.dna_tokenizer + + # Set the training mode for the classifier and pooler + self.dna_model.pooler.train() + self.dna_model.classifier.train() + + # Freeze the DNA model parameters + if self.hparams.dna_is_evo2: + self.dna_model_params = self.dna_model.dna_model.model.parameters() + else: + self.dna_model_params = self.dna_model.dna_model.parameters() + + if self.hparams.train_just_classifier: + for param in self.dna_model_params: + param.requires_grad = False + + def _step(self, prefix, batch_idx, batch): + """ + Performs a single training/validation step. 
+ + Args: + batch: Dictionary containing the batch data + prefix: String indicating the step type ('train' or 'val') + + Returns: + torch.Tensor: The computed loss for this batch + """ + ref_ids = batch["ref_ids"].to(self.device) + alt_ids = batch["alt_ids"].to(self.device) + ref_attention_mask = batch["ref_attention_mask"].to(self.device) + alt_attention_mask = batch["alt_attention_mask"].to(self.device) + labels = batch["labels"].to(self.device) + + # Forward pass + logits = self.dna_model(ref_ids=ref_ids, alt_ids=alt_ids, ref_attention_mask=ref_attention_mask, alt_attention_mask=alt_attention_mask) + + # Calculate loss + loss_fn = torch.nn.CrossEntropyLoss() + loss = loss_fn(logits, labels) + + # Calculate accuracy + preds = torch.argmax(logits, dim=1) + acc = (preds == labels).float().mean() + + # Calculate F1 score, precision, and recall for binary classification + # Assuming label 1 is positive and label 0 is negative as mentioned + true_positives = ((preds == 1) & (labels == 1)).float().sum() + false_positives = ((preds == 1) & (labels == 0)).float().sum() + false_negatives = ((preds == 0) & (labels == 1)).float().sum() + + # Calculate precision, recall, and F1 score + precision = true_positives / (true_positives + false_positives + 1e-8) # add small epsilon to avoid division by zero + recall = true_positives / (true_positives + false_negatives + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + + # Logging metrics + self.log( + f"{prefix}_loss", + loss, + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_acc", + acc, + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_loss_epoch", + loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + self.log( + f"{prefix}_acc_epoch", + acc, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + self.log( + f"{prefix}_precision", + precision, + 
on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_precision_epoch", + precision, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + self.log( + f"{prefix}_recall", + recall, + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_recall_epoch", + recall, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + self.log( + f"{prefix}_f1", + f1, + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_f1_epoch", + f1, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + + if (prefix == "test") or (prefix == "train" and (self.global_step % 1000 == 0)) or (prefix == "val" and (batch_idx % 100 == 0)): + wandb_logger = self.logger.experiment + + pred_label = self.labels[preds[0]] + true_label = self.labels[labels[0]] + timestamp = time.time() + step_id = f"gen_{self.global_step}-{timestamp}" + + wandb_logger.log( + { + step_id: wandb.Table( + columns=["timestamp", "prefix", "pred_label", "true_label"], + data=[[timestamp, prefix, pred_label, true_label]], + ) + } + ) + + print(f"Example {prefix} {batch_idx} {self.global_step}: Prediction: {pred_label}, Target: {true_label}") + + return loss + + def training_step(self, batch, batch_idx): + """Perform a training step.""" + return self._step(prefix="train", batch_idx=batch_idx, batch=batch) + + def validation_step(self, batch, batch_idx): + """Perform a validation step.""" + return self._step(prefix="val", batch_idx=batch_idx, batch=batch) + + def test_step(self, batch, batch_idx): + """Perform a test step.""" + return self._step(prefix="test", batch_idx=batch_idx, batch=batch) + + def configure_optimizers(self): + """Configure optimizers and learning rate schedulers.""" + # Only include parameters that require gradients + classifier_params = [ + { + "params": self.dna_model.classifier.parameters(), 
+ "lr": self.hparams.learning_rate, + }, + { + "params": self.dna_model.pooler.parameters(), + "lr": self.hparams.learning_rate, + } + ] + dna_model_params = [ + { + "params": self.dna_model_params, + "lr": self.hparams.learning_rate * 0.1, + }, + ] + + if self.hparams.train_just_classifier: + # Only train classifier parameters + optimizer = AdamW( + classifier_params, + weight_decay=self.hparams.weight_decay, + ) + else: + # Train both DNA model and classifier with different learning rates + optimizer = AdamW( + classifier_params + dna_model_params, + weight_decay=self.hparams.weight_decay, + ) + + # Get total steps from trainer's estimated stepping batches + total_steps = self.trainer.estimated_stepping_batches + warmup_steps = int(0.1 * total_steps) + + # Create scheduler + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=total_steps, + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def load_dataset(self): + """Load the dataset based on the dataset type.""" + if self.hparams.dataset_type == "kegg": + dataset = load_dataset(self.hparams.kegg_data_dir_huggingface) + + if self.hparams.truncate_dna_per_side: + dataset = dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + labels = [] + for split, data in dataset.items(): + labels.extend(data["answer"]) + labels = list(set(labels)) + + elif self.hparams.dataset_type == "variant_effect_coding": + dataset = load_dataset("wanglab/bioR_tasks", "variant_effect_coding") + dataset = dataset.map(clean_variant_effect_example) + + if self.hparams.truncate_dna_per_side: + dataset = dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + labels = [] + for split, data in dataset.items(): + labels.extend(data["answer"]) + labels = sorted(list(set(labels))) + + elif self.hparams.dataset_type == "variant_effect_non_snv": + dataset = 
load_dataset("wanglab/bioR_tasks", "task5_variant_effect_non_snv") + dataset = dataset.rename_column("mutated_sequence", "variant_sequence") + dataset = dataset.map(clean_variant_effect_example) + + if self.hparams.truncate_dna_per_side: + dataset = dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + labels = [] + for split, data in dataset.items(): + labels.extend(data["answer"]) + labels = sorted(list(set(labels))) + + else: + raise ValueError(f"Invalid dataset type: {self.hparams.dataset_type}") + + print(f"Dataset:\n{dataset}\nLabels:\n{labels}\nNumber of labels:{len(labels)}") + return dataset, labels + + def train_dataloader(self): + """Create and return the training DataLoader.""" + if self.hparams.dataset_type == "kegg": + train_dataset = self.dataset["train"] + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + elif self.hparams.dataset_type == "variant_effect_coding": + train_dataset = self.dataset["train"] + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + elif self.hparams.dataset_type == "variant_effect_non_snv": + train_dataset = self.dataset["train"] + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + else: + raise ValueError(f"Invalid dataset type: {self.hparams.dataset_type}") + + return DataLoader( + train_dataset, + batch_size=self.hparams.batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=self.hparams.num_workers, + persistent_workers=True, + ) + + def val_dataloader(self): + """Create and return the training DataLoader.""" + if self.hparams.dataset_type == "kegg": + + if self.hparams.merge_val_test_set: + val_dataset = concatenate_datasets([self.dataset['test'], self.dataset['val']]) + else: + 
val_dataset = self.dataset["val"] + + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + elif self.hparams.dataset_type == "variant_effect_coding": + val_dataset = self.dataset["test"] + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + elif self.hparams.dataset_type == "variant_effect_non_snv": + val_dataset = self.dataset["test"] + collate_fn = lambda b: dna_collate_fn(b, dna_tokenizer=self.dna_tokenizer, label2id=self.label2id, max_length=self.hparams.max_length_dna) + + else: + raise ValueError(f"Invalid dataset type: {self.hparams.dataset_type}") + + return DataLoader( + val_dataset, + batch_size=self.hparams.batch_size, + shuffle=False, + collate_fn=collate_fn, + num_workers=self.hparams.num_workers, + persistent_workers=True, + ) + + def test_dataloader(self): + """Create and return the test DataLoader.""" + return self.val_dataloader() + + +def main(args): + """Main function to run the training process.""" + # Set random seed and environment variables + pl.seed_everything(args.seed) + torch.cuda.empty_cache() + torch.set_float32_matmul_precision("medium") + + # Initialize model + model = DNAClassifierModelTrainer(args) + + # Setup directories + run_name = f"{args.wandb_project}-{args.dataset_type}-{args.dna_model_name.split('/')[-1]}" + args.checkpoint_dir = f"{args.checkpoint_dir}/{run_name}-{time.strftime('%Y%m%d-%H%M%S')}" + args.output_dir = f"{args.output_dir}/{run_name}-{time.strftime('%Y%m%d-%H%M%S')}" + os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(args.checkpoint_dir, exist_ok=True) + + # Setup callbacks + callbacks = [ + ModelCheckpoint( + dirpath=args.checkpoint_dir, + filename=f"{run_name}-" + "{epoch:02d}-{val_loss_epoch:.4f}", + save_top_k=2, + monitor="val_acc_epoch", + mode="max", + save_last=True, + ), + 
LearningRateMonitor(logging_interval="step"), + ] + + # Setup logger + is_resuming = args.ckpt_path is not None + logger = WandbLogger( + project=args.wandb_project, + entity=args.wandb_entity, + save_dir=args.log_dir, + name=run_name, + resume="allow" if is_resuming else None, # Allow resuming existing run + ) + + # Initialize trainer + trainer = pl.Trainer( + max_epochs=args.max_epochs, + accelerator="gpu", + devices=args.num_gpus, + strategy=( + "ddp" + if args.strategy == "ddp" + else DeepSpeedStrategy(stage=2, offload_optimizer=False, allgather_bucket_size=5e8, reduce_bucket_size=5e8) + ), + precision="bf16-mixed", + callbacks=callbacks, + logger=logger, + deterministic=False, + enable_checkpointing=True, + enable_progress_bar=True, + enable_model_summary=True, + log_every_n_steps=5, + accumulate_grad_batches=args.gradient_accumulation_steps, + gradient_clip_val=1.0, + val_check_interval=1 / 3, + ) + + # Train model + trainer.fit(model, ckpt_path=args.ckpt_path) + trainer.test(model, ckpt_path=args.ckpt_path if args.ckpt_path else "best") + + # Save final model + final_model_path = os.path.join(args.output_dir, "final_model") + torch.save(model.dna_model.state_dict(), final_model_path) + print(f"Final model saved to {final_model_path}") + + +if __name__ == "__main__": + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + parser = argparse.ArgumentParser(description="Train DNA Classifier") + + # Model parameters + parser.add_argument( + "--dna_model_name", + type=str, + default="InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", + ) + parser.add_argument("--cache_dir", type=str, default="/model-weights") + parser.add_argument("--max_length_dna", type=int, default=1024) + parser.add_argument("--dna_is_evo2", type=bool, default=False) + parser.add_argument("--dna_embedding_layer", type=str, default=None) + + # Training parameters + parser.add_argument("--strategy", type=str, default="ddp") + parser.add_argument("--batch_size", type=int, default=8) + 
parser.add_argument("--learning_rate", type=float, default=5e-5) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--max_epochs", type=int, default=5) + parser.add_argument("--max_steps", type=int, default=-1) + parser.add_argument("--gradient_accumulation_steps", type=int, default=8) + parser.add_argument("--num_workers", type=int, default=4) + parser.add_argument("--num_gpus", type=int, default=1) + parser.add_argument("--train_just_classifier", type=bool, default=True) + parser.add_argument("--dataset_type", type=str, choices=["kegg", "variant_effect_coding", "variant_effect_non_snv"], default="kegg") + parser.add_argument("--kegg_data_dir_huggingface", type=str, default="wanglab/kegg") + parser.add_argument("--truncate_dna_per_side", type=int, default=0) + + # Output parameters + parser.add_argument("--output_dir", type=str, default="dna_classifier_output") + parser.add_argument( + "--checkpoint_dir", type=str, default="checkpoints" + ) + parser.add_argument("--ckpt_path", type=str, default=None) + parser.add_argument("--log_dir", type=str, default="logs") + parser.add_argument("--wandb_project", type=str, default="dna-only-nt-500m") + parser.add_argument("--wandb_entity", type=str, default="adibvafa") + parser.add_argument("--merge_val_test_set", type=bool, default=True) + + # Other parameters + parser.add_argument("--seed", type=int, default=23) + + args = parser.parse_args() + main(args) diff --git a/BioReason-main/train_dna_qwen.py b/BioReason-main/train_dna_qwen.py new file mode 100644 index 0000000000000000000000000000000000000000..2e5623f31d1f31e5ce34e5e7c2ea3d13c2f88856 --- /dev/null +++ b/BioReason-main/train_dna_qwen.py @@ -0,0 +1,1064 @@ +import csv +import gc +import io +import multiprocessing +import os +import time +import traceback +from argparse import ArgumentParser +from functools import partial +from typing import * + +import pandas as pd +import torch +import wandb +from datasets import DatasetDict, 
concatenate_datasets, load_dataset +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from torch.optim import AdamW +from torch.utils.data import DataLoader +from transformers import AutoTokenizer, get_cosine_schedule_with_warmup +from transformers.tokenization_utils_base import BatchEncoding + +import pytorch_lightning as pl +from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.strategies import DeepSpeedStrategy + +from bioreason.dataset.kegg import get_format_kegg_function, qwen_dna_collate_fn +from bioreason.dataset.utils import truncate_dna +from bioreason.dataset.variant_effect import ( + clean_variant_effect_example, + clean_variant_effect_non_snv_example, + get_format_variant_effect_function, +) +from bioreason.models.dl.processing_dl import DLProcessor +from bioreason.models.dna_llm import DNALLMModel +from bioreason.models.evo2_tokenizer import register_evo2_tokenizer + +register_evo2_tokenizer() + +# Set start method to 'spawn' for CUDA compatibility with multiprocessing +torch.multiprocessing.set_sharing_strategy("file_system") +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +class DNALLMFineTuner(pl.LightningModule): + """ + PyTorch Lightning module for fine-tuning DNA-LLM models. + """ + + def __init__(self, hparams): + """ + Initialize the DNALLMFineTuner. 
+ + Args: + hparams: Hyperparameters for the model and training + """ + super().__init__() + self.save_hyperparameters(hparams) + + self.text_model_name = self.hparams.text_model_name + self.dna_model_name = self.hparams.dna_model_name + self.cache_dir = self.hparams.cache_dir + self.learning_rate = self.hparams.learning_rate + self.weight_decay = self.hparams.weight_decay + self.text_model_finetune = self.hparams.text_model_finetune + self.dna_model_finetune = self.hparams.dna_model_finetune + self.lora_rank = self.hparams.lora_rank + self.lora_alpha = self.hparams.lora_alpha + self.lora_dropout = self.hparams.lora_dropout + self.max_length_dna = self.hparams.max_length_dna + self.max_length_text = self.hparams.max_length_text + self.dna_is_evo2 = self.hparams.dna_is_evo2 + self.dna_embedding_layer = self.hparams.dna_embedding_layer + self.return_answer_in_batch = self.hparams.return_answer_in_batch + self.merge_val_test_set = self.hparams.merge_val_test_set + + # Store dataset configuration + self.dataset_type = self.hparams.dataset_type + + # Load model + self.model = DNALLMModel( + text_model_name=self.text_model_name, + dna_model_name=self.dna_model_name, + cache_dir=self.cache_dir, + max_length_dna=self.max_length_dna, + max_length_text=self.max_length_text, + text_model_finetune=self.text_model_finetune, + dna_model_finetune=self.dna_model_finetune, + dna_is_evo2=self.dna_is_evo2, + dna_embedding_layer=self.dna_embedding_layer, + ) + + self.text_model = self.model.text_model + self.dna_model = self.model.dna_model + self.dna_projection = self.model.dna_projection + + # Load tokenizer for target text + self.tokenizer = self.model.text_tokenizer + + # Prepare model for training + self.lora_config = self._prep_for_training() + + def _get_target_modules(self): + # Apply LoRA to all linear layers in the text model + target_modules = [] + + # Get all unique linear layer names + seen_names = set() + for name, module in self.text_model.named_modules(): + if 
isinstance(module, torch.nn.Linear): + names = name.split(".") + target_name = names[-1] # Use the last part of the name + + # Skip output head but include all other linear layers + if target_name != "lm_head" and target_name not in seen_names: + target_modules.append(target_name) + seen_names.add(target_name) + + # Add attention-specific layers + attention_patterns = [ + "q_proj", + "k_proj", + "v_proj", + "out_proj", + "query", + "key", + "value", + ] + for pattern in attention_patterns: + if pattern not in seen_names: + target_modules.append(pattern) + + # Return all unique layer names to apply LoRA to all layers + return list(target_modules) + + def _prep_for_training(self) -> LoraConfig: + """ + Load and configure the DNALLMModel. + """ + + # Freeze DNA encoder parameters + if self.dna_model_finetune: + pass + else: + if self.dna_is_evo2: + for param in self.dna_model.model.parameters(): + param.requires_grad = False + else: + for param in self.dna_model.parameters(): + param.requires_grad = False + + if self.text_model_finetune: + target_modules = self._get_target_modules() + + lora_config = LoraConfig( + r=self.lora_rank, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + target_modules=target_modules, + init_lora_weights="gaussian", + bias="none", + task_type="CAUSAL_LM", + ) + + # Prepare text model for training + self.text_model = prepare_model_for_kbit_training(self.text_model) + self.text_model = get_peft_model(self.text_model, lora_config) + else: + # Freeze text model parameters + for param in self.text_model.parameters(): + param.requires_grad = False + + # Make projection layer trainable + for param in self.dna_projection.parameters(): + param.requires_grad = True + + return lora_config + + def _step(self, batch: Dict, batch_idx: int, prefix: str) -> torch.Tensor: + """ + Performs a single step for training, validation, or testing. 
+ + Args: + batch: Dictionary containing the batch data + batch_idx: Integer indicating the batch index + prefix: String indicating the step type ('train', 'val', or 'test') + + Returns: + torch.Tensor: The computed loss for this batch + """ + if prefix == "test": + return {"loss": torch.tensor(0.0, device=self.device)} + + # Get batch data from the collate function + input_ids = batch["input_ids"].to(self.device) + attention_mask = batch["attention_mask"].to(self.device) + labels = batch["labels"].to(self.device) if "labels" in batch else None + dna_tokenized = batch.get("dna_tokenized") + if dna_tokenized is not None: + dna_tokenized = dna_tokenized.to(self.device) + batch_idx_map = batch.get("batch_idx_map") + + # Forward pass through the model + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + dna_tokenized=dna_tokenized, + batch_idx_map=batch_idx_map, + labels=labels, + ) + + # Get the loss from model outputs + loss = outputs.loss + + # Occasionally show generations for debugging purposes - ONLY during training/validation + # You can reduce the frequency of generations by increasing the step size to make the model train faster + if (prefix == "train" and (self.global_step % 3000 == 0)) or (prefix == "val" and (batch_idx % 300 == 0)): + try: + # Select first example from batch for demonstration + example_idx = 0 + + print( + f"\n=== Sample Generation (step {self.global_step} / {self.trainer.estimated_stepping_batches}) ===" + ) + + # Get the tokens that define the assistant pattern + assistant_start_marker = "<|im_start|>assistant\n" + assistant_marker_tokens = self.tokenizer.encode(assistant_start_marker, add_special_tokens=False) + marker_tensor = torch.tensor(assistant_marker_tokens, device=input_ids.device) + marker_len = len(assistant_marker_tokens) + + # Find non-padding tokens in input + non_pad = (input_ids[example_idx] != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if len(non_pad) > 0: + start_idx = 
non_pad[0].item() # First non-padding token + else: + start_idx = 0 + + # For each position, check if the next marker_len tokens match the pattern + matches = [] + for pos in range(start_idx, input_ids.size(1) - marker_len + 1): + if torch.all(input_ids[example_idx, pos : pos + marker_len] == marker_tensor): + matches.append(pos) + break # Stop at first match + + assistant_pos = matches[0] if matches else None + + if assistant_pos is not None: + # Get input up to and including the assistant marker + gen_input_ids = input_ids[ + example_idx : example_idx + 1, start_idx : assistant_pos + marker_len + ] + gen_attention_mask = attention_mask[ + example_idx : example_idx + 1, start_idx : assistant_pos + marker_len + ] + + # Extract DNA data for this example + example_dna_data = None + example_batch_map = None + + if dna_tokenized is not None and batch_idx_map is not None: + # Find DNA sequences for this example + example_indices = [i for i, idx in enumerate(batch_idx_map) if idx == example_idx] + + if len(example_indices) > 0: + # Extract just this example's DNA data + example_dna_data = BatchEncoding( + { + "input_ids": dna_tokenized.input_ids[example_indices].to(self.device), + "attention_mask": dna_tokenized.attention_mask[example_indices].to(self.device), + } + ) + + # For generation we need all sequences mapped to index 0 + example_batch_map = [0] * len(example_indices) + + # Generate text + with torch.no_grad(): + generated = self.model.generate( + input_ids=gen_input_ids, + attention_mask=gen_attention_mask, + dna_tokenized=example_dna_data, + batch_idx_map=example_batch_map, + max_new_tokens=800, + temperature=0.6, + top_p=0.95, + top_k=20, + do_sample=True, + ) + + # Decode and display + user_input = self.tokenizer.decode(gen_input_ids[0], skip_special_tokens=False).strip() + generation = self.tokenizer.decode(generated[0], skip_special_tokens=False).strip() + + # Free memory early + del generated, gen_input_ids, gen_attention_mask, example_dna_data, 
example_batch_map + gc.collect() + + print(f"=====[Sample {prefix} {batch_idx}]=====") + print(f"=====[User input]=====\n{user_input}") + print(f"=====[Complete generation]=====\n{generation}") + + # Get ground truth if available + ground_truth = "" + if labels is not None: + # Find all positions where we have valid labels (not -100) + valid_label_pos = (labels[example_idx] != -100).nonzero(as_tuple=True)[0] + + if len(valid_label_pos) > 0: + # Check if valid labels start after assistant marker + if valid_label_pos[0] >= assistant_pos + marker_len: + ground_truth = self.tokenizer.decode( + input_ids[example_idx, valid_label_pos], skip_special_tokens=False + ).strip() + print(f"=====[Ground truth]=====\n{ground_truth}") + + # Log to wandb + timestamp = time.time() + step_id = f"gen_{self.global_step}-{timestamp}" + wandb_logger = self.logger.experiment + wandb_logger.log( + { + step_id: wandb.Table( + columns=["timestamp", "prefix", "batch_idx", "user_input", "generation", "ground_truth"], + data=[[timestamp, prefix, batch_idx, user_input, generation, ground_truth]], + ) + } + ) + + # Clean up memory + del user_input, generation, ground_truth + torch.cuda.empty_cache() + gc.collect() + + else: + print("No assistant marker found in the input sequence") + + except Exception as e: + print(f"Error during sample generation: {str(e)}") + traceback.print_exc() + + # Get current learning rate (skip during test as scheduler might not be available) + if prefix != "test": + current_lr = self.lr_schedulers().get_last_lr()[0] + else: + current_lr = 0 + + # Logging metrics + self.log( + f"{prefix}_loss", + loss, + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + self.log( + f"{prefix}_loss_epoch", + loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + + # Only log learning rate during training/validation + if prefix != "test": + self.log( + "lr", + current_lr, + on_step=True, + on_epoch=True, + prog_bar=True, + 
logger=True, + sync_dist=True, + ) + + return loss + + def training_step(self, batch: Dict, batch_idx: int) -> torch.Tensor: + """Perform a single training step.""" + return self._step(batch, batch_idx, prefix="train") + + def validation_step(self, batch: Dict, batch_idx: int) -> torch.Tensor: + """Perform a single validation step.""" + return self._step(batch, batch_idx, prefix="val") + + def test_step(self, batch: Dict, batch_idx: int) -> torch.Tensor: + """Perform a single test step.""" + return self._step(batch, batch_idx, prefix="test") + + def configure_optimizers(self): + """ + Configure optimizers and learning rate schedulers. + + Returns: + Tuple[List, List]: A tuple containing a list of optimizers and schedulers + """ + optimizer = AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) + + total_steps = self.trainer.estimated_stepping_batches + warmup_steps = int(0.1 * total_steps) + + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=total_steps, + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def train_dataloader(self) -> DataLoader: + """Create and return the training DataLoader.""" + # Load dataset based on type specified in hyperparameters + + if self.hparams.dataset_type == "kegg": + # Use Hugging Face dataset if provided + dataset = load_dataset(self.hparams.kegg_data_dir_huggingface) + dataset = dataset.map(get_format_kegg_function(self.hparams.model_type)) + + labels = [] + for split, data in dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + train_dataset = dataset["train"] + + if self.hparams.truncate_dna_per_side: + train_dataset = train_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function 
with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + + elif self.hparams.dataset_type == "variant_effect_coding": + dataset = load_dataset(self.hparams.variant_effect_coding_data_dir_huggingface) + cleaned_dataset = dataset.map(clean_variant_effect_example) + dataset = dataset.map(get_format_variant_effect_function(self.hparams.model_type)) + + labels = [] + for split, data in cleaned_dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + train_dataset = dataset["train"] + + if self.hparams.truncate_dna_per_side: + train_dataset = train_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + elif self.hparams.dataset_type == "variant_effect_non_snv": + dataset = load_dataset(self.hparams.variant_effect_non_snv_data_dir_huggingface) + dataset = dataset.map(clean_variant_effect_non_snv_example) + cleaned_dataset = dataset.map(clean_variant_effect_example) + dataset = dataset.rename_column("mutated_sequence", "variant_sequence") + + labels = [] + for split, data in cleaned_dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + train_dataset = dataset["train"] + + if self.hparams.truncate_dna_per_side: + train_dataset = train_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + train_dataset = 
train_dataset.map(get_format_variant_effect_function(self.hparams.model_type)) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + else: + raise ValueError(f"Unknown dataset type: {self.hparams.dataset_type}") + + return DataLoader( + train_dataset, + batch_size=self.hparams.batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=self.hparams.num_workers, + persistent_workers=False, + pin_memory=False, + ) + + def val_dataloader(self) -> DataLoader: + """Create and return the validation DataLoader.""" + + if self.hparams.dataset_type == "kegg": + # Use Hugging Face dataset + dataset = load_dataset(self.hparams.kegg_data_dir_huggingface) + dataset = dataset.map(get_format_kegg_function(self.hparams.model_type)) + + if self.hparams.merge_val_test_set: + val_dataset = concatenate_datasets([dataset['test'], dataset['val']]) + else: + val_dataset = dataset["val"] + + labels = [] + for split, data in dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + if self.hparams.truncate_dna_per_side: + val_dataset = val_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + elif self.hparams.dataset_type == "variant_effect_coding": + dataset = 
load_dataset(self.hparams.variant_effect_coding_data_dir_huggingface) + cleaned_dataset = dataset.map(clean_variant_effect_example) + dataset = dataset.map(get_format_variant_effect_function(self.hparams.model_type)) + + labels = [] + for split, data in cleaned_dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + val_dataset = dataset["test"] + + if self.hparams.truncate_dna_per_side: + val_dataset = val_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + elif self.hparams.dataset_type == "variant_effect_non_snv": + dataset = load_dataset(self.hparams.variant_effect_non_snv_data_dir_huggingface) + cleaned_dataset = dataset.map(clean_variant_effect_example) + dataset = dataset.map(clean_variant_effect_non_snv_example) + + labels = [] + for split, data in cleaned_dataset.items(): + labels.extend(data["answer"]) + self.labels = sorted(list(set(labels))) + + dataset = dataset.rename_column("mutated_sequence", "variant_sequence") + val_dataset = dataset["test"] + + if self.hparams.truncate_dna_per_side: + val_dataset = val_dataset.map( + truncate_dna, fn_kwargs={"truncate_dna_per_side": self.hparams.truncate_dna_per_side} + ) + val_dataset = val_dataset.map(get_format_variant_effect_function(self.hparams.model_type)) + + processor = DLProcessor( + tokenizer=self.model.text_tokenizer, + dna_tokenizer=self.model.dna_tokenizer, + ) + + # Create partial function with all required arguments except the batch + collate_fn = partial( + qwen_dna_collate_fn, + processor=processor, + 
max_length_text=self.max_length_text, + max_length_dna=self.max_length_dna, + return_answer_in_batch=self.return_answer_in_batch, + ) + + else: + raise ValueError(f"Unknown dataset type: {self.hparams.dataset_type}") + + return DataLoader( + val_dataset, + batch_size=self.hparams.batch_size, + shuffle=False, + collate_fn=collate_fn, + num_workers=self.hparams.num_workers, + persistent_workers=False, + pin_memory=False, + ) + + def test_dataloader(self) -> DataLoader: + """Create and return the test DataLoader.""" + return self.val_dataloader() + + # Only for VEP datasets, for KEGG use the resulting generations in W&B + def on_test_epoch_end(self): + """ + Called at the end of test epoch to generate text for all test examples + and calculate accuracy, precision, recall, and F1 score based on whether + the label appears in the generated response. + """ + # Get wandb logger + wandb_logger = self.logger.experiment + wandb_logger.log({"test_progress": 0.0, "status": "starting test generation"}) + + # Set model to eval mode + self.model.eval() + + # Get test dataloader + test_dataloader = self.test_dataloader() + total_batches = len(test_dataloader) + + # Get negative and positive labels + neg_label = self.labels[0] # Negative label (first item) + pos_label = self.labels[1] # Positive label (second item) + + # Log label information + wandb_logger.log({ + "positive_label": pos_label, + "negative_label": neg_label + }) + print(f"Using labels - Positive: '{pos_label}', Negative: '{neg_label}'") + + # Initialize counters and storage for generations + total_examples = 0 + true_positives = 0 + true_negatives = 0 + false_positives = 0 + false_negatives = 0 + processed_batches = 0 + generations = [] + + # Process each batch in the test dataloader + for batch_idx, batch in enumerate(test_dataloader): + # Log batch start to wandb + wandb_logger.log({ + "test_progress": batch_idx / total_batches, + "status": f"processing batch {batch_idx}/{total_batches}" + }) + + # Get batch data 
+ input_ids = batch["input_ids"].to(self.device) + attention_mask = batch["attention_mask"].to(self.device) + answer = batch["answer"] + dna_tokenized = batch.get("dna_tokenized") + if dna_tokenized is not None: + dna_tokenized = dna_tokenized.to(self.device) + batch_idx_map = batch.get("batch_idx_map") + + # Get assistant marker position + assistant_start_marker = "<|im_start|>assistant\n" + assistant_marker_tokens = self.tokenizer.encode(assistant_start_marker, add_special_tokens=False) + marker_tensor = torch.tensor(assistant_marker_tokens, device=input_ids.device) + marker_len = len(assistant_marker_tokens) + + # Log batch metadata to wandb + wandb_logger.log({ + "batch_size": input_ids.shape[0], + "input_sequence_length": input_ids.shape[1] + }) + + # Process examples in the batch + examples_in_batch = 0 + for example_idx in range(input_ids.size(0)): + # Log example progress to wandb + if total_examples % 10 == 0: + current_accuracy = (true_positives + true_negatives) / max(1, total_examples) + wandb_logger.log({ + "examples_processed": total_examples, + "current_accuracy": current_accuracy + }) + + # Find non-padding tokens + non_pad = (input_ids[example_idx] != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + start_idx = non_pad[0].item() if len(non_pad) > 0 else 0 + + # Find assistant marker position + assistant_pos = None + for pos in range(start_idx, input_ids.size(1) - marker_len + 1): + if torch.all(input_ids[example_idx, pos:pos + marker_len] == marker_tensor): + assistant_pos = pos + break + + # Log to wandb if assistant marker was found + wandb_logger.log({"assistant_marker_found": assistant_pos is not None}) + + if assistant_pos is not None: + # Prepare input for generation + gen_input_ids = input_ids[example_idx:example_idx + 1, start_idx:assistant_pos + marker_len] + gen_attention_mask = attention_mask[example_idx:example_idx + 1, start_idx:assistant_pos + marker_len] + + # Extract DNA data for this example + example_dna_data = None + 
example_batch_map = None + + if dna_tokenized is not None and batch_idx_map is not None: + example_indices = [i for i, idx in enumerate(batch_idx_map) if idx == example_idx] + + if example_indices: + example_dna_data = BatchEncoding({ + "input_ids": dna_tokenized.input_ids[example_indices].to(self.device), + "attention_mask": dna_tokenized.attention_mask[example_indices].to(self.device), + }) + example_batch_map = [0] * len(example_indices) + + # Log generation start to wandb + wandb_logger.log({"status": f"generating for example {example_idx} in batch {batch_idx}"}) + + # Generate text + with torch.no_grad(): + generated = self.model.generate( + input_ids=gen_input_ids, + attention_mask=gen_attention_mask, + dna_tokenized=example_dna_data, + batch_idx_map=example_batch_map, + max_new_tokens=800, + temperature=0.6, + top_p=0.95, + top_k=20, + do_sample=True, + ) + + # Decode user input and generated text + user_input = self.tokenizer.decode(gen_input_ids[0], skip_special_tokens=False).strip() + generation = self.tokenizer.decode(generated[0], skip_special_tokens=False).strip() + + # Get ground truth and clean it if needed + ground_truth = answer[example_idx] + if ";" in ground_truth: + ground_truth = ground_truth.split(";")[0] + + # Determine if this is a positive or negative example + is_positive_example = ground_truth.lower() == pos_label.lower() + is_negative_example = ground_truth.lower() == neg_label.lower() + + # Check if the generated text contains the ground truth + generation_contains_ground_truth = ground_truth.lower() in generation.lower() + + # Update metrics based on the classification + total_examples += 1 + examples_in_batch += 1 + + if is_positive_example and generation_contains_ground_truth: + true_positives += 1 + elif is_positive_example and not generation_contains_ground_truth: + false_negatives += 1 + elif is_negative_example and generation_contains_ground_truth: + true_negatives += 1 + elif is_negative_example and not 
generation_contains_ground_truth: + false_positives += 1 + + # Add metadata about the prediction + prediction_category = ( + "TP" if is_positive_example and generation_contains_ground_truth else + "FN" if is_positive_example and not generation_contains_ground_truth else + "TN" if is_negative_example and generation_contains_ground_truth else + "FP" + ) + + # Store generation data + generations.append({ + "batch_idx": batch_idx, + "example_idx": example_idx, + "user_input": user_input, + "generation": generation, + "ground_truth": ground_truth, + "contains_ground_truth": generation_contains_ground_truth, + "is_positive_example": is_positive_example, + "prediction_category": prediction_category + }) + + # Clean up memory + torch.cuda.empty_cache() + gc.collect() + + # Log batch completion to wandb + processed_batches += 1 + + # Calculate current metrics + current_accuracy = (true_positives + true_negatives) / max(total_examples, 1) + current_precision = true_positives / max(true_positives + false_positives, 1) + current_recall = true_positives / max(true_positives + false_negatives, 1) + current_f1 = 2 * current_precision * current_recall / max(current_precision + current_recall, 1e-8) + + wandb_logger.log({ + "batches_processed": processed_batches, + "examples_processed": total_examples, + "examples_in_last_batch": examples_in_batch, + "current_accuracy": current_accuracy, + "current_precision": current_precision, + "current_recall": current_recall, + "current_f1": current_f1, + "progress_percentage": (batch_idx + 1) / total_batches * 100 + }) + + # Calculate final metrics + accuracy = (true_positives + true_negatives) / max(total_examples, 1) + precision = true_positives / max(true_positives + false_positives, 1) + recall = true_positives / max(true_positives + false_negatives, 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-8) + + # Log final metrics to wandb + wandb_logger.log({ + "test_accuracy": accuracy, + "test_precision": precision, + 
"test_recall": recall, + "test_f1": f1, + "true_positives": true_positives, + "false_positives": false_positives, + "true_negatives": true_negatives, + "false_negatives": false_negatives, + "total_examples_processed": total_examples, + "positive_examples": true_positives + false_negatives, + "negative_examples": true_negatives + false_positives, + "test_status": "completed" + }) + + # Create a confusion matrix + confusion_matrix = { + "True Positives": true_positives, + "False Positives": false_positives, + "True Negatives": true_negatives, + "False Negatives": false_negatives + } + wandb_logger.log({"confusion_matrix": confusion_matrix}) + + # Create a table with all the generations + if generations: + columns = [ + "batch_idx", + "example_idx", + "user_input", + "generation", + "ground_truth", + "contains_ground_truth", + "is_positive_example", + "prediction_category" + ] + data = [] + for g in generations: + # Handle any missing keys + row = [g.get(c, "") for c in columns] + data.append(row) + + wandb_logger.log({ + f"test_generations_{time.strftime('%Y%m%d-%H%M%S')}:": wandb.Table(columns=columns, data=data) + }) + + # Save generations to a CSV file + model_name = self.hparams.text_model_name.split('/')[-1] + if self.hparams.ckpt_path: + csv_path = os.path.join(self.hparams.ckpt_path, f"{time.strftime('%Y%m%d-%H%M%S')}-test_generations_{model_name}.csv") + else: + csv_path = os.path.join(self.hparams.checkpoint_dir, f"{time.strftime('%Y%m%d-%H%M%S')}-test_generations_{model_name}.csv") + + try: + with open(csv_path, 'w', newline='', encoding='utf-8') as f: + if generations: + writer = csv.DictWriter(f, fieldnames=generations[0].keys()) + writer.writeheader() + for g in generations: + writer.writerow(g) + + wandb_logger.log({"csv_saved": True, "csv_path": csv_path}) + except Exception as e: + wandb_logger.log({"csv_saved": False, "csv_path": csv_path, "error": str(e)}) + + # Log a summary of the metrics + summary = ( + f"Test Results Summary:\n" + f"Total 
examples: {total_examples}\n" + f"Accuracy: {accuracy:.4f}\n" + f"Precision: {precision:.4f}\n" + f"Recall: {recall:.4f}\n" + f"F1 Score: {f1:.4f}\n" + f"TP: {true_positives}, FP: {false_positives}, TN: {true_negatives}, FN: {false_negatives}" + ) + print(summary) + wandb_logger.log({"test_summary": summary}) + + # Force garbage collection + torch.cuda.empty_cache() + gc.collect() + + return { + "test_accuracy": accuracy, + "test_precision": precision, + "test_recall": recall, + "test_f1": f1 + } + + +def main(args: ArgumentParser): + """ + Main function to run the DNA-Text fine-tuning process. + + Args: + args (ArgumentParser): Parsed command-line arguments + """ + # Set random seed and environment variables + pl.seed_everything(args.seed) + torch.cuda.empty_cache() + torch.set_float32_matmul_precision("medium") + + # Setup directories + run_name = f"{args.wandb_project}-{args.dataset_type}-{args.text_model_name.split('/')[-1]}" + args.checkpoint_dir = f"{args.checkpoint_dir}/{run_name}-{time.strftime('%Y%m%d-%H%M%S')}" + + # Initialize model + model = DNALLMFineTuner(args) + + # Setup callbacks + callbacks = [ + ModelCheckpoint( + dirpath=args.checkpoint_dir, + filename=f"{run_name}-" + "{epoch:02d}-{val_loss_epoch:.4f}", + save_top_k=2, + monitor="val_loss_epoch", + mode="min", + save_last=True, + ), + LearningRateMonitor(logging_interval="step"), + ] + + # Setup logger + is_resuming = args.ckpt_path is not None + logger = WandbLogger( + project=args.wandb_project, + entity=args.wandb_entity, + save_dir=args.log_dir, + name=run_name, + resume="allow" if is_resuming else None, # Allow resuming existing run + ) + + # Initialize the PyTorch Lightning Trainer + trainer = pl.Trainer( + max_epochs=args.max_epochs, + accelerator="gpu", + devices=args.num_gpus, + strategy=( + "ddp" + if args.strategy == "ddp" + else DeepSpeedStrategy(stage=2, offload_optimizer=False, allgather_bucket_size=5e8, reduce_bucket_size=5e8) + ), + precision="bf16-mixed", + callbacks=callbacks, 
+ logger=logger, + deterministic=False, + enable_checkpointing=True, + enable_progress_bar=True, + enable_model_summary=True, + log_every_n_steps=5, + accumulate_grad_batches=args.gradient_accumulation_steps, + gradient_clip_val=1.0, + val_check_interval=1 / 3, + ) + + # Start the training process + trainer.fit(model, ckpt_path=args.ckpt_path) + trainer.test(model, ckpt_path=args.ckpt_path if args.ckpt_path else "best") + +if __name__ == "__main__": + parser = ArgumentParser() + + # Model configuration + parser.add_argument("--model_type", type=str, choices=["llm", "dna-llm"], default="dna-llm") + parser.add_argument("--text_model_name", type=str, default="Qwen/Qwen3-1.7B") + parser.add_argument("--dna_model_name", type=str, default="InstaDeepAI/nucleotide-transformer-v2-500m-multi-species") + parser.add_argument("--text_model_finetune", type=bool, default=True) + parser.add_argument("--dna_model_finetune", type=bool, default=False) + parser.add_argument("--dna_is_evo2", type=bool, default=False) + parser.add_argument("--dna_embedding_layer", type=str, default=None) + + # Training parameters + parser.add_argument("--seed", type=int, default=23) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--max_epochs", type=int, default=5) + parser.add_argument("--learning_rate", type=float, default=5e-5) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--gradient_accumulation_steps", type=int, default=8) + parser.add_argument("--max_length_dna", type=int, default=1024) + parser.add_argument("--max_length_text", type=int, default=1024) + parser.add_argument("--truncate_dna_per_side", type=int, default=1024) + parser.add_argument("--return_answer_in_batch", type=bool, default=False) + + # LoRA parameters + parser.add_argument("--lora_rank", type=int, default=32) + parser.add_argument("--lora_alpha", type=int, default=64) + parser.add_argument("--lora_dropout", type=float, default=0.05) + + # Infrastructure 
and paths + parser.add_argument("--checkpoint_dir", type=str, default="checkpoints") + parser.add_argument("--log_dir", type=str, default="logs") + parser.add_argument("--cache_dir", type=str, default="/model-weights") + parser.add_argument("--ckpt_path", type=str, default=None) + parser.add_argument("--num_workers", type=int, default=4) + parser.add_argument("--num_gpus", type=int, default=1) + parser.add_argument("--strategy", type=str, default="ddp") + + # Dataset configuration + parser.add_argument("--dataset_type", type=str, choices=["kegg", "variant_effect_coding", "variant_effect_non_snv"], default="kegg") + parser.add_argument("--use_qwen_dna_collate_fn", type=bool, default=True) + parser.add_argument("--kegg_data_dir_local", type=str, default="data/kegg") + parser.add_argument("--kegg_data_dir_huggingface", type=str, default="wanglab/kegg") + parser.add_argument("--variant_effect_coding_data_dir_huggingface", type=str, default="wanglab/variant_effect_coding") + parser.add_argument("--variant_effect_non_snv_data_dir_huggingface", type=str, default="wanglab/variant_effect_non_snv") + parser.add_argument("--merge_val_test_set", type=bool, default=False) + + # Logging and monitoring + parser.add_argument("--wandb_project", type=str, default="nt-500m-qwen3-1.7b-finetune") + parser.add_argument("--wandb_entity", type=str) + + args = parser.parse_args() + + main(args) \ No newline at end of file diff --git a/BioReason/data/Clinvar_SNV_Non_SNV.ipynb b/BioReason/data/Clinvar_SNV_Non_SNV.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..62657133d3efd260415bab1cfbf958e9551843e3 --- /dev/null +++ b/BioReason/data/Clinvar_SNV_Non_SNV.ipynb @@ -0,0 +1,3425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ClinVar SNV and Non-SNV Processing Pipeline\n", + "\n", + "This notebook processes ClinVar genetic variants to create machine learning datasets for variant effect prediction. 
See `Clinvar_SNV_Non_SNV_README.md` for detailed documentation.\n", + "\n", + "## Quick Start\n", + "\n", + "1. Update file paths in the configuration section\n", + "2. Ensure all dependencies are installed\n", + "3. Run cells in order\n", + "4. Monitor progress and memory usage\n", + "\n", + "**⚠️ Important**: This pipeline requires significant computational resources and storage space." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Update these paths for your environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# File paths (update these for your system)\n", + "CONFIG = {\n", + " # Input data\n", + " 'clinvar_vcf': 'data/clinvar_grch38.vcf.gz',\n", + " 'reference_genome': 'data/reference/GRCh38.fa',\n", + " 'hgnc_mapping': 'data/hgnc_complete_set.txt',\n", + " \n", + " # VEP configuration\n", + " 'vep_root': '/path/to/vep',\n", + " 'vep_cache': '/path/to/vep/cache',\n", + " \n", + " # Output paths\n", + " 'output_dir': 'output',\n", + " 'temp_dir': 'temp',\n", + " \n", + " # Processing parameters\n", + " 'window_size': 4096,\n", + " 'max_variant_size': 64,\n", + " 'num_threads': 8,\n", + " 'batch_size': 100000\n", + "}\n", + "\n", + "SCRATCH_DIR = '/your/scratch/directory' # Update this to your scratch directory\n", + "\n", + "# Create output directories\n", + "for dir_path in [CONFIG['output_dir'], CONFIG['temp_dir']]:\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + "print(\"Configuration loaded. 
Please verify all paths are correct:\")\n", + "for key, value in CONFIG.items():\n", + " if 'path' in key or 'dir' in key:\n", + " exists = os.path.exists(value) if not key.endswith('dir') else True\n", + " status = \"✅\" if exists else \"❌\"\n", + " print(f\" {status} {key}: {value}\")\n", + " \n", + "print(\"\\n📝 Update CONFIG dictionary above with your actual file paths\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ClinVar SNV and Non-SNV Variant Processing Pipeline\n", + "\n", + "This notebook processes ClinVar genetic variants (both SNVs and non-SNVs) to create a comprehensive machine learning dataset for variant effect prediction. The pipeline includes:\n", + "\n", + "## Overview\n", + "\n", + "1. **Data Processing**: Download and process ClinVar VCF data using VEP (Variant Effect Predictor)\n", + "2. **Sequence Window Extraction**: Generate 4096bp genomic windows centered on variants\n", + "3. **Feature Engineering**: Extract pathogenicity, disease associations, and gene information\n", + "4. **Dataset Creation**: Build training/test datasets with disjoint disease splits\n", + "5. 
**Quality Control**: Comprehensive statistics and validation\n", + "\n", + "## Key Features\n", + "\n", + "- **Genomic Windows**: 4096bp sequences with centered mutations\n", + "- **Variant Types**: Both SNVs and structural variants (insertions, deletions, etc.)\n", + "- **Clinical Annotations**: Pathogenicity classification and disease associations\n", + "- **Gene Mapping**: Integration with HGNC gene nomenclature\n", + "- **Disjoint Splits**: Train/test splits ensuring no disease overlap\n", + "\n", + "## Requirements\n", + "\n", + "- **Computational Resources**: High-memory system (recommended for large datasets)\n", + "- **Software Dependencies**: VEP, Python libraries (pandas, pysam, pyarrow, hgvs)\n", + "- **Reference Data**: GRCh38 genome assembly, HGNC gene mapping\n", + "- **Storage**: Sufficient space for intermediate files (~100GB+)\n", + "\n", + "## Output\n", + "\n", + "Final datasets suitable for:\n", + "- Variant effect prediction models\n", + "- Pathogenicity classification\n", + "- Disease association studies\n", + "- Genomic language model training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initial Setup (For HPC/Cluster Environments)\n", + "\n", + "**Note**: This section contains setup instructions for high-performance computing environments. 
Adapt paths and module loading commands for your specific system.\n", + "\n", + "### Prerequisites Installation\n", + "If running on a cluster, you may need to download Python wheels and reference data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download required Python packages and reference data\n", + "# Adjust paths and module loading for your specific environment\n", + "\n", + "# Example for cluster environments:\n", + "# module load python gcc arrow postgresql\n", + "\n", + "# Create directory for Python wheels (adjust path as needed)\n", + "# mkdir -p /path/to/your/pywheels\n", + "# pip download hgvs -d /path/to/your/pywheels\n", + "\n", + "# Download HGNC gene mapping data\n", + "# wget -O hgnc_complete_set.txt \"https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt\"\n", + "\n", + "print(\"Setup instructions provided above. Adjust paths for your environment.\")\n", + "print(\"Required data:\")\n", + "print(\"- HGNC complete gene set\")\n", + "print(\"- Python packages: hgvs, pandas, pyarrow, pysam, tqdm\")\n", + "print(\"- VEP installation with cache\")\n", + "print(\"- GRCh38 reference genome\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "**For cluster/HPC environments**: Configure virtual environment and load required modules.\n", + "**For local environments**: Ensure all dependencies are installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Environment setup for cluster/HPC systems\n", + "# Adjust module loading and paths for your specific environment\n", + "\n", + "# Example cluster setup:\n", + "\"\"\"\n", + "# Create virtual environment\n", + "python -m venv /tmp/clinvar_env\n", + "\n", + "# Load required modules (adjust for your system)\n", + "module load python gcc arrow postgresql\n", + "module load perl samtools tabix bcftools mariadb\n", + "\n", + "# Activate virtual environment\n", + "source /tmp/clinvar_env/bin/activate\n", + "\n", + "# Install packages\n", + "pip install notebook pandas pyarrow pysam hgvs tqdm networkx\n", + "\n", + "# Start Jupyter (for remote access)\n", + "jupyter notebook --no-browser --ip=$(hostname -f) --port=8888\n", + "\"\"\"\n", + "\n", + "# For local environments, ensure these packages are installed:\n", + "required_packages = [\n", + " 'pandas>=1.3.0',\n", + " 'pyarrow>=5.0.0', \n", + " 'pysam>=0.19.0',\n", + " 'hgvs>=1.5.0',\n", + " 'tqdm>=4.60.0',\n", + " 'networkx>=2.6.0'\n", + "]\n", + "\n", + "print(\"Required packages:\")\n", + "for pkg in required_packages:\n", + " print(f\" - {pkg}\")\n", + " \n", + "print(\"\\nFor VEP processing, also required:\")\n", + "print(\" - VEP (Ensembl Variant Effect Predictor)\")\n", + "print(\" - BCFtools, SAMtools, Tabix\")\n", + "print(\" - Reference genome and VEP cache files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/localscratch/naimerja.43836119.0/clinvar_env/bin/python\n" + ] + } + ], + "source": [ + "!which python\n", + "# Verify Python environment and core dependencies\n", + "import sys\n", + "import subprocess\n", + "\n", + "print(f\"Python executable: {sys.executable}\")\n", + "print(f\"Python 
version: {sys.version}\")\n", + "\n", + "# Check for required packages\n", + "try:\n", + " import pandas as pd\n", + " import pyarrow as pa\n", + " import pysam\n", + " import hgvs\n", + " import tqdm\n", + " import networkx as nx\n", + " \n", + " print(\"\\n✅ Core dependencies available:\")\n", + " print(f\" - pandas: {pd.__version__}\")\n", + " print(f\" - pyarrow: {pa.__version__}\")\n", + " print(f\" - pysam: {pysam.__version__}\")\n", + " print(f\" - hgvs: {hgvs.__version__}\")\n", + " print(f\" - networkx: {nx.__version__}\")\n", + " \n", + "except ImportError as e:\n", + " print(f\"❌ Missing dependency: {e}\")\n", + " print(\"Please install required packages first\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install required packages\n", + "# Adjust installation method based on your environment\n", + "\n", + "# For environments with pre-downloaded wheels:\n", + "# !pip install --no-index --find-links /path/to/pywheels hgvs\n", + "# !pip install --no-index tqdm pandas pyarrow\n", + "\n", + "# For standard environments:\n", + "# !pip install hgvs tqdm pandas pyarrow pysam networkx\n", + "\n", + "print(\"Package installation commands provided above.\")\n", + "print(\"Choose the appropriate method for your environment:\")\n", + "print(\" - Standard: pip install \")\n", + "print(\" - Offline: pip install --no-index --find-links \")\n", + "print(\" - Conda: conda install \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View possible fields from clinvar\n", + "\n", + "## ClinVar VCF Data Exploration\n", + "\n", + "Examine the structure and metadata of the ClinVar VCF file to understand available annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "##fileformat=VCFv4.1\n", + "##FILTER=\n", + "##fileDate=2025-04-29\n", + "##source=ClinVar\n", + "##reference=GRCh38\n", + "##ID=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##INFO=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##contig=\n", + "##bcftools_viewVersion=1.19+htslib-1.18\n", + "##bcftools_viewCommand=view -h /scratch/naimerja/DNASNVData113/clinvar_data/clinvar_grch38.vcf.gz; Date=Fri May 9 12:41:08 2025\n", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" + ] + } + ], + "source": [ + "# Explore ClinVar VCF file structure\n", + "# Update the file path to point to your ClinVar VCF file\n", + "\n", + "import subprocess\n", + "import os\n", + "\n", + "# Example VCF file path (update for your data)\n", + "vcf_file = \"data/clinvar_grch38.vcf.gz\" # Update this path\n", + "\n", + "# Check if file exists\n", + "if 
os.path.exists(vcf_file):\n", + " try:\n", + " # View VCF header to understand available fields\n", + " result = subprocess.run(\n", + " [\"bcftools\", \"view\", \"-h\", vcf_file],\n", + " capture_output=True, text=True, check=True\n", + " )\n", + " \n", + " print(\"ClinVar VCF Header (first 50 lines):\")\n", + " print(\"=\" * 50)\n", + " header_lines = result.stdout.split('\\n')[:50]\n", + " for line in header_lines:\n", + " print(line)\n", + " \n", + " except (subprocess.CalledProcessError, FileNotFoundError) as e:\n", + " print(f\"Error reading VCF file: {e}\")\n", + " print(\"Please ensure bcftools is installed and VCF file path is correct\")\n", + "else:\n", + " print(f\"VCF file not found: {vcf_file}\")\n", + " print(\"Please update the file path to point to your ClinVar VCF file\")\n", + " print(\"Download from: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/\")\n", + "\n", + "print(\"\\nKey ClinVar INFO fields to look for:\")\n", + "print(\"- CLNSIG: Clinical significance\")\n", + "print(\"- CLNDN: Disease name\")\n", + "print(\"- GENEINFO: Gene information\")\n", + "print(\"- CLNREVSTAT: Review status\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "VEP to clean raw clinvar vcf to cleaned coding only vcf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2) Point to your VEP install and cache, and wire up Perl libs:\n", + "import os\n", + "\n", + "os.environ['VEP_ROOT'] = 'SCRATCH_DIR/DNASNVData113/clinvar_data/vep-code-113'\n", + "os.environ['VEP_CACHE'] = 'SCRATCH_DIR/DNASNVData113/clinvar_data/vep-cache-113'\n", + "os.environ['PERL5LIB'] = 'SCRATCH_DIR/perl5/lib/perl5:' + os.environ.get('PERL5LIB','')\n", + "# prepend VEP_ROOT onto the existing PATH\n", + "os.environ['PATH'] = os.environ['VEP_ROOT'] + ':' + os.environ.get('PATH','')\n", + "\n", + "# now this will actually show your full, correct PATH:\n", + "!echo $PATH\n", + "!which bash\n", + "!which 
vep\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "/usr/bin/time -v $VEP_ROOT/vep \\\n", + " --input_file SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_grch38.vcf.gz \\\n", + " --output_file SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf \\\n", + " --cache \\\n", + " --dir_cache $VEP_CACHE \\\n", + " --offline \\\n", + " --fasta $VEP_CACHE/homo_sapiens/113_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa \\\n", + " --species homo_sapiens \\\n", + " --assembly GRCh38 \\\n", + " --vcf \\\n", + " --hgvs \\\n", + " --pick \\\n", + " --fork 48 \\\n", + " --force_overwrite \\\n", + " --verbose \\\n", + " --coding_only\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: VEP Processing\n", + "\n", + "Process ClinVar VCF through VEP to add annotations and filter for coding variants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "import hgvs.edit as HEdit\n", + "from hgvs.parser import Parser\n", + "from hgvs.exceptions import HGVSError\n", + "from hgvs.enums import Datum\n", + "import hgvs.location as loc\n", + "\n", + "from collections import Counter\n", + "from concurrent.futures import ProcessPoolExecutor\n", + "from tqdm import tqdm\n", + "\n", + "def is_coding_pos(pos):\n", + " \"\"\"\n", + " Return True if the given position is within the translated CDS.\n", + " Excludes:\n", + " - intronic offsets (BaseOffsetPosition.is_intronic)\n", + " - 5′ UTR (datum=CDS_START and base < 1)\n", + " - 3′ UTR (datum=CDS_END)\n", + " \"\"\"\n", + " p = pos.start if hasattr(pos, \"start\") else pos\n", + " if isinstance(p, loc.BaseOffsetPosition):\n", + " dbg = f\"(base={p.base}, datum={p.datum}, offset={p.offset})\"\n", + " if p.is_intronic:\n", + " return False\n", + " if p.datum == Datum.CDS_START and p.base < 1:\n", + " return False\n", + " if 
p.datum == Datum.CDS_END:\n", + " return False\n", + " if p.datum == Datum.CDS_START and p.base >= 1:\n", + " return True\n", + " # any other datum we don’t recognize\n", + " raise ValueError(f\"Unrecognized BaseOffsetPosition {dbg}, full pos object: {pos!r}\")\n", + "\n", + "def _init_worker(idx):\n", + " # runs once in each worker\n", + " global parser, hgvsc_idx\n", + " parser = Parser()\n", + " hgvsc_idx = idx\n", + "\n", + "\n", + "def _classify_line(line):\n", + " # split on tabs to get INFO (column 7)\n", + " cols = line.rstrip(\"\\n\").split(\"\\t\")\n", + " if len(cols) < 8:\n", + " return (\"unmatched\", None, \"\")\n", + "\n", + " info = cols[7]\n", + " # pull CSQ=\n", + " csq_entries = [kv.split(\"=\",1)[1]\n", + " for kv in info.split(\";\")\n", + " if kv.startswith(\"CSQ=\")]\n", + " if not csq_entries:\n", + " return (\"unmatched\", None, \"\")\n", + "\n", + " # first allele in CSQ, then HGVSc field\n", + " hfull = csq_entries[0].split(\",\")[0].split(\"|\")[hgvsc_idx]\n", + " if not hfull:\n", + " return (\"unmatched\", None, \"\")\n", + "\n", + " # parse HGVS\n", + " try:\n", + " var = parser.parse_hgvs_variant(hfull)\n", + " except HGVSError:\n", + " return (\"unmatched\", None, hfull)\n", + "\n", + " edit = var.posedit.edit\n", + " pos = var.posedit.pos\n", + "\n", + " # get 1-based start/end\n", + " if hasattr(pos, \"start\") and hasattr(pos, \"end\"):\n", + " start = pos.start.base\n", + " end = pos.end.base\n", + " else:\n", + " start = end = pos.base\n", + "\n", + " # generic type key\n", + " etype = edit.type # attribute, not method\n", + " if etype in (\"del\", \"dup\", \"inv\"):\n", + " key = f\"{etype}_{'single' if start == end else 'range'}\"\n", + " else:\n", + " key = etype # covers sub, ins, delins, etc.\n", + "\n", + " # coding vs noncoding\n", + " coding = is_coding_pos(pos)\n", + "\n", + " return (key, coding, None)\n", + "\n", + "\n", + "def scan_hgvsc_types(vcf_path, max_workers=24):\n", + " # 1) find CSQ header → HGVSc index\n", 
+ " csq_fields = None\n", + " with open(vcf_path) as f:\n", + " for line in f:\n", + " if line.startswith(\"##INFO=')[0].strip()\n", + " csq_fields = desc.split(\"|\")\n", + " break\n", + " if not csq_fields:\n", + " raise RuntimeError(\"Couldn't find CSQ header in VCF\")\n", + " idx = csq_fields.index(\"HGVSc\")\n", + "\n", + " # 2) count lines for progress bar\n", + " total = sum(1 for _ in open(vcf_path) if not _.startswith(\"#\"))\n", + "\n", + " coding_counts = Counter()\n", + " noncoding_counts = Counter()\n", + " unmatched_counts = Counter()\n", + "\n", + " # 3) parallel processing\n", + " with ProcessPoolExecutor(\n", + " max_workers=max_workers,\n", + " initializer=_init_worker,\n", + " initargs=(idx,)\n", + " ) as exe:\n", + " # only non-header lines\n", + " lines = (l for l in open(vcf_path) if not l.startswith(\"#\"))\n", + " for key, coding, extra in tqdm(\n", + " exe.map(_classify_line, lines, chunksize=1000),\n", + " total=total,\n", + " desc=\"Scanning variants\"\n", + " ):\n", + " if key == \"unmatched\":\n", + " unmatched_counts[extra] += 1\n", + " else:\n", + " if coding:\n", + " coding_counts[key] += 1\n", + " else:\n", + " noncoding_counts[key] += 1\n", + "\n", + " # 4) report\n", + " print(\"\\n=== Coding-region variants ===\")\n", + " for name, cnt in coding_counts.most_common():\n", + " print(f\" {name}: {cnt}\")\n", + "\n", + " print(\"\\n=== Non-coding variants (UTR & intronic) ===\")\n", + " for name, cnt in noncoding_counts.most_common():\n", + " print(f\" {name}: {cnt}\")\n", + "\n", + " print(\"\\n=== Unmatched HGVSc patterns ===\")\n", + " for h, cnt in unmatched_counts.most_common():\n", + " print(f\" {h}: {cnt}\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " scan_hgvsc_types(\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf\",\n", + " max_workers=24\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creating data table" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "import os\n", + "import pandas as pd\n", + "# Use 24 threads for PyArrow encoding\n", + "os.environ[\"ARROW_NUM_THREADS\"] = \"24\"\n", + "\n", + "import pysam\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "from tqdm import tqdm\n", + "\n", + "def get_window(genome, chrom, pos0, window_size=4096, pad_char=\"N\"):\n", + " \"\"\"\n", + " Fetch exactly `window_size` bases centered at 0-based pos0\n", + " from the pysam.FastaFile `genome`, padding with `pad_char`.\n", + " \"\"\"\n", + " half = window_size // 2\n", + " start = pos0 - half\n", + " end = start + window_size\n", + "\n", + " parts = []\n", + " chrom_len = genome.get_reference_length(chrom)\n", + "\n", + " # left padding\n", + " if start < 0:\n", + " parts.append(pad_char * -start)\n", + " fetch_start = 0\n", + " else:\n", + " fetch_start = start\n", + "\n", + " # fetch middle\n", + " fetch_end = min(end, chrom_len)\n", + " parts.append(genome.fetch(chrom, fetch_start, fetch_end))\n", + "\n", + " # right padding\n", + " if fetch_end < end:\n", + " parts.append(pad_char * (end - fetch_end))\n", + "\n", + " return \"\".join(parts)\n", + "\n", + "\n", + "def main(vcf_path, genome_fasta_path, out_parquet_path):\n", + " use_cols = [\"symbol\", \"name\", \"entrez_id\"]\n", + " hgnc_df = pd.read_csv(\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/hgnc_complete_set.txt\",\n", + " sep=\"\\t\", usecols=use_cols,\n", + " dtype={\"entrez_id\": \"Int64\"}\n", + " )\n", + " # build a dict mapping Entrez ID → approved name\n", + " gene_desc_map = dict(zip(\n", + " hgnc_df[\"entrez_id\"].astype(str), # ensure keys are strings if your gene_id is str\n", + " hgnc_df[\"name\"]\n", + " ))\n", + "\n", + " missing_genes = 0\n", + " # definitions\n", + " PATHOGENIC_ALLOWED = {\n", + " \"pathogenic\",\n", + " \"pathogenic/likely_pathogenic\",\n", + " \"likely_pathogenic\",\n", + " \"benign\",\n", + " 
\"likely_benign\",\n", + " \"benign/likely_benign\",\n", + " }\n", + "\n", + " REVIEW_STATUS_ALLOWED = {\n", + " \"criteria_provided,_multiple_submitters,_no_conflicts\",\n", + " \"reviewed_by_expert_panel\",\n", + " \"practice_guideline\",\n", + " }\n", + "\n", + " # 0) explicitly remove any old output\n", + " try:\n", + " os.remove(out_parquet_path)\n", + " except FileNotFoundError:\n", + " pass\n", + "\n", + " # count variants for progress bar\n", + " total = sum(1 for line in open(vcf_path) if not line.startswith(\"#\"))\n", + "\n", + " # open the genomic FASTA\n", + " genome = pysam.FastaFile(genome_fasta_path)\n", + " fasta_contigs = set(genome.references) # <<< build this once\n", + "\n", + " # prepare for Parquet writing\n", + " writer = None\n", + " batch = {col: [] for col in (\n", + " \"clinvar_id\",\n", + " \"original_window\",\n", + " \"mutated_window\",\n", + " \"cleaned_pathogenicity\",\n", + " \"disease_name\",\n", + " \"gene_name\",\n", + " \"gene_desc\",\n", + " \"chromosome\",\n", + " \"chromosome_position\",\n", + " \"variant_type\",\n", + " \"clinvar_link\",\n", + " \"gene_id\",\n", + " \"mutation_instruction\",\n", + " \"pathogenicity\",\n", + " \"review_status\"\n", + " )}\n", + " batch_size = 100_000\n", + "\n", + " def flush_batch():\n", + " nonlocal writer, batch\n", + " table = pa.Table.from_pydict(batch)\n", + " if writer is None:\n", + " writer = pq.ParquetWriter(\n", + " out_parquet_path,\n", + " table.schema,\n", + " compression=\"snappy\",\n", + " use_dictionary=True\n", + " )\n", + " writer.write_table(table)\n", + " for col in batch:\n", + " batch[col].clear()\n", + "\n", + " # process VCF\n", + " with open(vcf_path) as vf:\n", + " for line in tqdm(vf, total=total, desc=\"Writing Parquet\"):\n", + " if line.startswith(\"#\"):\n", + " continue\n", + " cols = line.rstrip(\"\\n\").split(\"\\t\")\n", + " chrom, pos1, clinvar_id, ref, alt = cols[:5]\n", + "\n", + " # --- SKIP if this contig is not in your FASTA --- or mitochondrial 
chromosome (keeps only nuclear chromosomes as in Evo2)\n", + " if chrom not in fasta_contigs or chrom == \"MT\":\n", + " continue\n", + "\n", + " # Skip variants too large to fit sensibly in a 4 096 bp window\n", + " MAX_EDIT = 64 # 64 bp\n", + " if len(ref) > MAX_EDIT or len(alt) > MAX_EDIT:\n", + " continue\n", + "\n", + "\n", + " info = {\n", + " kv.split(\"=\", 1)[0]: kv.split(\"=\", 1)[1]\n", + " for kv in cols[7].split(\";\") if \"=\" in kv\n", + " }\n", + "\n", + " # mutation instruction\n", + " instr = f\"{ref}>{alt}\"\n", + "\n", + " # extract 4096-bp window\n", + " pos0 = int(pos1) - 1\n", + " orig_win = get_window(genome, chrom, pos0, window_size=4096)\n", + "\n", + " # apply REF→ALT at center\n", + " half = 4096 // 2\n", + " i0 = half\n", + " i1 = half + len(ref)\n", + " mut_win = orig_win[:i0] + alt + orig_win[i1:]\n", + " # enforce fixed length\n", + " if len(mut_win) < 4096:\n", + " mut_win = mut_win.ljust(4096, \"N\")\n", + " elif len(mut_win) > 4096:\n", + " mut_win = mut_win[:4096]\n", + "\n", + " # pathogenicity, disease, variant type\n", + " path = info.get(\"CLNSIG\", \"\").lower()\n", + " dis = info.get(\"CLNDN\", \"\")\n", + " gene_info = info.get(\"GENEINFO\", \"\")\n", + "\n", + " #filter out variants with no gene info\n", + " if gene_info ==\"\":\n", + " missing_genes +=1\n", + " continue\n", + " else:\n", + " gene_name = gene_info.split(\":\")[0]\n", + " gene_id = gene_info.split(\":\")[1]\n", + "\n", + "\n", + " vart = \"SNV\" if len(ref) == 1 == len(alt) else \"non_SNV\"\n", + " rev_stat = info.get(\"CLNREVSTAT\", \"\").lower()\n", + "\n", + " # filter for pathogenic/(|)likely pathogenic or benign/(|)likely benign only\n", + " # only keep if ANY of the pipe-delimited terms is in our allowed set\n", + " terms = path.split(\"|\")\n", + " if not any(term in PATHOGENIC_ALLOWED for term in terms):\n", + " continue\n", + "\n", + " # filter for review status\n", + " if rev_stat not in REVIEW_STATUS_ALLOWED:\n", + " continue\n", + "\n", + " if 
\"pathogenic\" in path:\n", + " clean_pathogenicity = \"pathogenic\"\n", + " elif \"benign\" in path:\n", + " clean_pathogenicity = \"benign\"\n", + " else:\n", + " raise ValueError(f\"Unknown pathogenicity: {path}\")\n", + "\n", + "\n", + " # collect row\n", + " batch[\"clinvar_id\"].append(clinvar_id)\n", + " batch[\"mutation_instruction\"].append(instr)\n", + " batch[\"original_window\"].append(orig_win)\n", + " batch[\"mutated_window\"].append(mut_win)\n", + " batch[\"pathogenicity\"].append(path)\n", + " batch[\"cleaned_pathogenicity\"].append(clean_pathogenicity)\n", + " batch[\"disease_name\"].append(dis)\n", + " batch[\"variant_type\"].append(vart)\n", + " batch[\"review_status\"].append(rev_stat)\n", + " batch[\"gene_name\"].append(gene_name)\n", + " batch[\"gene_id\"].append(gene_id)\n", + " batch[\"chromosome\"].append(chrom)\n", + " batch[\"chromosome_position\"].append(pos1) # 1-based position on chromosome\n", + " batch[\"gene_desc\"].append(gene_desc_map.get(gene_id))\n", + " batch[\"clinvar_link\"].append(f\"https://www.ncbi.nlm.nih.gov/clinvar/variation/{clinvar_id}/\")\n", + "\n", + " # flush when batch is full\n", + " if len(batch[\"mutation_instruction\"]) >= batch_size:\n", + " flush_batch()\n", + "\n", + " # final flush & close\n", + " if batch[\"mutation_instruction\"]:\n", + " flush_batch()\n", + " if writer is not None:\n", + " writer.close()\n", + "\n", + " print(\"Finished writing →\", out_parquet_path)\n", + " print(f\"# Removed due to missing gene info: {missing_genes}\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main(\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf\",\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/\"\n", + " \"vep-cache-113/homo_sapiens/113_GRCh38/\"\n", + " \"Homo_sapiens.GRCh38.dna.toplevel.fa\",\n", + " \"SCRATCH_DIR/DNASNVData113/clinvar_data/\"\n", + " \"clinvar_windowed_4096.parquet\"\n", + " )\n", + "\n", + "# note to visually inspect the dna sequences and modified 
sequences go to https://www.ncbi.nlm.nih.gov/gdv/browser/genome/?id=GCF_000001405.40 and then click tools and then sequence text view" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Hereditary_factor_VIII_deficiency_disease|not_provided'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['clinvar_id']=='10152']['disease_name'][342667]\n", + "# https://www.ncbi.nlm.nih.gov/clinvar/variation/10152/\n", + "# shows that only diseases with stars are included in the associated diseases (since hemophelia not included)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[print(x) for x in (df[(df['pathogenicity']=='pathogenic') & df['disease_name'].str.contains(r'\\|')]['clinvar_link'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On login node upload table to huggingface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-index huggingface-hub\n", + "from huggingface_hub import HfApi\n", + "import os\n", + "import glob\n", + "\n", + "# 0) config\n", + "repo_id = \"wanglab/bioR_tasks\" # your dataset repo\n", + "repo_type = \"dataset\"\n", + "subfolder = \"variant_effect_non_snv_and_snv\"\n", + "local_dir = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "\n", + "api = HfApi()\n", + "\n", + "# 1) list all files in that subfolder\n", + "all_files = api.list_repo_files(repo_id, repo_type=repo_type)\n", + "old_files = [f for f in all_files if f.startswith(subfolder + \"/\")]\n", + "\n", + "print(f\"Will delete {len(old_files)} old files:\")\n", + "for f in old_files:\n", + " print(\" \", f)\n", + "\n", + "# 2) delete them (one commit per file, or you can batch by reusing the same commit_message)\n", + "for f in 
old_files:\n", + " api.delete_file(\n", + " path_in_repo = f,\n", + " repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"remove old dataset file\"\n", + " )\n", + "\n", + "# 3) upload your single Parquet file\n", + "new_file = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "basename = os.path.basename(new_file)\n", + "dest_path = f\"{subfolder}/{basename}\"\n", + "\n", + "print(f\"Uploading {new_file!r} to {repo_id}/{dest_path} …\")\n", + "api.upload_file(\n", + " path_or_fileobj = new_file,\n", + " path_in_repo = dest_path,\n", + " repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"add updated parquet {basename}\"\n", + ")\n", + "\n", + "print(\"Done! Your dataset has been updated on the Hub.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-index huggingface-hub\n", + "from huggingface_hub import HfApi\n", + "import os\n", + "import glob\n", + "\n", + "# 0) config\n", + "repo_id = \"wanglab/bioR_tasks\" # your dataset repo\n", + "repo_type = \"dataset\"\n", + "subfolder = \"variant_effect_non_snv_and_snv\"\n", + "local_dir = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "\n", + "api = HfApi()\n", + "\n", + "# 1) list all files in that subfolder\n", + "all_files = api.list_repo_files(repo_id, repo_type=repo_type)\n", + "old_files = [f for f in all_files if f.startswith(subfolder + \"/\")]\n", + "\n", + "\n", + "import io\n", + "\n", + "# Upload cleaned DataFrame\n", + "buffer = io.BytesIO()\n", + "final_df.to_parquet(buffer, index=False)\n", + "buffer.seek(0)\n", + "\n", + "# Construct cleaned filename by appending '_cleaned'\n", + "basename = os.path.splitext(os.path.basename(local_dir))[0] + \"_cleaned.parquet\"\n", + "dest_path = f\"{subfolder}/{basename}\"\n", + "\n", + "print(f\"Uploading cleaned DataFrame to {repo_id}/{dest_path} …\")\n", + 
def load_parquet_to_pandas(parquet_dir, num_threads=24):
    """Load a Parquet dataset into a pandas DataFrame, printing progress.

    Args:
        parquet_dir: Path passed to ``pyarrow.dataset.dataset`` with
            ``format="parquet"``.
        num_threads: Value applied to both PyArrow's CPU thread pool and its
            I/O thread pool before reading.

    Returns:
        The DataFrame produced by ``table.to_pandas()``.
    """
    # configure PyArrow global thread pools (this affects later reads too)
    pa.set_cpu_count(num_threads)
    pa.set_io_thread_count(num_threads)

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    print(f"→ Discovering data under {parquet_dir!r}")

    dataset = ds.dataset(parquet_dir, format="parquet")
    print("→ Scanning & reading all fragments in parallel …")
    # to_table() reads row-groups/files with use_threads=True by default
    table = dataset.to_table()

    print("→ Converting to pandas DataFrame…")
    df = table.to_pandas()

    elapsed = time.perf_counter() - start
    print(f"✅ Loaded {len(df):,} rows in {elapsed:.1f}s")
    print(f"DataFrame shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB")

    return df
What disease(s) does it cause if pathogenic?\",\n", + " \"Gene () variant at chromosome , position —is it benign or pathogenic? If pathogenic, what are the associated condition(s)?\",\n", + " \"A genetic alteration at chromosome , position , in gene ()—benign or pathogenic? If pathogenic, which disease(s) is involved?\",\n", + " \"Chromosome , position , gene (): Is this mutation clinically benign or pathogenic? If pathogenic, identify the related disease(s).\",\n", + " \"Does the variant on chromosome at location affecting gene () have a clinical significance of benign or pathogenic? If pathogenic, what disease(s) is associated?\",\n", + " \"Mutation at chromosome , position , within (): benign or pathogenic? If pathogenic, indicate the disease(s).\",\n", + " \"Evaluate this variant at chromosome , position , gene (): benign or pathogenic? If pathogenic, what are the disease connection(s)?\",\n", + " \"Gene mutation in () at chromosome , position —is it benign or pathogenic? If pathogenic, specify the disease(s).\",\n", + " \"Located at chromosome position , the variant affecting gene ()—benign or pathogenic? If pathogenic, which disease(s) does it relate to?\",\n", + " \"Is the chromosome , position variant in () clinically benign or pathogenic? If pathogenic, what condition(s) is associated?\",\n", + " \"Clinical significance of chromosome , position , gene (): benign or pathogenic? Name the disease(s) if pathogenic.\",\n", + " \"Is the genetic variant on chromosome , position , gene (), benign or pathogenic? If pathogenic, what disease(s) is indicated?\",\n", + " \"Regarding the variant at chromosome and position , affecting gene (): benign or pathogenic? If pathogenic, what are the associated illness(es)?\",\n", + " \"The mutation in gene () at chromosome , position —clinically benign or pathogenic? If pathogenic, identify the related disease(s).\",\n", + " \"Assess the variant on chromosome , position , impacting (): is it benign or pathogenic? 
If pathogenic, specify the associated condition(s).\",\n", + " \"Variant in (), chromosome , position —is this benign or pathogenic? If pathogenic, what disease(s) is linked?\",\n", + " \"Clinical impact (benign or pathogenic) of the variant at chromosome , location , gene (): what disease(s) if pathogenic?\",\n", + " \"The chromosome , position genetic variant in gene (): benign or pathogenic? If pathogenic, indicate disease(s).\",\n", + " \"Determine if the mutation at chromosome , position in gene () is benign or pathogenic. If pathogenic, what disease(s) is associated?\",\n", + " \"Is chromosome , position , gene () variant benign or pathogenic? If pathogenic, what condition(s) is it related to?\",\n", + " \"The mutation impacting () on chromosome at position : benign or pathogenic? Name the associated disease(s) if pathogenic.\",\n", + " \"Variant at chromosome , position , gene (): clinically benign or pathogenic? If pathogenic, specify the disease(s) involved.\",\n", + " \"Chromosome , position , gene (): benign or pathogenic variant? If pathogenic, what are the linked illness(es)?\",\n", + " \"A genetic variant at chromosome , position , affecting gene ()—is it benign or pathogenic? If pathogenic, identify the associated disorder(s).\",\n", + " \"Mutation found at chromosome position , gene (): benign or pathogenic? If pathogenic, indicate the relevant disease(s).\",\n", + " \"Benign or pathogenic: chromosome , position , gene () variant? Disease(s) if pathogenic?\",\n", + " \"Evaluate if the mutation on chromosome at position in () is benign or pathogenic. Disease name(s) if pathogenic?\",\n", + " \"Clinical classification of chromosome , position , gene (): benign or pathogenic? Disease(s) if pathogenic?\",\n", + " \"Variant chromosome , position , gene (): benign or pathogenic? Disease(s)?\",\n", + " \"Variant on chromosome , at position , affecting (): is it benign or pathogenic? 
If pathogenic, specify the associated disease(s).\",\n", + " \"Does the chromosome mutation at position within gene () classify as benign or pathogenic? If pathogenic, indicate the related illness(es).\",\n", + " \"Determine whether the variant at chromosome , position , in gene () is benign or pathogenic. If pathogenic, identify the relevant disease(s).\",\n", + " \"Gene () variant at chromosome position on chromosome : benign or pathogenic? If pathogenic, what disease(s) is it associated with?\",\n", + " \"Considering the genetic mutation at chromosome , position , impacting (): is it clinically benign or pathogenic? Name the associated disease(s) if pathogenic.\",\n", + " \"Evaluate the clinical significance of the mutation at chromosome , position in gene (): benign or pathogenic? What disease(s) does a pathogenic variant suggest?\",\n", + " \"Is the variant located on chromosome at position , gene (), benign or pathogenic? If pathogenic, specify the disease(s) linked.\",\n", + " \"Classify the chromosome variant at position affecting gene () as benign or pathogenic. If pathogenic, which disease(s) is associated?\",\n", + " \"For chromosome , position , gene (): benign or pathogenic mutation? If pathogenic, what are the associated disease(s)?\",\n", + " \"Is the genetic change at chromosome , position , within gene () benign or pathogenic? Name the disease(s) if pathogenic.\",\n", + " \"Does the variant impacting () on chromosome , position , classify as benign or pathogenic? If pathogenic, what disease(s) is it associated with?\",\n", + " \"Variant at chromosome position , chromosome , gene (): benign or pathogenic? If pathogenic, what condition(s) does it relate to?\",\n", + " \"Regarding the variant found on chromosome at position in gene (): is it benign or pathogenic? If pathogenic, identify the disease(s).\",\n", + " \"The genetic variant at chromosome , position , affecting gene (): benign or pathogenic? 
def format_answer(row):
    """Build the answer string for one variant row.

    Reads ``row['cleaned_pathogenicity']`` and ``row['disease_name']`` (a
    '|'-delimited string of disease names).

    Returns:
        * the pathogenicity label alone when the variant is not pathogenic,
          or when no real disease name is present (only the placeholders
          'not_provided' / 'not_specified', an empty string, or a missing
          value);
        * otherwise ``"pathogenic; [...]"`` where the bracketed part is the
          Python list literal of the alphabetically sorted disease names,
          with 'likely other unspecified diseases' appended last when
          'not_specified' was among the original names.
    """
    path = row['cleaned_pathogenicity']
    disease = row['disease_name']

    # Diseases are only reported for pathogenic variants; a missing (non-str)
    # disease field carries no names to report either.
    if path != 'pathogenic' or not isinstance(disease, str):
        return path

    # Drop empty fragments such as those produced by '' or 'A||B'.
    terms = [d for d in disease.split('|') if d]

    # Remember whether 'not_specified' was listed before filtering it out.
    unspecified = 'not_specified' in terms
    diseases = sorted(
        d for d in terms if d not in ('not_provided', 'not_specified')
    )

    # Only placeholders were listed, in whatever order or multiplicity:
    # there is nothing concrete to name, so answer with the label alone.
    if not diseases:
        return path

    if unspecified:
        diseases.append('likely other unspecified diseases')

    # Represent diseases as a Python-style list literal, e.g. "['A', 'B']"
    disease_text = str(diseases)
    return f"{path}; {disease_text}"
\n", + "not_provided|VAMP7-related_disorder 1\n", + "46,XY_sex_reversal_1|not_provided 1\n", + "Hereditary_factor_VIII_deficiency_disease|Thrombophilia,_X-linked,_due_to_factor_8_defect|not_provided 1\n", + "Mendelian_susceptibility_to_mycobacterial_diseases_due_to_complete_ISG15_deficiency|not_specified|not_provided 1\n", + "not_provided|not_specified|Mendelian_susceptibility_to_mycobacterial_diseases_due_to_complete_ISG15_deficiency 1\n", + "Name: count, Length: 87193, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_df['disease_name'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/x86-64-v3, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic\n", + "Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/networkx-3.4.2+computecanada-py3-none-any.whl\n", + "Installing collected packages: networkx\n", + "Successfully installed networkx-3.4.2+computecanada\n" + ] + } + ], + "source": [ + "!pip install --no-index networkx\n", + "import networkx as nx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "disjoint diseases" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting split assignment...\n", + "Step 1/5: Building graph & disease→row index mapping (excluding specials)...\n", + " → Built graph with 13326 nodes, 48265 edges in 1.0s\n", + "Step 2/5: Checking for existing disconnected components...\n", + " → Found 3099 components; skipping node removal.\n", + "Step 5/5: Assigning rows to splits…\n", + "Done! 
def _rows_covered(component, disease_to_rows):
    """Return the set of row indices that mention any disease in `component`."""
    rows = set()
    for disease in component:
        # .get() avoids inserting empty entries when given a defaultdict
        rows |= disease_to_rows.get(disease, set())
    return rows


def _evaluate_subset(subset, G, disease_to_rows, train_frac):
    """
    Worker to evaluate one subset removal:
      - removes `subset` from a copy of G,
      - checks for ≥2 connected components,
      - if so, scores the two components covering the most rows by how far
        their row ratio is from `train_frac`.
    Returns (score, subset, components) or None when the graph stays connected.
    """
    H = G.copy()
    H.remove_nodes_from(subset)
    ccs = list(nx.connected_components(H))
    if len(ccs) < 2:
        return None

    # unique row count for each component
    sizes = [len(_rows_covered(comp, disease_to_rows)) for comp in ccs]

    # pick two largest comps
    idx = np.argsort(sizes)[::-1][:2]
    train_count, test_count = sizes[idx[0]], sizes[idx[1]]
    frac = train_count / (train_count + test_count)
    score = abs(frac - train_frac)
    return (score, subset, ccs)


def assign_disjoint_splits(
    df: pd.DataFrame,
    special_diseases: set,
    train_frac: float = 0.9,
    max_remove: int = 3,
    random_state: int = 42,
    n_procs: int = 24
) -> tuple[pd.DataFrame, dict]:
    """
    Add a 'split' column to a copy of df (0=train, 1=test).

    Diseases are read from the '|'-delimited 'disease_name' column. Two
    diseases that co-occur in a row are linked in a graph, and whole connected
    components are assigned to one split:
      - If the graph already has ≥2 components, components are packed greedily
        (largest first) into train until `train_frac` of the rows that mention
        a non-special disease is reached; every other component goes to test.
      - If the graph is a single component, up to `max_remove` diseases are
        removed (searched with `n_procs` processes) to disconnect it, and the
        two largest resulting components become train and test.
    Rows left unassigned (e.g. rows naming only `special_diseases`) are then
    assigned at random, seeded by `random_state`, aiming at an overall train
    share of `train_frac`. No stratification by variant type or pathogenicity
    is performed.

    NOTE(review): in the removal branch, rows whose diseases are all dropped or
    belong to a component other than the two largest are also assigned at
    random, so disease-disjointness is not guaranteed for them — confirm this
    is acceptable.

    Returns:
        (df_out, info) where info has
          'dropped_nodes'     – diseases removed to disconnect the graph,
          'dropped_row_count' – number of distinct rows mentioning a dropped
                                disease (the rows themselves are kept),
          'achieved_frac'     – fraction of rows with split == 0 (train).

    Raises:
        ValueError: if no removal of up to `max_remove` diseases disconnects
            the graph.
    """
    rng = np.random.RandomState(random_state)
    start_time = time.time()
    print("Starting split assignment...")

    # 1) Build graph and disease→rows mapping
    print("Step 1/5: Building graph & disease→row index mapping (excluding specials)...")
    G = nx.Graph()
    disease_to_rows = defaultdict(set)
    for idx, name_str in enumerate(df['disease_name']):
        names = name_str.split('|')
        non_special = [d for d in names if d not in special_diseases]
        for d in non_special:
            disease_to_rows[d].add(idx)
            G.add_node(d)
        # diseases sharing a row must end up in the same split
        for u, v in itertools.combinations(non_special, 2):
            G.add_edge(u, v)
    elapsed = time.time() - start_time
    print(f" → Built graph with {G.number_of_nodes()} nodes, {G.number_of_edges()} edges in {elapsed:.1f}s")

    # 2) Check connectivity
    print("Step 2/5: Checking for existing disconnected components...")
    comps = list(nx.connected_components(G))
    dropped = []
    if not comps:
        # No non-special disease anywhere: nothing to keep disjoint, so every
        # row is handled by the random fill in step 5.
        print(" → No non-special diseases found; all rows will be assigned at random.")
        train_comp, test_comp = set(), set()
    elif len(comps) >= 2:
        print(f" → Found {len(comps)} components; skipping node removal.")
        # rows-per-component sets, largest first
        comp_rows = [(comp, _rows_covered(comp, disease_to_rows)) for comp in comps]
        comp_rows.sort(key=lambda x: len(x[1]), reverse=True)

        # All non-special diseases of one row are pairwise linked, so each row
        # belongs to exactly one component: row sets are disjoint and their
        # sizes can simply be added instead of rebuilding set unions.
        total_ns_rows = sum(len(rows) for _, rows in comp_rows)
        target_train_ns = train_frac * total_ns_rows

        # greedy pack to hit target_train_ns (the first, largest component
        # always goes to train so that train is never empty)
        train_comp = set()
        train_row_count = 0
        for comp, rows_set in comp_rows:
            if train_row_count == 0 or train_row_count + len(rows_set) <= target_train_ns:
                train_comp |= comp
                train_row_count += len(rows_set)

        test_comp = set(G.nodes()) - train_comp
    else:
        # 3) Removal search
        print("Step 3/5: Graph is connected; searching for node removals…")
        best = {'score': float('inf')}
        all_nodes = list(G.nodes())
        worker = partial(_evaluate_subset,
                         G=G,
                         disease_to_rows=disease_to_rows,
                         train_frac=train_frac)
        for k in range(1, max_remove + 1):
            total_combs = comb(len(all_nodes), k)
            print(f" → Trying removals of size {k} ({total_combs} combos)…")
            with mp.Pool(processes=n_procs) as pool:
                for result in tqdm(pool.imap_unordered(worker, itertools.combinations(all_nodes, k)),
                                   total=total_combs,
                                   desc=f" size={k}"):
                    if not result:
                        continue
                    score, subset, ccs = result
                    if score < best['score']:
                        best.update(score=score, subset=subset, components=ccs)
            elapsed_k = time.time() - start_time
            print(f" → Done size-{k} in {elapsed_k:.1f}s; best score = {best['score']:.4f}")
            # stop at the smallest removal size that disconnects the graph
            if best['score'] < float('inf'):
                break

        if 'subset' not in best:
            raise ValueError(
                f"Could not disconnect the disease graph by removing up to "
                f"{max_remove} diseases; increase max_remove or extend special_diseases."
            )

        dropped = list(best['subset'])
        comps = best['components']

        # 4) select two largest comps
        print("Step 4/5: Selecting two largest components for train/test…")
        comp_counts = [(comp, _rows_covered(comp, disease_to_rows)) for comp in comps]
        comp_counts.sort(key=lambda x: len(x[1]), reverse=True)
        train_comp, test_comp = comp_counts[0][0], comp_counts[1][0]

    # 5) Assign rows
    print("Step 5/5: Assigning rows to splits…")

    def which_split(dlist):
        non_special = [d for d in dlist if d not in special_diseases]
        if any(d in train_comp for d in non_special):
            return 0
        if any(d in test_comp for d in non_special):
            return 1
        return None

    df_out = df.copy()
    df_out['split'] = df_out['disease_name'].str.split('|').apply(which_split)

    # fill unassigned rows so the overall train share approaches train_frac
    mask_none = df_out['split'].isna()
    n_none = int(mask_none.sum())
    n_train_desired = int(train_frac * len(df_out))
    n_current_train = int((df_out['split'] == 0).sum())
    # clamp to [0, n_none]: there cannot be more random train rows than
    # unassigned rows, otherwise `assign` would not match the mask length
    n_to_train = min(n_none, max(0, n_train_desired - n_current_train))
    assign = np.array([0] * n_to_train + [1] * (n_none - n_to_train))
    rng.shuffle(assign)
    df_out.loc[mask_none, 'split'] = assign
    df_out['split'] = df_out['split'].astype(int)

    # split == 0 marks train, so the train share is the mean of that mask
    # (the mean of 'split' itself would be the TEST share).
    achieved_train_frac = float((df_out['split'] == 0).mean())

    total_elapsed = time.time() - start_time
    print(f"Done! Total time: {total_elapsed:.1f}s; achieved train fraction = {achieved_train_frac:.4f}")

    info = {
        'dropped_nodes': dropped,
        # distinct rows, so a row naming two dropped diseases counts once
        'dropped_row_count': len(_rows_covered(dropped, disease_to_rows)),
        'achieved_frac': achieved_train_frac
    }
    return df_out, info
def print_ratio_stats(df, split_label):
    """Print class counts and ratios for the rows of one split.

    Selects the rows of `df` whose 'split' equals `split_label`, then prints,
    for 'cleaned_pathogenicity' and for 'variant_type', the value counts and
    those counts divided by the number of selected rows.
    """
    subset = df.loc[df['split'] == split_label]
    n_rows = len(subset)
    print(f"\n=== Split {split_label} (n={n_rows}) ===")

    # (column, counts heading, ratios heading) — reported in this order
    sections = (
        ('cleaned_pathogenicity', "\nPathogenicity counts:", "\nPathogenicity ratios:"),
        ('variant_type', "\nVariant-type counts:", "\nVariant-type ratios:"),
    )
    for column, counts_heading, ratios_heading in sections:
        counts = subset[column].value_counts()
        ratios = counts / n_rows
        print(counts_heading)
        print(counts.to_string())
        print(ratios_heading)
        print(ratios.to_string())
"non_snv_test_split_df = test_split_df[test_split_df['variant_type']=='non_SNV']\n", + "\n", + "snv_test_split_df = snv_test_split_df.drop('variant_type', axis=1)\n", + "non_snv_test_split_df = non_snv_test_split_df.drop('variant_type', axis=1)\n", + "\n", + "snv_train_split_df = snv_train_split_df.drop('variant_type', axis=1)\n", + "non_snv_train_split_df = non_snv_train_split_df.drop('variant_type', axis=1)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswerreference_sequencemutated_sequencecleaned_pathogenicity
0Assess the variant on chromosome 1, position 9...benignGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...benign
1Gene SAMD11 (sterile alpha motif domain contai...benignTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA...TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA...benign
2The mutation in gene SAMD11 (sterile alpha mot...benignCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...benign
3Determine whether the variant at chromosome 1,...benignGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...benign
4Variant on chromosome 1, at position 935779, a...benignCCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG...CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG...benign
..................
342678Variant at chromosome X, position 155524483, g...benignGTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA...GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA...benign
342680Mutation at chromosome X, position 155900534, ...benignAGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG...AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG...benign
342681Does the variant on chromosome X at location 1...benignCAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC...CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC...benign
342685Assess the clinical significance (benign or pa...benignTTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA...TTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA...benign
342686Is the variant located on chromosome Y at posi...benignAGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG...AGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG...benign
\n", + "

32454 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " question answer \\\n", + "0 Assess the variant on chromosome 1, position 9... benign \n", + "1 Gene SAMD11 (sterile alpha motif domain contai... benign \n", + "2 The mutation in gene SAMD11 (sterile alpha mot... benign \n", + "3 Determine whether the variant at chromosome 1,... benign \n", + "4 Variant on chromosome 1, at position 935779, a... benign \n", + "... ... ... \n", + "342678 Variant at chromosome X, position 155524483, g... benign \n", + "342680 Mutation at chromosome X, position 155900534, ... benign \n", + "342681 Does the variant on chromosome X at location 1... benign \n", + "342685 Assess the clinical significance (benign or pa... benign \n", + "342686 Is the variant located on chromosome Y at posi... benign \n", + "\n", + " reference_sequence \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "1 TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "4 CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG... \n", + "... ... \n", + "342678 GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA... \n", + "342680 AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG... \n", + "342681 CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC... \n", + "342685 TTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA... \n", + "342686 AGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG... \n", + "\n", + " mutated_sequence \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "1 TGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTA... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "4 CCTATGTGCCTGGGGGGGGCTTCCTTTCCCACTGGGAGCCGGTGGG... \n", + "... ... \n", + "342678 GTGTGCATAGCTCTATGCAGTGTAATTACATGTGTAACTTTGTGTA... \n", + "342680 AGCATTAAAGATCATCTAGTTGAACTACCCATCTGATGCTTAAATG... \n", + "342681 CAATTAGTCCCTTGATTATTGATCCTTCTCTTTTGGCTGTATTCTC... 
\n", + "342685 TTTAGTCTTTCCAAAATGTATACATGCATGATGTCATAATTTTTAA... \n", + "342686 AGGTGGCCGTGGCTGTCTGAGGGGAAAGACTGGGGACACTGAATGG... \n", + "\n", + " cleaned_pathogenicity \n", + "0 benign \n", + "1 benign \n", + "2 benign \n", + "3 benign \n", + "4 benign \n", + "... ... \n", + "342678 benign \n", + "342680 benign \n", + "342681 benign \n", + "342685 benign \n", + "342686 benign \n", + "\n", + "[32454 rows x 5 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# save all the final dataframes to parquet files\n", + "snv_train_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/snv_train_split_df.parquet')\n", + "non_snv_train_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/non_snv_train_split_df.parquet')\n", + "snv_test_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/snv_test_split_df.parquet')\n", + "non_snv_test_split_df.to_parquet('SCRATCH_DIR/DNASNVData113/finaldata/non_snv_test_split_df.parquet')\n", + "\n", + "#now upload to huggingface\n", + "!pip install --no-index huggingface-hub\n", + "from huggingface_hub import HfApi\n", + "import os\n", + "import glob\n", + "\n", + "# 0) config\n", + "repo_id = \"wanglab/bioR_tasks\" # your dataset repo\n", + "repo_type = \"dataset\"\n", + "subfolder = \"task4-variant_effect_non_snv_and_snv_with_split\"\n", + "local_dir = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "\n", + "api = HfApi()\n", + "\n", + "# 1) list all files in that subfolder\n", + "all_files = api.list_repo_files(repo_id, repo_type=repo_type)\n", + "old_files = [f for f in all_files if f.startswith(subfolder + \"/\")]\n", + "\n", + "print(f\"Will delete {len(old_files)} old files:\")\n", + "for f in old_files:\n", + " print(\" \", f)\n", + "\n", + "# 2) delete them (one commit per file, or you can batch by reusing the same commit_message)\n", + "for f in old_files:\n", + " api.delete_file(\n", + " path_in_repo = f,\n", + " 
repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"remove old dataset file\"\n", + " )\n", + "\n", + "# 3) upload your single Parquet file\n", + "new_file = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_windowed_4096.parquet\"\n", + "basename = os.path.basename(new_file)\n", + "dest_path = f\"{subfolder}/{basename}\"\n", + "\n", + "print(f\"Uploading {new_file!r} to {repo_id}/{dest_path} …\")\n", + "api.upload_file(\n", + " path_or_fileobj = new_file,\n", + " path_in_repo = dest_path,\n", + " repo_id = repo_id,\n", + " repo_type = repo_type,\n", + " commit_message = f\"add updated parquet {basename}\"\n", + ")\n", + "\n", + "print(\"Done! Your dataset has been updated on the Hub.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'Assess the variant on chromosome 1, position 930204, impacting SAMD11 (sterile alpha motif domain containing 11): is it benign or pathogenic? 
If pathogenic, specify the associated condition(s).',\n", + " 'answer': 'benign',\n", + " 'reference_sequence': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAG
ACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCGGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCA
GATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'mutated_sequence': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCG
GCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCAGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCA
ATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_training_df.iloc[0][['question', 'answer', 'reference_sequence', 'mutated_sequence']].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clinvar_id': '1170208',\n", + " 'original_window': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCACCCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCT
GTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCGGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTGGGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAA
AAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'mutated_window': 'GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGTGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGAGCTTGCAGTGAGCCCAGATTGTGCCACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAACTAAAAAAGAAGAGAGGTGGGAGAGGAGAGGCTGTCAGAGCCTCTAAGCCCTGGTGCTTGGGCTGCAGAAGGGCAGAGCTAAGCGGGACTTCCCAGCACAGCACACTCCGGACAGGCTGTGGCTGTTGAAGGGACCCCCGAGCTCCAGCTGACACGCGGAGGCCCGGGCACAGACAGGCATCATACCTTCGGCCTTGGCCGCACTCTGTGGTCATTGGTGTTGGGGGCAGCCCAGGGTCAGGGCAGGGTCTCAGCCTCGGACCCCAGGCCCCACCCCTTGCCCAGCAGTGCTGCGTTTTCCCAGTGAGCTGTCGTGGAGAGAGCAGAGGGGACCCAGCGCAGGCCCAGTGGCCGGTGAGGGGAGACGTGGCTCTGGGACGGGGGCCTCCACCTGGGTGGGGGGATGCTCCAGCTTCCAGACCCTTGGGGAGGGGGCACTGCCCAAACTAAGCTGGCACTGGGGCTGTGCATTTGAAGGTGATGGTGGTTCTAGGTCTGAGGAGGACACCCTCCTAACAGCCTCATCCCCAAGCTCCGGGCTGTGTTGTGGCAATGGGAGGGAGGAAGTCTGAGGAGACCCTGGTGACTGAACGGAGGAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAGCCAGCGGCCTGTTACTACATTTAAAAAAGCCTCCCGCCCACTGGAAAATAATCAATAACTTTCCTTTATCCCTGGGGGTGGCAGGACCTAGAAACACTGGAGGAGTCCGGAAGTGCCTGGGGCTGGGCCGGCGCTGGTGTGCTGTGCAGGGTGCCGCGGGCACGTCCGCCGCGTGTGTGCGTCAGCTCGGGGCTCGGCTGTGCTCTGCAGGGACCACAGCGGGCGTGTCTGTGCTCCCACCCGAGGCAC
CCACAGCTCCACACGCTCGTTCCGTGGGTGCAAAGGAGATGGGAGAAAGAAGCCCTGTGAGAAATGCGGGGCAGGGTTTGCGGAACAGGGGACCTGGGCTGGTGAGGGCTCCTCGTCTGGTGACCTGTGAGCCCCGGGGCCTGCAGTCTGCGAGGGTTCAGCTCAGACAGTTGCCAGTGGCCTTGCACCAGGCTGCAGCTGCCCCTGAGCCGGGCTGTGCGTGGCGCTGATGAAATAGAAAAGGGCATTCGCTTGTCAACGTTGGCATCGGTGGCAGGGTGTGGTGGGCAGAAGGGTCACAAAGTACGGGTGGGATTGGCAGGCAGATACACGGAGGGAACGTGCGCATTTGAGTGCACGTCCACCAGCACCAGCCCCAGGCCACAGGCAGATCCCAGGAGACACGCAGGGGCCCTAAGAAGGGAGCTGGGAATGAGGGGCCACACAAGCCCGGGACGGAGGCCTGTCGCACATGGGGTGGCCCCGACTCAGGCCCTGGAGTTGGCCAGGACCCTCTAGCATCCTCAAGGGCTGGGCCAACCAGGCTGGCGTGGGGTGGGGCAGGGGAGGGCTGAGCCAGTGGGCGTCGTCTGTAGGGGGATGCCCAACTGCGGCCCCGTCTCTCGGCTCTCCTCTGGGTCTCTGGCCAGCTGTGGCTCCTGCTGGCCCCAGGCGCATCCCAGAGGCAGGTAGAGGGAGGATGGCTGCTCTGAGGGCACCTCTGCCGTGCTTGGGGCTCGGCCTGGGGTGCGAGACCAGGGCAGACCCCCGGGAGATGGAACGGCCCGGTCCAGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCAGAACCGGGGGCGGCTGGCAGACAAGAGGACAGTCGCCCTGCCTGCCGCCCAGAACCTGAAGAAGGAGCGAACTCCCAGCTTCTCTGCCAGCGATGGTGACAGCGACGGGAGTGGCCCCACCTGTGGGCGGCGGCCAGGCTTGAAGCAGGAGGATGGTCCGCACATCCGTATCATGAAGAGAAGGTACTTGGACCAGGGCCGGACAGGAAGGCGCAAGGCTCAGATGGGGCTGGAGCTTCAGGCCTTCAGCTGCTCAGATGAGAGTGTCCACACCGGCCTCCCACACCTTCCCTCAGATGCTGGTCTTTTTGGGGTCCTGTGTGGGTCGCAGGCAGGAGCTGTTTCCTCATCTGCCCCCTGTCTGGCGTCCCCTCCCACCTCTGCTCTGCGGCGCTCACTGGCAGAGGCAGGTTGGCAGCAGTTGGGACCCAGAGGTCTGCACCTTCCTGGGCCGACGCTCCAGCTACCCTTGCTGACCGGGTCCCAGTCTGGCCAGAGAGCAGCTCTAGCAACAGGGAGCTCCATTCAGGCTCGTGACTGGCTGTGCAGAAGCAGCCTCGGCCCCCACCTGCGGTACAACAGGAGGGCTCCTCTGAGTGCACGGCAACAAGCAAGAGGGAGAAGGGGCCTCGGTCCTGTTCTTCCTGATGCGTGTCTGCTGAGGCCAGGAGCTGGCTTTGGCCCATGGGCCTGTCCTAGTGGGAGGCCCCAGCATGTTGAGCCAGTAGCAGGTGGTGCTGGGCATGGCAGCCGCCCTCGTTCACTGCCCAGGGCTGTGGCCCAGCGGGGCACTGACCCGAGACAGGTCTGCGCACGCCCTGCTATCCTGAGGCTGGGGTCAGGGGCCTCCAGAGCAACATGGACCTTCTGCTTCCCTTCCTGCAGAGTCCACACCCACTGGGACGTGAACATCTCTTTCCGAGAGGCGTCCTGCAGGTAGGAGCCGTGCTGTGCGTGCATAAGAGGGGGCCGTGACTCCCCTCCCTCCCTCCCACCCCTGACCGTGCCCTGCTGTCTGCTGTCCGCTGTCTCAGCGTGAGCTGATGCTGTGATGCTGGCTGAGTGTCTGCCAGGTTTGACATGTGCTGCAAGGTTGTCCCCCATCCCGGGAGGCAGACAGTGTTGCACCCAGTTG
GGACTGAGGGACCCCAGACCCAGTCAGATGCAGCTCTCGGCAGCAGCTCAGGTGTGAGTTCTGGGCAGCCCGGCCCTGGAGTTAGAGTGCACTTCCTCCCATGTGAGACTGGCCATTTGAGCCCAAAAATGAGGCTGTCACCTCCCCCTTCCCACCCTCCTAGAGACCCACAAGGAGGTGAGAATGCTGATGTGTGAGTGGGGCCCTGAAGGGTGTGTAGGAGCTCTAAGGCGAGGGGATGTCTGCAGAGTAGAGGAACAGGGAAGGGCGTGTAGGAGGGACGAGGAGTGAACCTGGCAGCTCTGGTTCAGTTGGATGCTGAAGAGTCATGGATGCTGGGCCTGTGGGCACCGTCCTCCAGGCGGGAGCCACCGAAAGTTCTTGAGCAGGGCAGTGACCAGGTGTATGTTTGGAGAAGGTCCCTCTGGAGGCCTTCCTGGCAGACAGGGGATTGGATTCAGGCTGTGGAAGCAGGACGGTAGGGGGTGTGATTCCAGGATGTGGAAAGGAGATAAAAATGAAGAGCCCCGGGGAAGAGGTCAAGGGAGTTGGGGGACCCGAGTTCCTGGCTCCAGGGGGAAGCGAGTGGTAAGTCTGTGAACAGAGCCCAGCTGTGGATTCTGTCAATGGGGTCAGGTCTCACCCTGTGGCTTCCAGGGCAGCAAGGCAGGAAGGAGGCGTCTGCCACAAGGCCAGCTTCCTGGGGCCAGAGCCGTGAAGGCCCAGGGGACCTGCGTGTCTTGGCTCCACGCCAGATGTGTTATTATTTATGTCTCTGAGAATGTCTGGATCTCAGAGCCGAATTACAATAAAAACATCTTTAAACTTATTTCTACCTCATTTTGGGGTTGCCAGCTCACCTGATCATTTTTATGAACTGTCATGAACACTGATGACATTTTATGAGCCTTTTACATGGGACACTACAGAATACATTTGTCAGCGAGGCCTGTAGGGAAACCC',\n", + " 'cleaned_pathogenicity': 'benign',\n", + " 'disease_name': 'SAMD11-related_disorder|not_provided',\n", + " 'gene_name': 'SAMD11',\n", + " 'gene_desc': 'sterile alpha motif domain containing 11',\n", + " 'chromosome': '1',\n", + " 'chromosome_position': '930204',\n", + " 'variant_type': 'SNV',\n", + " 'clinvar_link': 'https://www.ncbi.nlm.nih.gov/clinvar/variation/1170208/',\n", + " 'gene_id': '148398',\n", + " 'mutation_instruction': 'G>A',\n", + " 'pathogenicity': 'benign',\n", + " 'review_status': 'criteria_provided,_multiple_submitters,_no_conflicts'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "benign 80\n", + "pathogenic; ['Intellectual_disability,_X-linked_102'] 1\n", + "pathogenic; ['Familial_adenomatous_polyposis_2', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + 
"pathogenic; ['Familial_thoracic_aortic_aneurysm_and_aortic_dissection', 'Hereditary_cancer-predisposing_syndrome', 'Juvenile_polyposis_syndrome'] 1\n", + "pathogenic; ['Familial_cancer_of_breast', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + "pathogenic; ['Bardet-Biedl_syndrome_2', 'Retinitis_pigmentosa_74'] 1\n", + "pathogenic; ['Early-onset_retinal_dystrophy', 'Leber_congenital_amaurosis', 'Leber_congenital_amaurosis_8', 'Pigmented_paravenous_retinochoroidal_atrophy', 'Retinal_dystrophy', 'Retinitis_pigmentosa_12'] 1\n", + "pathogenic; ['Autosomal_recessive_limb-girdle_muscular_dystrophy_type_2E'] 1\n", + "pathogenic; ['Childhood_Onset_Dystonias', 'Dystonia,_childhood-onset,_with_optic_atrophy_and_basal_ganglia_abnormalities', 'MECR-related_disorder', 'Mitochondrial_disease', 'Optic_atrophy'] 1\n", + "pathogenic; ['Autoimmune_thyroid_disease,_susceptibility_to,_3', 'Iodotyrosyl_coupling_defect'] 1\n", + "pathogenic; ['Duchenne_muscular_dystrophy'] 1\n", + "pathogenic; ['Ataxia-telangiectasia_syndrome', 'Hereditary_cancer-predisposing_syndrome'] 1\n", + "pathogenic; ['Autosomal_dominant_nonsyndromic_hearing_loss_6', 'Cataract_41', 'Type_2_diabetes_mellitus', 'Wolfram-like_syndrome', 'Wolfram_syndrome_1'] 1\n", + "pathogenic; ['Autosomal_recessive_limb-girdle_muscular_dystrophy_type_2B', 'Distal_myopathy_with_anterior_tibial_onset', 'Miyoshi_muscular_dystrophy_1'] 1\n", + "pathogenic; ['Breast-ovarian_cancer,_familial,_susceptibility_to,_1'] 1\n", + "pathogenic; ['Arterial_calcification,_generalized,_of_infancy,_2', 'Autosomal_recessive_inherited_pseudoxanthoma_elasticum', 'Pseudoxanthoma_elasticum,_forme_fruste'] 1\n", + "pathogenic; ['Hereditary_cancer-predisposing_syndrome', 'Juvenile_polyposis_syndrome'] 1\n", + "pathogenic; ['Monogenic_diabetes'] 1\n", + "pathogenic; ['Autosomal_recessive_osteopetrosis_1'] 1\n", + "pathogenic; ['Autosomal_dominant_nonsyndromic_hearing_loss_11', 'Autosomal_recessive_nonsyndromic_hearing_loss_2', 'Rare_genetic_deafness', 
'Retinal_dystrophy', 'Usher_syndrome_type_1'] 1\n", + "pathogenic; ['Wilson_disease'] 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_df['answer'].sample(100).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "visualization of table" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clinvar_idoriginal_windowmutated_windowcleaned_pathogenicitydisease_namevariant_typeclinvar_linkmutation_instructionpathogenicityreview_status
01170208GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Abenigncriteria_provided,_multiple_submitters,_no_con...
21170010CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
31170044GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG...benignnot_provided|SAMD11-related_disorderSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
51170011AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG...AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG...benignSAMD11-related_disorder|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Gbenigncriteria_provided,_multiple_submitters,_no_con...
71169668GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC...GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC...benignnot_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...C>Tbenigncriteria_provided,_multiple_submitters,_no_con...
.................................
342875522717TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA...TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA...benignMitochondrial_disease|not_specifiedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Abenigncriteria_provided,_multiple_submitters,_no_con...
34287865510CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC...CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC...benignLeber_optic_atrophy|Leigh_syndrome|Mitochondri...SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...T>Cbenignreviewed_by_expert_panel
342905140592AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC...AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC...benignFamilial_cancer_of_breast|Mitochondrial_diseas...SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>Gbenignreviewed_by_expert_panel
342907235623TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT...TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT...benignLeigh_syndrome|not_providedSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>Gbenigncriteria_provided,_multiple_submitters,_no_con...
342909252455AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC...AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC...benignnot_specified|Leigh_syndromeSNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>Cbenigncriteria_provided,_multiple_submitters,_no_con...
\n", + "

93800 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " clinvar_id original_window \\\n", + "0 1170208 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "2 1170010 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 1170044 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "5 1170011 AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG... \n", + "7 1169668 GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC... \n", + "... ... ... \n", + "342875 522717 TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA... \n", + "342878 65510 CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC... \n", + "342905 140592 AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC... \n", + "342907 235623 TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT... \n", + "342909 252455 AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC... \n", + "\n", + " mutated_window \\\n", + "0 GGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTG... \n", + "2 CCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATG... \n", + "3 GAGGGAGTGAGTTAGACGCTCTCAAGGGCTCTGCCACCTCCCGGAG... \n", + "5 AGCCGTCATCTAGGTCTCCTGGAAGGTTTAGAGCCCAGCCTGGGAG... \n", + "7 GGTTTAGAGCCCAGCCTGGGAGTCTTTGGTGCTGAAACGGATCTGC... \n", + "... ... \n", + "342875 TGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTA... \n", + "342878 CTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGAC... \n", + "342905 AGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATC... \n", + "342907 TAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACT... \n", + "342909 AGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCC... \n", + "\n", + " cleaned_pathogenicity \\\n", + "0 benign \n", + "2 benign \n", + "3 benign \n", + "5 benign \n", + "7 benign \n", + "... ... \n", + "342875 benign \n", + "342878 benign \n", + "342905 benign \n", + "342907 benign \n", + "342909 benign \n", + "\n", + " disease_name variant_type \\\n", + "0 SAMD11-related_disorder|not_provided SNV \n", + "2 SAMD11-related_disorder|not_provided SNV \n", + "3 not_provided|SAMD11-related_disorder SNV \n", + "5 SAMD11-related_disorder|not_provided SNV \n", + "7 not_provided SNV \n", + "... ... ... 
\n", + "342875 Mitochondrial_disease|not_specified SNV \n", + "342878 Leber_optic_atrophy|Leigh_syndrome|Mitochondri... SNV \n", + "342905 Familial_cancer_of_breast|Mitochondrial_diseas... SNV \n", + "342907 Leigh_syndrome|not_provided SNV \n", + "342909 not_specified|Leigh_syndrome SNV \n", + "\n", + " clinvar_link \\\n", + "0 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "2 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "3 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "5 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "7 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "... ... \n", + "342875 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342878 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342905 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342907 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342909 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "\n", + " mutation_instruction pathogenicity \\\n", + "0 G>A benign \n", + "2 C>T benign \n", + "3 C>T benign \n", + "5 C>G benign \n", + "7 C>T benign \n", + "... ... ... \n", + "342875 G>A benign \n", + "342878 T>C benign \n", + "342905 A>G benign \n", + "342907 A>G benign \n", + "342909 G>C benign \n", + "\n", + " review_status \n", + "0 criteria_provided,_multiple_submitters,_no_con... \n", + "2 criteria_provided,_multiple_submitters,_no_con... \n", + "3 criteria_provided,_multiple_submitters,_no_con... \n", + "5 criteria_provided,_multiple_submitters,_no_con... \n", + "7 criteria_provided,_multiple_submitters,_no_con... \n", + "... ... \n", + "342875 criteria_provided,_multiple_submitters,_no_con... \n", + "342878 reviewed_by_expert_panel \n", + "342905 reviewed_by_expert_panel \n", + "342907 criteria_provided,_multiple_submitters,_no_con... \n", + "342909 criteria_provided,_multiple_submitters,_no_con... 
\n", + "\n", + "[93800 rows x 10 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['pathogenicity']=='benign']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clinvar_idoriginal_windowmutated_windowcleaned_pathogenicitydisease_namevariant_typeclinvar_linkmutation_instructionpathogenicityreview_status
421185392TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA...TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA...benignMendelian_susceptibility_to_mycobacterial_dise...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...T>TAbenigncriteria_provided,_multiple_submitters,_no_con...
67666960TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG...TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG...pathogenicCongenital_myasthenic_syndrome|Congenital_myas...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>GGGGCCpathogenic/likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
69970311ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG...ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG...pathogenicCongenital_myasthenic_syndrome_8non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...G>GCpathogenic/likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
80930633GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA...GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA...pathogenicPresynaptic_congenital_myasthenic_syndrome|Con...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CGCTCCGGCCAGTGCCAGGGTCGAGGTGAGCGGCTCCCCCGGGGGA...likely_pathogeniccriteria_provided,_multiple_submitters,_no_con...
90263160TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT...TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT...benignnot_provided|not_specified|Congenital_myasthen...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CCT>Cbenigncriteria_provided,_multiple_submitters,_no_con...
.................................
3428449654TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC...TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC...pathogenicMitochondrial_disease|Mitochondrial_complex_IV...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...TTTTTTCTTCGCAGGA>Tlikely_pathogenicreviewed_by_expert_panel
3428459656CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA...CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA...pathogenicMitochondrial_disease|Mitochondrial_complex_IV...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...A>AClikely_pathogenicreviewed_by_expert_panel
342876693440ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG...ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG...pathogenicMitochondrial_myopathy_with_reversible_cytochr...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CA>Clikely_pathogenicreviewed_by_expert_panel
342895800503ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG...ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG...pathogenicMitochondrial_diseasenon_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...CTA>Clikely_pathogenicreviewed_by_expert_panel
3429019686TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA...TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA...pathogenicMitochondrial_disease|Parkinsonism/MELAS_overl...non_SNVhttps://www.ncbi.nlm.nih.gov/clinvar/variation...AAATT>Alikely_pathogenicreviewed_by_expert_panel
\n", + "

36097 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " clinvar_id original_window \\\n", + "42 1185392 TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA... \n", + "67 666960 TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG... \n", + "69 970311 ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG... \n", + "80 930633 GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA... \n", + "90 263160 TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT... \n", + "... ... ... \n", + "342844 9654 TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC... \n", + "342845 9656 CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA... \n", + "342876 693440 ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG... \n", + "342895 800503 ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG... \n", + "342901 9686 TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA... \n", + "\n", + " mutated_window \\\n", + "42 TTATTGATGTGAAATTCATATAACATAAAACTAACCATTTTAAAGA... \n", + "67 TGGTGCAGGGAGGTGACTGGGTCCTTGGCCATGGGGTTGGGACCTG... \n", + "69 ATCAGCAGGTGCCCGTTGGATTTGGACTGGGAGTCCCAGGGCCTTG... \n", + "80 GTGCCTGAGGCAGCTTTGTTGGCCACGTTGAGGTCTGGTGATGGGA... \n", + "90 TCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAGCACCT... \n", + "... ... \n", + "342844 TACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGC... \n", + "342845 CAAGCCAACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGA... \n", + "342876 ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAG... \n", + "342895 ACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAG... \n", + "342901 TACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTA... \n", + "\n", + " cleaned_pathogenicity \\\n", + "42 benign \n", + "67 pathogenic \n", + "69 pathogenic \n", + "80 pathogenic \n", + "90 benign \n", + "... ... \n", + "342844 pathogenic \n", + "342845 pathogenic \n", + "342876 pathogenic \n", + "342895 pathogenic \n", + "342901 pathogenic \n", + "\n", + " disease_name variant_type \\\n", + "42 Mendelian_susceptibility_to_mycobacterial_dise... non_SNV \n", + "67 Congenital_myasthenic_syndrome|Congenital_myas... 
non_SNV \n", + "69 Congenital_myasthenic_syndrome_8 non_SNV \n", + "80 Presynaptic_congenital_myasthenic_syndrome|Con... non_SNV \n", + "90 not_provided|not_specified|Congenital_myasthen... non_SNV \n", + "... ... ... \n", + "342844 Mitochondrial_disease|Mitochondrial_complex_IV... non_SNV \n", + "342845 Mitochondrial_disease|Mitochondrial_complex_IV... non_SNV \n", + "342876 Mitochondrial_myopathy_with_reversible_cytochr... non_SNV \n", + "342895 Mitochondrial_disease non_SNV \n", + "342901 Mitochondrial_disease|Parkinsonism/MELAS_overl... non_SNV \n", + "\n", + " clinvar_link \\\n", + "42 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "67 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "69 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "80 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "90 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "... ... \n", + "342844 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342845 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342876 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342895 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "342901 https://www.ncbi.nlm.nih.gov/clinvar/variation... \n", + "\n", + " mutation_instruction \\\n", + "42 T>TA \n", + "67 G>GGGGCC \n", + "69 G>GC \n", + "80 CGCTCCGGCCAGTGCCAGGGTCGAGGTGAGCGGCTCCCCCGGGGGA... \n", + "90 CCT>C \n", + "... ... \n", + "342844 TTTTTTCTTCGCAGGA>T \n", + "342845 A>AC \n", + "342876 CA>C \n", + "342895 CTA>C \n", + "342901 AAATT>A \n", + "\n", + " pathogenicity \\\n", + "42 benign \n", + "67 pathogenic/likely_pathogenic \n", + "69 pathogenic/likely_pathogenic \n", + "80 likely_pathogenic \n", + "90 benign \n", + "... ... \n", + "342844 likely_pathogenic \n", + "342845 likely_pathogenic \n", + "342876 likely_pathogenic \n", + "342895 likely_pathogenic \n", + "342901 likely_pathogenic \n", + "\n", + " review_status \n", + "42 criteria_provided,_multiple_submitters,_no_con... 
\n", + "67 criteria_provided,_multiple_submitters,_no_con... \n", + "69 criteria_provided,_multiple_submitters,_no_con... \n", + "80 criteria_provided,_multiple_submitters,_no_con... \n", + "90 criteria_provided,_multiple_submitters,_no_con... \n", + "... ... \n", + "342844 reviewed_by_expert_panel \n", + "342845 reviewed_by_expert_panel \n", + "342876 reviewed_by_expert_panel \n", + "342895 reviewed_by_expert_panel \n", + "342901 reviewed_by_expert_panel \n", + "\n", + "[36097 rows x 10 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['variant_type']=='non_SNV']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "variant_type\n", + "SNV 306816\n", + "non_SNV 36097\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['variant_type'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['likely_benign', 'benign', 'benign/likely_benign', 'pathogenic',\n", + " 'pathogenic/likely_pathogenic', 'likely_pathogenic',\n", + " 'pathogenic|drug_response', 'likely_pathogenic|drug_response',\n", + " 'benign/likely_benign|other', 'likely_benign|other', 'benign|other',\n", + " 'pathogenic/likely_pathogenic|other', 'pathogenic|other',\n", + " 'benign|association', 'likely_benign|drug_response|other',\n", + " 'pathogenic/likely_pathogenic|risk_factor', 'benign|drug_response',\n", + " 'benign/likely_benign|drug_response|other',\n", + " 'likely_pathogenic|risk_factor', 'pathogenic|risk_factor',\n", + " 'benign/likely_benign|drug_response', 'benign|risk_factor',\n", + " 'likely_benign|association', 'benign/likely_benign|other|risk_factor',\n", + " 'benign/likely_benign|association', 'likely_pathogenic|affects',\n", + " 
'likely_pathogenic|other', 'benign/likely_benign|risk_factor',\n", + " 'likely_pathogenic|association',\n", + " 'pathogenic/likely_pathogenic|association',\n", + " 'benign|confers_sensitivity', 'likely_benign|risk_factor'],\n", + " dtype='object', name='pathogenicity')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['pathogenicity'].value_counts().keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total variants: 3,493,400\n", + "\n", + "Variant type counts:\n" + ] + }, + { + "data": { + "text/plain": [ + "variant_type\n", + "SNV 3226063\n", + "non_SNV 267337\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Pathogenicity counts:\n" + ] + }, + { + "data": { + "text/plain": [ + "pathogenicity\n", + "not_pathogenic 3043681\n", + "pathogenic 449719\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Top 10 disease names:\n" + ] + }, + { + "data": { + "text/plain": [ + "disease_name\n", + "not_provided 861927\n", + "not_specified 719547\n", + "Inborn_genetic_diseases 133139\n", + "Hereditary_cancer-predisposing_syndrome 47592\n", + "Cardiovascular_phenotype 25149\n", + "Primary_ciliary_dyskinesia 17996\n", + "Inborn_genetic_diseases|not_provided 16863\n", + "not_specified|not_provided 16518\n", + "not_provided|Inborn_genetic_diseases 15874\n", + "not_provided|not_specified 14489\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Length‐difference (alt − ref) distribution:\n" + ] + }, + { + "data": { + "text/plain": [ + 
"len_diff\n", + "-2046 1\n", + "-2037 1\n", + "-2032 1\n", + "-2031 1\n", + "-2030 1\n", + " ..\n", + " 1951 1\n", + " 1989 1\n", + " 1992 1\n", + " 2004 1\n", + " 2019 1\n", + "Name: count, Length: 1266, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "SNVs: 3,226,063 → Transitions: 2,104,260 Transversions: 1,121,803\n", + "\n", + "Original‐window GC content (sample):\n" + ] + }, + { + "data": { + "text/plain": [ + "count 10000.000000\n", + "mean 0.471380\n", + "std 0.094873\n", + "min 0.244629\n", + "25% 0.389404\n", + "50% 0.461914\n", + "75% 0.548340\n", + "max 0.744385\n", + "Name: orig_gc, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Mutated‐window GC content (sample):\n" + ] + }, + { + "data": { + "text/plain": [ + "count 10000.000000\n", + "mean 0.471290\n", + "std 0.094818\n", + "min 0.244385\n", + "25% 0.389404\n", + "50% 0.461792\n", + "75% 0.548157\n", + "max 0.744385\n", + "Name: mut_gc, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Non-SNV events by net length change:\n", + " Insertions (len_diff>0) : 86,857\n", + " Deletions (len_diff<0) : 169,730\n", + " Balanced Delins (len_diff=0) : 10,750\n" + ] + } + ], + "source": [ + "# ─── Basic cohort statistics ─────────────────────────────────\n", + "\n", + "print(f\"Total variants: {len(df):,}\\n\")\n", + "\n", + "# Variant type\n", + "print(\"Variant type counts:\")\n", + "display(df['variant_type'].value_counts())\n", + "\n", + "# Pathogenicity\n", + "print(\"\\nPathogenicity counts:\")\n", + "display(df['pathogenicity'].value_counts())\n", + "\n", + "# Top diseases\n", + "print(\"\\nTop 10 disease names:\")\n", + "display(df['disease_name']\n", + " .replace('', 'Unknown') # collapse blanks\n", + 
" .value_counts()\n", + " .head(10))\n", + "\n", + "# ─── Indel vs. SNP breakdown ────────────────────────────────\n", + "\n", + "# parse ref/alt lengths\n", + "ref_alt = df['mutation_instruction'].str.split('>', expand=True)\n", + "df['ref_len'] = ref_alt[0].str.len().astype(int)\n", + "df['alt_len'] = ref_alt[1].str.len().astype(int)\n", + "df['len_diff'] = df['alt_len'] - df['ref_len']\n", + "\n", + "print(\"\\nLength‐difference (alt − ref) distribution:\")\n", + "display(df['len_diff']\n", + " .value_counts()\n", + " .sort_index())\n", + "\n", + "# ─── Transition / transversion in SNVs ─────────────────────\n", + "\n", + "# only look at true SNVs (ref_len==alt_len==1)\n", + "snv = df[(df['variant_type']=='SNV') & (df['len_diff']==0)].copy()\n", + "def is_transition(instr):\n", + " pur = {'A','G'}\n", + " pyr = {'C','T'}\n", + " r,a = instr.split('>')\n", + " return (r in pur and a in pur) or (r in pyr and a in pyr)\n", + "\n", + "snv['is_transition'] = snv['mutation_instruction'].map(is_transition)\n", + "t1 = snv['is_transition'].sum()\n", + "t2 = (~snv['is_transition']).sum()\n", + "print(f\"\\nSNVs: {len(snv):,} → Transitions: {t1:,} Transversions: {t2:,}\\n\")\n", + "\n", + "# ─── GC‐content in windows (sampled) ─────────────────────────\n", + "\n", + "# sampling to speed up\n", + "sample = df.sample(min(len(df), 10000), random_state=0)\n", + "def gc_frac(s): return (s.count('G')+s.count('C'))/len(s)\n", + "\n", + "sample['orig_gc'] = sample['original_window'].map(gc_frac)\n", + "sample['mut_gc' ] = sample['mutated_window'].map(gc_frac)\n", + "\n", + "print(\"Original‐window GC content (sample):\")\n", + "display(sample['orig_gc'].describe())\n", + "\n", + "print(\"\\nMutated‐window GC content (sample):\")\n", + "display(sample['mut_gc'].describe())\n", + "\n", + "\n", + "# ─── Better Non-SNV event breakdown ────────────────────────────────\n", + "\n", + "non_snv = df[df['variant_type'] != 'SNV']\n", + "\n", + "# counts\n", + "n_ins = (non_snv['len_diff'] > 
0).sum()\n", + "n_del = (non_snv['len_diff'] < 0).sum()\n", + "n_bal = ((non_snv['len_diff']==0) & (non_snv['ref_len']>1)).sum()\n", + "\n", + "print(\"Non-SNV events by net length change:\")\n", + "print(f\" Insertions (len_diff>0) : {n_ins:,}\")\n", + "print(f\" Deletions (len_diff<0) : {n_del:,}\")\n", + "print(f\" Balanced Delins (len_diff=0) : {n_bal:,}\")\n", + "\n", + "# catch any explicit VCF-style inversions () if they exist\n", + "n_inv = df['mutation_instruction'].str.contains('').sum()\n", + "if n_inv:\n", + " print(f\" Inversions : {n_inv:,}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mutation_instructionoriginal_windowmutated_windowpathogenicitydisease_namevariant_typeref_lenalt_lenlen_diffabs_len_diff
370378A>CGAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC...GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC...not_pathogenicHereditary_cancer-predisposing_syndromeSNV1100
47441C>ATCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT...TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT...not_pathogenicnot_specifiedSNV1100
2370658C>GACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA...ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA...not_pathogenicDICER1-related_tumor_predispositionSNV1100
2479341C>ATGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC...TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC...not_pathogenicnot_specifiedSNV1100
2340733G>ATAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG...TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG...not_pathogenicnot_specifiedSNV1100
312980C>TGTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG...GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG...not_pathogenicIntellectual_developmental_disorder_with_micro...SNV1100
1829920T>GGAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT...GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT...not_pathogenicDilated_cardiomyopathy_1DD|Cardiovascular_phen...SNV1100
315617C>TTCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC...TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC...not_pathogenicnot_providedSNV1100
2279534C>TTTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT...TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT...not_pathogenicnot_providedSNV1100
2536550C>TGGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG...GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG...not_pathogenicnot_provided|Hereditary_cancer-predisposing_sy...SNV1100
\n", + "
" + ], + "text/plain": [ + " mutation_instruction \\\n", + "370378 A>C \n", + "47441 C>A \n", + "2370658 C>G \n", + "2479341 C>A \n", + "2340733 G>A \n", + "312980 C>T \n", + "1829920 T>G \n", + "315617 C>T \n", + "2279534 C>T \n", + "2536550 C>T \n", + "\n", + " original_window \\\n", + "370378 GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC... \n", + "47441 TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT... \n", + "2370658 ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA... \n", + "2479341 TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC... \n", + "2340733 TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG... \n", + "312980 GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG... \n", + "1829920 GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT... \n", + "315617 TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC... \n", + "2279534 TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT... \n", + "2536550 GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG... \n", + "\n", + " mutated_window pathogenicity \\\n", + "370378 GAACTGAGGAGATAGTTTTTGTTTTTAATGATTGTGCTCTTTTAAC... not_pathogenic \n", + "47441 TCTTGCTGGTTTCAGGGGAGGAGCCCGCTGTGCCAGGCCCTCATCT... not_pathogenic \n", + "2370658 ACAGAAATAATGGAGTTAGAAAATCATTTAGTAGCCATCATAGTAA... not_pathogenic \n", + "2479341 TGAATGCTTTTAGTTGTATGTGTTTTACGTTCATAAAAGTAAAATC... not_pathogenic \n", + "2340733 TAAGTGGGGAAGGGCCTGCTTCCTGAGTCGGAGGCTGAGAGGATGG... not_pathogenic \n", + "312980 GTCGGCCAGGGCCGCCGCGGGGCTACCGGGCGGGCTCGGGGCGGCG... not_pathogenic \n", + "1829920 GAAGGGAATACAAGGAAGGAGGAAAGGGAGTGTTAGTTTGGGCTAT... not_pathogenic \n", + "315617 TCCTGGTCCCAACCCCCTGCGCAGTATCTCTGGACGGGGCTAGACC... not_pathogenic \n", + "2279534 TTACTTAGAAAAGCTCAACAAGTCTTTGGATATTTAGAGACTTTTT... not_pathogenic \n", + "2536550 GGGTGACACACCGGGAGAGGCTAGCAGTAAACAAAGGGAAAGGCGG... 
not_pathogenic \n", + "\n", + " disease_name variant_type \\\n", + "370378 Hereditary_cancer-predisposing_syndrome SNV \n", + "47441 not_specified SNV \n", + "2370658 DICER1-related_tumor_predisposition SNV \n", + "2479341 not_specified SNV \n", + "2340733 not_specified SNV \n", + "312980 Intellectual_developmental_disorder_with_micro... SNV \n", + "1829920 Dilated_cardiomyopathy_1DD|Cardiovascular_phen... SNV \n", + "315617 not_provided SNV \n", + "2279534 not_provided SNV \n", + "2536550 not_provided|Hereditary_cancer-predisposing_sy... SNV \n", + "\n", + " ref_len alt_len len_diff abs_len_diff \n", + "370378 1 1 0 0 \n", + "47441 1 1 0 0 \n", + "2370658 1 1 0 0 \n", + "2479341 1 1 0 0 \n", + "2340733 1 1 0 0 \n", + "312980 1 1 0 0 \n", + "1829920 1 1 0 0 \n", + "315617 1 1 0 0 \n", + "2279534 1 1 0 0 \n", + "2536550 1 1 0 0 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "check to see which variant types from the vep vcf are not included in the fasta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '6' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '11' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '12' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '13' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '14' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '15' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '16' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'X' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'Y' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In VCF but not in FASTA: ['NT_113889.1', 'NT_187633.1', 'NT_187661.1', 'NT_187693.1', 'NW_009646201.1']\n", + "In both VCF and FASTA: ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '7' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '11' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '12' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '13' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '14' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '15' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '16' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'X' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Counts of variants on missing contigs:\n", + " NT_113889.1: 1\n", + " NT_187633.1: 10\n", + " NT_187661.1: 8\n", + " NT_187693.1: 10\n", + " NW_009646201.1: 1\n", + "\n", + "Total variants on contigs present in both VCF and FASTA: 3494465\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n" + ] + } + ], + "source": [ + "import pysam\n", + "from collections import Counter\n", + "\n", + "vcf_path = \"SCRATCH_DIR/DNASNVData113/clinvar_data/clinvar_coding_only.vcf\"\n", + "fasta_path = \"SCRATCH_DIR/DNASNVData113/clinvar_data/vep-cache-113/homo_sapiens/113_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa\"\n", + "\n", + "# 1) open VCF\n", + "vcf = pysam.VariantFile(vcf_path)\n", + "\n", + "# 2) get contigs from header if present, else from records\n", + "vcf_contigs = set(vcf.header.contigs)\n", + "if not vcf_contigs:\n", + " vcf_contigs = { rec.contig for rec in vcf }\n", + " vcf = pysam.VariantFile(vcf_path) # reopen to iterate again\n", + "\n", + "# 3) open FASTA and get its contigs\n", + "fa = pysam.FastaFile(fasta_path)\n", + "fasta_contigs = set(fa.references)\n", + "\n", + "# 4) compute sets\n", + "missing = sorted(vcf_contigs - fasta_contigs)\n", + "common = sorted(vcf_contigs & fasta_contigs)\n", + "\n", + "print(\"In VCF but not in FASTA:\", missing)\n", + "print(\"In both VCF and FASTA:\", common)\n", + "\n", + "# 5) count variants by category\n", + "counts_missing = Counter()\n", + "counts_common = 0\n", + "\n", + "for rec in vcf:\n", + " chrom = rec.contig\n", + " if chrom in missing:\n", + " counts_missing[chrom] += 1\n", + " elif chrom in fasta_contigs:\n", + " counts_common += 1\n", + "\n", + "# 6) report\n", + "print(\"\\nCounts of variants on missing contigs:\")\n", + "for contig in missing:\n", + " print(f\" {contig}: {counts_missing[contig]}\")\n", + "\n", + "print(f\"\\nTotal variants on contigs present in both VCF and FASTA: {counts_common}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BioReason/data/Dataset Figures.ipynb b/BioReason/data/Dataset Figures.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c6eaff194c30a73420c93362b7c6ad1d4f507d71 --- /dev/null +++ b/BioReason/data/Dataset Figures.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b73f6a17-b1f5-4f8f-9493-0a0363095b09", + "metadata": {}, + "source": [ + "# Making the Dataset Figures" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f58fac67-18d9-414d-94ea-73549d5acbd7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a446e28e-acee-4159-82e6-8bcbc7e14bf6", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2d8d17687c24dd08017894671c9e692", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0%| | 0.00/2.82k [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bars = ['Splits', 'Disease Categories']\n", + "split_subcategories = ['Train', 'Test', 'Validation']\n", + "disease_subcategories = ['Neurodegenerative', 'Cancer', 'Metabolic', 'Other']\n", + "\n", + "# Create data arrays\n", + "split_counts = np.array([len(ds['train']), len(ds['test']), len(ds['val'])])\n", + "disease_counts = np.array([41.3, 28.2, 12.7, 17.8])\n", + "\n", + "# Convert to percentage\n", + "split_perc = split_counts / split_counts.sum() * 100\n", + "disease_perc = disease_counts / disease_counts.sum() * 100\n", + "\n", + "fig, ax = plt.subplots()\n", + "bar_height = 0.35\n", + "y = np.arange(len(bars)) # [0, 1]\n", + "\n", + "# Plot \"Splits\" as a horizontal stacked bar\n", + "left = 0\n", + "for i, val in enumerate(split_perc):\n", + " 
ax.barh(y[0], val, left=left, height=bar_height, label=split_subcategories[i])\n", + " left += val\n", + "\n", + "# Plot \"Disease Categories\" as a horizontal stacked bar\n", + "left = 0\n", + "for i, val in enumerate(disease_perc):\n", + " ax.barh(y[1], val, left=left, height=bar_height, label=disease_subcategories[i])\n", + " left += val\n", + "\n", + "# Labels and legend\n", + "ax.set_yticks(y)\n", + "ax.set_yticklabels(bars)\n", + "ax.set_xlabel('Percentage')\n", + "ax.set_title('Percent Stacked Bar Graph (Horizontal)')\n", + "ax.legend(loc='lower right', bbox_to_anchor=(1.05, 0))\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig(\"stacked_bar_graph.svg\", format=\"svg\") # Save as SVG\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4fc5111e-22c0-4a4f-a1cb-b8bb206fbb94", + "metadata": {}, + "source": [ + "# Task 2 and 5 Disease Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b4505210-a97b-4f94-a5cd-48d05df1bc32", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'variant_effect_coding')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "7a9f4934-0ac8-4114-b8a1-7dc20826151e", + "metadata": {}, + "outputs": [], + "source": [ + "disease = (ds['train']['answer'] + ds['test']['answer'])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "5da68fbb-9f0f-4c09-99ab-793ad2940706", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'] + ds['test']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"VEP_1_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + ] + }, + { + "cell_type": 
"code", + "execution_count": 59, + "id": "0414a333-64df-40d8-9b3d-a0fff5bfcb1e", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'task5_variant_effect_non_snv')" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "880652b7-4986-4fb8-8c20-5bcb57547ca2", + "metadata": {}, + "outputs": [], + "source": [ + "answer = (ds['train']['answer'] + ds['test']['answer'])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "254bb39e-8bb7-43bc-b5f9-d82ad10b2093", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'] + ds['test']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"VEP_Non_SNV_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "89f19d13-93a0-46e2-98e2-55e25f4ee399", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", 'kegg_variant_2k')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b7fbb940-ac12-4d29-9d33-acdf46307008", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "# Count occurrences of each disease name\n", + "disease_counts = Counter(ds['train']['answer'])\n", + "\n", + "# Write to TSV file\n", + "with open(\"KEGG_disease_labels.tsv\", \"w\") as f:\n", + " f.write(\"Disease\\tCount\\n\") # Header\n", + " for disease, count in disease_counts.most_common(): # sorted by count\n", + " f.write(f\"{disease}\\t{count}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"9a22e4fa-f1d0-42c8-b31b-96259dfacb11", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "1b62117f-c833-4f54-b334-18a199a6a40d", + "metadata": {}, + "source": [ + "ChatGPT classified these answer into the 8 categories. Now I am creating the stacked bar plots" + ] + }, + { + "cell_type": "markdown", + "id": "6a634362-7681-4302-887d-771c30a2b7d0", + "metadata": {}, + "source": [ + "Keyword Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "020b6f57-7147-4a28-9f46-bf54b35c68c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Reuse the improved keyword-based classification function from earlier\n", + "\n", + "def keyword_based_classify(disease):\n", + " disease_lower = disease.lower()\n", + "\n", + " if any(keyword in disease_lower for keyword in [\n", + " \"alzheimer\", \"parkinson\", \"neuro\", \"ataxia\", \"epilepsy\", \"intellectual\", \"cerebellar\",\n", + " \"brain\", \"dystonia\", \"charcot\", \"myasthenic\", \"autism\", \"spinocerebellar\",\n", + " \"als\", \"developmental\", \"leuko\", \"hypotonia\", \"encephalopathy\", \"seizure\",\n", + " \"microcephaly\", \"prion\", \"huntington\", \"myopathy\", \"spinal\", \"sma\"]):\n", + " return \"Neurological & Neurodevelopmental\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"cancer\", \"leukemia\", \"lymphoma\", \"tumor\", \"carcinoma\", \"adenocarcinoma\",\n", + " \"sarcoma\", \"neoplasm\", \"pheochromocytoma\", \"adenoma\", \"glioblastoma\", \"melanoma\"]):\n", + " return \"Oncological / Cancer\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"metabolic\", \"glycogen\", \"storage\", \"diabetes\", \"hypercholesterolemia\",\n", + " \"hypophosphatasia\", \"acyl\", \"cobalamin\", \"lipodystrophy\", \"maple syrup\",\n", + " \"homocystinuria\", \"porphyria\", \"gaucher\", \"phgdh\", \"thyroid\", \"adrenal\",\n", + " \"lipoprotein\", \"hypomagnesemia\", \"coenzyme\", 
\"desmosterolosis\", \"biogenesis\",\n", + " \"hemochromatosis\", \"mitochondrial\", \"pyruvate\", \"oxidative\", \"ketosis\",\n", + " \"aldosteronism\", \"cushing\", \"lesch\", \"dyshormonogenesis\"]):\n", + " return \"Metabolic / Mitochondrial\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"cardio\", \"heart\", \"aortic\", \"arrhythmia\", \"ventricular\", \"artery\", \"hypertension\",\n", + " \"thrombocythemia\", \"fibrillation\", \"cardiomyopathy\", \"vascular\", \"anemia\",\n", + " \"thrombocytopenia\", \"myelofibrosis\", \"blood\", \"hypotension\", \"fanconi\"]):\n", + " return \"Cardiovascular & Hematological\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"immunodeficiency\", \"scid\", \"autoimmune\", \"inflammation\", \"inflammatory\",\n", + " \"neutropenia\", \"immune\", \"lymphoproliferation\", \"cytokine\", \"common_variable\",\n", + " \"deficiency\", \"immunologic\"]):\n", + " return \"Immunological & Hematopoietic\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"ehlers\", \"dysplasia\", \"dystrophy\", \"muscular\", \"osteogenesis\", \"fibrochondrogenesis\",\n", + " \"connective\", \"skeletal\", \"bone\", \"myopathy\", \"chondrodysplasia\", \"hypochondroplasia\",\n", + " \"marfan\"]):\n", + " return \"Musculoskeletal & Connective Tissue\"\n", + "\n", + " elif any(keyword in disease_lower for keyword in [\n", + " \"deafness\", \"hearing\", \"retinitis\", \"macular\", \"amaurosis\", \"dystrophy\",\n", + " \"cone-rod\", \"stargardt\", \"vision\", \"optic\", \"blindness\", \"retina\", \"eye\",\n", + " \"corneal\", \"cataract\"]):\n", + " return \"Sensory Disorders\"\n", + "\n", + " elif disease_lower == \"benign\":\n", + " return \"Benign\"\n", + "\n", + " else:\n", + " return \"Other / Multisystem / Syndromic\"\n", + "\n", + "# Reclassify the diseases in the dataframe\n", + "disease_df[\"Keyword_Category\"] = disease_df[\"Disease\"].apply(keyword_based_classify)\n", 
+ "\n", + "# Save to file\n", + "keyword_classified_path = \"/mnt/data/VEP_Non_SNV_labels_keyword_classified.tsv\"\n", + "disease_df.to_csv(keyword_classified_path, sep=\"\\t\", index=False)\n", + "\n", + "tools.display_dataframe_to_user(name=\"Keyword Classified VEP Non-SNV\", dataframe=disease_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787f83ea-3a0a-4066-b1b8-2f98eeaf38cd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6faccd-40f3-45c5-8400-7d9b77ea3a96", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "d3ea9b94-c604-46e1-8b6f-cb2e867f23f3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA8YAAAHqCAYAAADGYFelAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAx25JREFUeJzs3XlcTfn/B/DXbd9XWqS0K4oQRpZKTJTEyL6UGsbXTmSXPftOjKVikL2xG0vWLIUsI1tT1gxCFNFyfn94dH6udvu4r+fjcR8z95zP+Xze59xzj9738zmfIxEEQQARERERERGRjJL71gEQERERERERfUtMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIxJpkRGRkIikYgvBQUFVK5cGT179sT9+/e/dXif7OrVq5gwYQJSU1PLvM2ZM2fQtm1bmJmZQVlZGYaGhmjQoAGCg4Olyi1duhSRkZGfN+AimJubo1WrVl+0jQkTJkAikZRaLiAgQOp8kZeXR+XKldGhQwdcuXLli8b4PnNzc6k4NDQ0UL9+faxZs0aqnJubG9zc3D6qjWnTpiEmJubTg/3A06dP0alTJxgYGEAikaBNmzbFlnVzcxP3UU5ODpqamrC2tkb79u2xZcsW5OfnF9rG3NwcAQEBnz3u79XOnTvh4+MDQ0NDKCkpQU9PDx4eHli3bh1ycnLKXd/X+l5/LgXX8PJc4z6nhQsXQiKRwMHB4Zu0/zV8zPewrNavX4/58+d/vmA/0X/t/CeiL0vhWwdA9C1ERETAzs4Or1+/xrFjxxAWFoajR4/i8uXLUFdX/9bhfbSrV69i4sSJcHNzg7m5eanld+/ejdatW8PNzQ0zZ86EsbEx0tLSkJCQgOjoaMyZM0csu3TpUlSoUEGmkhAAUFVVxeHDhwEAubm5uHXrFqZMmQIXFxckJSXBxMTkq8TRsGFDzJ49GwBw7949zJ49G/7+/sjKysL//ve/T65/2rRp8PPzKzFx/RiTJ0/G9u3bsXr1alhZWUFPT6/E8paWlli3bh0AICsrCykpKYiJiUH79u3RuHFj7Ny5E9ra2mL57du3Q0tL67PG/D0SBAGBgYGIjIyEl5cX5s6dC1NTU2RkZCA2NhZ9+/bFkydPMG
jQoHLV+1/7Xnt7e+PUqVMwNjb+Ju2vXr0aAPD333/jzJkzqF+//jeJ40sr7/ewrNavX48rV65g8ODBnznij/NfO/+J6MtiYkwyycHBAc7OzgAAd3d35OXlYfLkyYiJiUHXrl0/qe5Xr15BTU3tc4T5xc2cORMWFhbYv38/FBT+/3LQqVMnzJw58xtG9v2Qk5PDTz/9JL5v1KgRzMzM4OHhgd27d6N3796f3EZeXh5yc3OhrKxcbBkdHR2pOJo1a4YqVapg7ty5nyUx/lKuXLkCKyurMn+vVFVVpfYTAH799VdEREQgMDAQvXv3xsaNG8V1tWrV+qzxfq9mzZqFyMhITJw4EePHj5da5+Pjg5CQENy6desbRfflvX79GioqKqhYsSIqVqz4TWJISEjAxYsX4e3tjd27d2PVqlWfLTHOyckRRzF9D8r7PSQi+hFwKDURIP4BcPv2bQDvemeWLl0KJycnqKqqQldXF35+fvjnn3+ktnNzc4ODgwOOHTsGFxcXqKmpITAwEADw/PlzBAcHw9LSEsrKyjAwMICXlxeuXbsmbv/27VtMmTIFdnZ2UFZWRsWKFdGzZ088fvxYqp2C4cX79u1D7dq1oaqqCjs7O7H3Ang3xLB9+/YA3iX7BUPhShomlp6ejgoVKhT5x5ic3P9fHszNzfH333/j6NGjYr0FPdLZ2dkIDg6Gk5MTtLW1oaenhwYNGuDPP/8sVGd+fj4WLVokHteCZG/Hjh3Fxgi8+1VfQUEBoaGh4rKDBw/Cw8MDWlpaUFNTQ8OGDXHo0KFC2+7evRtOTk5QVlaGhYWF2Ov6KQp6ShQVFcVljx8/Rt++fVGtWjVoaGjAwMAATZs2xfHjx6W2TU1NhUQiwcyZMzFlyhRYWFhAWVkZsbGx5YpBR0cHVatWFc/Z4jx9+hR9+/aFiYkJlJSUYGlpiTFjxuDNmzdiGYlEgqysLERFRYmfb2lDskurt2A/Dx48iKSkJLHeI0eOlGs/C/Ts2RNeXl7YvHmz1D5/OJQ6Pz8fU6ZMQdWqVcVzrEaNGliwYIFUfTdv3kSXLl1gYGAAZWVl2NvbY8mSJVJlynNub968GfXr14e2tjbU1NRgaWkpXgsKvHjxAsOGDYOFhQWUlJRgYmKCwYMHIysrq8R9z8nJwYwZM2BnZ4dx48YVWcbIyAiNGjUS30+cOBH169eHnp4etLS0ULt2baxatQqCIEgdu+K+1+WJ9/nz5wgKCoKenh40NDTg7e2Nf/75BxKJBBMmTJAqe+LECXh4eEBTUxNqampwcXHB7t27pcoUDJf+66+/EBgYiIoVK0JNTQ1v3rwpdih1Wa4Hjx8/Ru/evWFqaipebxs2bIiDBw+WePwLrFq1CgAwffp0uLi4IDo6Gq9evSpU7v79+2I7SkpKqFSpEvz8/PDvv/8CAI4cOQKJRIK1a9ciODgYJiYmUFZWFn/YWL16NWrWrAkVFRXo6emhbdu2SEpKkmrjn3/+QadOnVCpUiXxFhgPDw8kJiaKZQ4fPgw3Nzfo6+tDVVUVZmZmaNeuXZExl1Vx38MlS5agSZMmMDAwgLq6OhwdHTFz5kyp4f1ubm7YvXs3bt++LXVrSIGynLNl3a+y/Nta0vlf1usIEf1Yvo+fJom+sYI/SAp6In777TdERkZi4MCBmDFjBp4+fYpJkybBxcUFFy9ehKGhobhtWloaunXrhpCQEEybNg1ycnJ4+fIlGjVqhNTUVIwYMQL169dHZmYmjh07hrS0NNjZ2SE/Px++vr44fvw4QkJC4OLigtu3byM0NBRubm5ISEiAqqqq2M7FixcRHByMkSNHwtDQECtXrkRQUBCsra3RpEkTeHt7Y9q0aRg9ejSWLFmC2rVrAwCsrKyK3e8GDRpg5cqVGDhwILp27YratWtLJXsFtm/fDj8/P2hra2Pp0qUAIPZuvnnzBk+fPsWwYcNgYmKCt2/f4uDBg/jll18QERGBHj16iPUEBATgjz/+QFBQECZNmg
QlJSWcP3++2PsFBUHA8OHDsXDhQqxcuVJMgP744w/06NEDvr6+iIqKgqKiIpYvXw5PT0/s378fHh4eAIBDhw7B19cXDRo0QHR0NPLy8jBz5kzxD9Syys3NFf9769YtDB8+HLq6uvD29hbLPH36FAAQGhoKIyMjZGZmYvv27XBzc8OhQ4cKJZoLFy6Era0tZs+eDS0tLdjY2JQrppycHNy+fbvE3rPs7Gy4u7sjOTkZEydORI0aNXD8+HGEhYUhMTFRTEhOnTqFpk2bwt3dXUy8ShqeXJZ6jY2NcerUKfTt2xcZGRnisMxq1aqVaz/f17p1a+zZswfHjx9HlSpViiwzc+ZMTJgwAWPHjkWTJk2Qk5ODa9eu4fnz52KZq1evwsXFBWZmZpgzZw6MjIywf/9+DBw4EE+ePBF/gCnruX3q1Cl07NgRHTt2xIQJE6CiooLbt2+LQ/CBdyNJXF1dce/ePYwePRo1atTA33//jfHjx+Py5cs4ePBgsfe9JyQk4OnTp+jVq1eZ7o0H3v0w8dtvv8HMzAwAcPr0aQwYMAD3798Xe5xL+l6XNd78/Hz4+PggISEBEyZMQO3atXHq1Cm0aNGiUExHjx5F8+bNUaNGDaxatQrKyspYunQpfHx8sGHDBnTs2FGqfGBgILy9vbF27VpkZWUVeW0Cyn496N69O86fP4+pU6fC1tYWz58/x/nz55Genl7q8Xz9+jU2bNiAunXrwsHBAYGBgfj111+xefNm+Pv7i+Xu37+PunXrIicnRzxu6enp2L9/P549eyb1b8eoUaPQoEEDLFu2DHJycjAwMEBYWBhGjx6Nzp07IywsDOnp6ZgwYQIaNGiA+Ph48Trh5eUlXs/MzMzw5MkTxMXFied5amoqvL290bhxY6xevRo6Ojq4f/8+9u3bh7dv337SqKaivofJycno0qWL+CPKxYsXMXXqVFy7dk38AXfp0qXo3bs3kpOTsX379kL1luWcLct+lfXf1pLO/7JcR4joByQQyZCIiAgBgHD69GkhJydHePnypbBr1y6hYsWKgqampvDw4UPh1KlTAgBhzpw5UtvevXtXUFVVFUJCQsRlrq6uAgDh0KFDUmUnTZokABAOHDhQbCwbNmwQAAhbt26VWh4fHy8AEJYuXSouq1KliqCioiLcvn1bXPb69WtBT09P+O2338RlmzdvFgAIsbGxZToeT548ERo1aiQAEAAIioqKgouLixAWFia8fPlSqmz16tUFV1fXUuvMzc0VcnJyhKCgIKFWrVri8mPHjgkAhDFjxpS4fZUqVQRvb2/h1atXQrt27QRtbW3h4MGD4vqsrCxBT09P8PHxkdouLy9PqFmzplCvXj1xWf369YVKlSoJr1+/Fpe9ePFC0NPTE8py+fP39xePzfsvY2Nj4cSJE2U6Dh4eHkLbtm3F5SkpKQIAwcrKSnj79m2pMQjCu2Pi5eUl5OTkCDk5OUJKSooY2/Dhw8Vyrq6uUp/RsmXLBADCpk2bpOqbMWOGAED466+/xGXq6uqCv79/meIpT72urq5C9erVy1RvaWX37t0rABBmzJghLqtSpYpU3K1atRKcnJxKbMfT01OoXLmykJGRIbW8f//+goqKivD06dMityvu3J49e7YAQHj+/HmxbYaFhQlycnJCfHy81PItW7YIAIQ9e/YUu210dLQAQFi2bFmJ+1WcvLw8IScnR5g0aZKgr68v5Ofni+uK+16XNd7du3cLAITw8PBC2wMQQkNDxWU//fSTYGBgIHVtyc3NFRwcHITKlSuLcRVcp3v06FEoroJ1KSkpgiCU73qgoaEhDB48uIQjVbw1a9ZIfQYvX74UNDQ0hMaNG0uVCwwMFBQVFYWrV68WW1dsbKwAQGjSpInU8mfPngmqqqqCl5eX1PI7d+4IysrKQpcuXQRBeHfdBiDMnz+/2DYKPqfExMRy7acgfNz38H0F59uaNWsEeXl5qe+Tt7e3UKVKlVJjKO6cLct+leff1uLO/7
JcR4jox8Oh1CSTfvrpJygqKkJTUxOtWrWCkZER9u7dC0NDQ+zatQsSiQTdunVDbm6u+DIyMkLNmjULDQXV1dVF06ZNpZbt3bsXtra2aNasWbEx7Nq1Czo6OvDx8ZFqx8nJCUZGRoXacXJyEn9JBwAVFRXY2tqWOpS2JPr6+jh+/Dji4+Mxffp0+Pr64saNGxg1ahQcHR3x5MmTMtWzefNmNGzYEBoaGlBQUICioiJWrVolNfxv7969AIB+/fqVWl96ejqaNm2Ks2fPikMvC8TFxeHp06fw9/eXOm75+flo0aIF4uPjkZWVhaysLMTHx+OXX36BioqKuL2mpiZ8fHzKeoigqqqK+Ph4xMfH48yZM9i2bRtsbW3h5eWFU6dOSZVdtmwZateuDRUVFfE4HDp0qNAwSOBdr0txPWBF2bNnDxQVFaGoqAgLCwts2rQJAwYMwJQpU4rd5vDhw1BXV4efn5/U8oKe96KGnpfFl6q3NMIHQyqLUq9ePVy8eBF9+/bF/v378eLFC6n12dnZOHToENq2bQs1NTWpc8jLywvZ2dk4ffq0WL4s53bdunUBAB06dMCmTZuKnOF+165dcHBwgJOTk1Sbnp6enzTEvDiHDx9Gs2bNoK2tDXl5eSgqKmL8+PFIT0/Ho0ePSt2+rPEePXoUwLt9f1/nzp2l3mdlZeHMmTPw8/ODhoaGuFxeXh7du3fHvXv3cP36dalt2rVrV2qcZb0eAO/OjcjISEyZMgWnT58u1yzeq1atgqqqKjp16gQA0NDQQPv27XH8+HHcvHlTLLd37164u7vD3t6+1Do/3L9Tp07h9evXhSaCMjU1RdOmTcXvlZ6eHqysrDBr1izMnTsXFy5cKDRTtJOTE5SUlNC7d29ERUUVug3oUxT1Pbxw4QJat24NfX198Xzr0aMH8vLycOPGjTLVW5Zztiz7Vd5/W4tS2nWEiH5MTIxJJq1Zswbx8fG4cOECHjx4gEuXLqFhw4YAgH///ReCIMDQ0FBMRApep0+fLpQsFjU76uPHj1G5cuUSY/j333/x/PlzKCkpFWrn4cOHhdrR19cvVIeysjJev35d3t0vxNnZGSNGjMDmzZvx4MEDDBkyBKmpqWWagGvbtm3o0KEDTExM8Mcff+DUqVOIj49HYGAgsrOzxXKPHz+GvLw8jIyMSq3zxo0bOHPmDFq2bFnosSgFw6D9/PwKHbcZM2ZAEAQ8ffoUz549Q35+fpHtlSWGAnJycnB2doazszPq1auHtm3bYs+ePVBQUMDQoUPFcgWTYNWvXx9bt27F6dOnER8fjxYtWhT5GZV3Vt1GjRohPj4eCQkJuHr1Kp4/f46FCxdCSUmp2G3S09NhZGRUaPitgYEBFBQUyjSE9GvWW5qCH4EqVapUbJlRo0Zh9uzZOH36NFq2bAl9fX14eHggISFBjD03NxeLFi0qdP54eXkBgPjdK+u53aRJE8TExCA3Nxc9evRA5cqV4eDggA0bNohl/v33X1y6dKlQm5qamhAEocQfoQp+EEtJSSnTcTp79ix+/vlnAMCKFStw8uRJxMfHY8yYMQBQpmtGWeNNT0+HgoJCodnG3x8yDADPnj2DIAhFnvcFn+eH501ZviNlvR4AwMaNG+Hv74+VK1eiQYMG0NPTQ48ePfDw4cMS27h16xaOHTsGb29vCIKA58+f4/nz5+IPQ+/P9VCWa39x+1ew/8Udo4L1EokEhw4dgqenJ2bOnInatWujYsWKGDhwIF6+fAng3S00Bw8ehIGBAfr16wcrKytYWVl9lntkP/we3rlzB40bN8b9+/exYMEC8cfWgnv2y3K+lfWcLct+lfff1qKUdh0hoh8T7zEmmWRvby/OSv2hChUqQCKR4Pjx40XOEvzhsqLu+atYsSLu3btXYgwVKlSAvr4+9u3bV+R6TU3NErf/UhQVFREaGop58+aV6Vm9f/zxBywsLLBx40apY/H+5E7Au2OSl5eHhw8flvoHb4MGDdC+fXsEBQ
UBAMLDw8XJwCpUqAAAWLRoUaFZUwsYGhqKs7wW9UdvaX8Il0ZNTQ1WVla4ePGiuOyPP/6Am5sbwsPDpcoW/KH6obLeK1pAW1u72HO2OPr6+jhz5gwEQZBq79GjR8jNzRWPZXl9qXpLs2PHDkgkEjRp0qTYMgU/WAwdOhTPnz/HwYMHMXr0aHh6euLu3bvQ1dUVeymLG71gYWEBoOznNgD4+vrC19cXb968wenTpxEWFoYuXbrA3NwcDRo0QIUKFaCqqiqVRL2vpGPm7OwMPT09/PnnnwgLCyv13ImOjoaioiJ27dolNVqiPM+pLmu8+vr6yM3NxdOnT6WS4w+/Y7q6upCTk0NaWlqhuh48eCBVZ4GyfEfKej0oKDt//nzMnz8fd+7cwY4dOzBy5Eg8evSo2Osw8C7xFQQBW7ZswZYtWwqtj4qKwpQpUyAvL1+ma3+BD/ev4MfP4o7R+8enSpUq4mRgN27cwKZNmzBhwgS8ffsWy5YtAwA0btwYjRs3Rl5eHhISErBo0SIMHjwYhoaGYs/3x/jwexgTE4OsrCxs27ZN6t7/9ycCK015ztnS9utz/Nta2nXkv/LkCSIqH/YYE32gVatWEAQB9+/fF3sK3385OjqWWkfLli1x48YNqcl3imonPT0deXl5RbZTtWrVcsdekLSXtRe5qD/AAIjDRN/vmSuud1oikUBJSUnqj7yHDx8Wmrm3ZcuWAFAocSyOv78/oqOjxUmO8vLyALx7nq+Ojg6uXr1a5HFzdnaGkpIS1NXVUa9ePWzbtk2qd+/ly5fYuXNnmWIoTmZmJm7dugUDAwNxmUQiKfSjyaVLlwoNt/6aPDw8kJmZWeiPyzVr1ojrC5Rn9EF56v1cIiIisHfvXnTu3FnqloKS6OjowM/PD/369cPTp0+RmpoKNTU1uLu748KFC6hRo0aR509BglLWc/t9ysrKcHV1xYwZMwC8G2IKvPu+JycnQ19fv8g2S3ruuKKiIkaMGIFr165h8uTJRZZ59OgRTp48KcatoKAAeXl5cf3r16+xdu3aIuMt6nMva7yurq4AUOjRPdHR0VLv1dXVUb9+fWzbtk2qvfz8fPzxxx+oXLkybG1tiz0GxSnr9eBDZmZm6N+/P5o3b47z588XW39eXh6ioqJgZWWF2NjYQq/g4GCkpaWJt4q0bNkSsbGxhYaFl0WDBg2gqqqKP/74Q2r5vXv3cPjw4WK/V7a2thg7diwcHR2L3Bd5eXnUr19f7MEtaX9LU9T3sOD78f71TxAErFixotD2Jf07UtZztkBx+1Wef1vLct0r6jpCRD8m9hgTfaBhw4bo3bs3evbsiYSEBDRp0gTq6upIS0vDiRMn4OjoWOpzYwcPHoyNGzfC19cXI0eORL169fD69WscPXoUrVq1gru7Ozp16oR169bBy8sLgwYNQr169aCoqIh79+4hNjYWvr6+aNu2bbliLxh2/Pvvv0NTUxMqKiqwsLAochg2AHh6eqJy5crw8fERZ8pOTEzEnDlzoKGhgUGDBollHR0dER0djY0bN8LS0hIqKipwdHREq1atsG3bNvTt2xd+fn64e/cuJk+eDGNjY6l77xo3bozu3btjypQp+Pfff9GqVSsoKyvjwoULUFNTw4ABAwrF5+fnBzU1Nfj5+YmzwmpoaGDRokXw9/fH06dP4efnBwMDAzx+/BgXL17E48ePxeR78uTJaNGiBZo3b47g4GDk5eVhxowZUFdXF4dXliY/P1+85zQ/Px/379/HwoUL8ezZM6lH0bRq1QqTJ09GaGgoXF1dcf36dUyaNAkWFhbirNZfW48ePbBkyRL4+/sjNTUVjo6OOHHiBKZNmwYvLy+pe+AdHR1x5MgR7Ny5E8bGxtDU1Cz2x5ny1Fter1+/Fo/369ev8c8//yAmJga7du2Cq6ur2BtWHB8fH/E55RUrVsTt27cxf/58VKlSRZzRd8GCBWjUqBEaN26M//3vfzA3N8fLly9x69Yt7Ny5U/xBq6
zn9vjx43Hv3j14eHigcuXKeP78ORYsWABFRUUxcRw8eDC2bt2KJk2aYMiQIahRowby8/Nx584d/PXXXwgODi7xmbjDhw9HUlISQkNDcfbsWXTp0gWmpqbIyMjAsWPH8Pvvv2PixIlo2LAhvL29MXfuXHTp0gW9e/dGeno6Zs+eXeQImOK+12WNt0WLFmjYsCGCg4Px4sUL1KlTB6dOnRJ/JHn/sW9hYWFo3rw53N3dMWzYMCgpKWHp0qW4cuUKNmzYUO5RFADKfD3IyMiAu7s7unTpAjs7O2hqaiI+Ph779u3DL7/8Umz9e/fuxYMHDzBjxowiH2Hm4OCAxYsXY9WqVWjVqhUmTZqEvXv3okmTJhg9ejQcHR3x/Plz7Nu3D0OHDoWdnV2xbeno6GDcuHEYPXo0evTogc6dOyM9PR0TJ06EioqKOFv6pUuX0L9/f7Rv3x42NjZQUlLC4cOHcenSJYwcORLAu/kODh8+DG9vb5iZmSE7O1vs/S/L97M838PmzZtDSUkJnTt3RkhICLKzsxEeHo5nz54VqtfR0RHbtm1DeHg46tSpI96qUtZztiz7VZ5/W4s7/8tyHSGiH9C3mPGL6FspmNH0w5lWi7J69Wqhfv36grq6uqCqqipYWVkJPXr0EBISEsQyJc3e+ezZM2HQoEGCmZmZoKioKBgYGAje3t7CtWvXxDI5OTnC7NmzhZo1awoqKiqChoaGYGdnJ/z222/CzZs3xXIFMzV/6MNZiAVBEObPny9YWFgI8vLyAgAhIiKi2H3cuHGj0KVLF8HGxkbQ0NAQFBUVBTMzM6F79+6FZlVNTU0Vfv75Z0FTU1MAIDWz6PTp0wVzc3NBWVlZsLe3F1asWCGEhoYWmvk5Ly9PmDdvnuDg4CAoKSkJ2traQoMGDYSdO3eWuK+xsbGChoaG0KJFC+HVq1eCIAjC0aNHBW9vb0FPT09QVFQUTExMBG9vb2Hz5s1S2+7YsUOoUaOGoKSkJJiZmQnTp08vMraiFDUrtYGBgeDq6ips375dquybN2+EYcOGCSYmJoKKiopQu3ZtISYmRvD395c6VgWzUs+aNavU9ks6JkUp6nxIT08X+vTpIxgbGwsKCgpClSpVhFGjRgnZ2dlS5RITE4WGDRsKampqAoBSZyAva73lnZX6/WOtrq4uWFpaCn5+fsLmzZuFvLy8Qtt8OCv1nDlzBBcXF6FChQriZx4UFCSkpqZKbZeSkiIEBgYKJiYmgqKiolCxYkXBxcVFmDJlilS5spzbu3btElq2bCmYmJgISkpKgoGBgeDl5SUcP35cqq7MzExh7NixQtWqVcXz39HRURgyZIjw8OHDMh2jP//8U/D29hYqVqwoKCgoCLq6uoK7u7uwbNky4c2bN2K51atXC1WrVhWUlZUFS0tLISwsTFi1apXUjM6CUPL3uqzxPn36VOjZs6ego6MjqKmpCc2bNxdOnz4tABAWLFggFf/x48eFpk2bitfVn376Ser7LwglX6c/nJW6QGnXg+zsbKFPnz5CjRo1BC0tLUFVVVWoWrWqEBoaKmRlZRV7vNu0aSMoKSkJjx49KrZMp06dBAUFBfGY3L17VwgMDBSMjIwERUVFoVKlSkKHDh2Ef//9VxCE/5+V+sNrVYGVK1eK1yxtbW3B19dX+Pvvv8X1//77rxAQECDY2dkJ6urqgoaGhlCjRg1h3rx5Qm5uriAIgnDq1Cmhbdu2QpUqVQRlZWVBX19fcHV1FXbs2FHsfhT4mO/hzp07xX/HTExMhOHDh4uzV7//lISnT58Kfn5+go6OjiCRSKS+R2U5Z8u6X2X9t7W487+s1xEi+rFIBKEM03wSERERldH69evRtWtXnDx5Ei4uLt86HCIiolIxMSYiIqKPtmHDBty/fx+Ojo6Qk5PD6dOnMWvWLNSqVUt8nBMREdH3jvcYExER0UfT1NREdHQ0pkyZgqysLBgbGyMgIKDEZ2wTERF9b9hjTERERERERDKNj2siIi
IiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaZx86z8sPz8fDx48gKamJiQSybcOh4iIiIgIgiDg5cuXqFSpEuTk2A9H/w1MjP/DHjx4AFNT028dBhERERFRIXfv3kXlypW/dRhEZcLE+D9MU1MTwLuLjpaW1jeOhoiIiIgIePHiBUxNTcW/VYn+C5gY/4cVDJ/W0tJiYkxERERE3xXe6kf/JRz0T0RERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMU/jWAdCncwjdDzlltW8dxjeTqtLlm7XtaGH2UdttCsv9zJEU77Dbkq/WVllkP5v7rUP4LnW0GPGtQyD6Ya1UOfStQyD6LjVusvaL1JuVlf9F6iX6kthjTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk08qcGPv4+KBZs2ZFrjt16hQkEgnOnz+P1NRUSCSSIl+nT58GAERGRkotNzY2RocOHZCSklJs+xMmTIBEIkGfPn2klicmJkIikSA1NbWsu/LRYmNj4e7uDj09PaipqcHGxgb+/v7Izc0FABw5cgQSiQQODg7Iy8uT2lZHRweRkZF4+/YtKlSogClTphTZRlhYGCpUqIC3b99+8f0hIiIiIiKiciTGQUFBOHz4MG7fvl1o3erVq+Hk5ITatWuLyw4ePIi0tDSpV506dcT1WlpaSEtLw4MHD7B+/XokJiaidevWhRLK96moqGDVqlW4ceNGWcP+bP7++2+0bNkSdevWxbFjx3D58mUsWrQIioqKyM/PlyqbnJyMNWvWFFmPkpISunXrhsjISAiCUGh9REQEunfvDiUlpS+yH0RERERERCStzIlxq1atYGBggMjISKnlr169wsaNGxEUFCS1XF9fH0ZGRlIvRUVFcb1EIoGRkRGMjY3h7u6O0NBQXLlyBbdu3So2hqpVq8Ld3R1jx44tMdajR4+iXr16UFZWhrGxMUaOHCn26gKAm5sbBg4ciJCQEOjp6cHIyAgTJkwosc4DBw7A2NgYM2fOhIODA6ysrNCiRQusXLmyUBI7YMAAhIaGIjs7u8i6goKCkJycjGPHjkktP378OG7evFnoWBIREREREdGXU+bEWEFBAT169CjU07l582a8ffsWXbt2/aRAVFVVAQA5OTkllps+fTq2bt2K+Pj4Itffv38fXl5eqFu3Li5evIjw8HCsWrWq0NDlqKgoqKur48yZM5g5cyYmTZqEAwcOFNuukZER0tLSCiWzRRk8eDByc3OxePHiItc7Ojqibt26iIiIkFq+evVq1KtXDw4ODqW2QURERERERJ9HuSbfCgwMRGpqKo4cOSIuW716NX755Rfo6upKlXVxcYGGhobUq7hh0vfu3cOsWbNQuXJl2NralhhD7dq10aFDB4wcObLI9UuXLoWpqSkWL14MOzs7tGnTBhMnTsScOXOkhjzXqF
EDoaGhsLGxQY8ePeDs7IxDhw4V22779u3RuXNnuLq6wtjYGG3btsXixYvx4sWLQmXV1NQQGhqKsLAwZGRkFFlfYGAgtmzZgszMTABAZmYmNm/eXGJv8Zs3b/DixQupFxEREREREX2aciXGdnZ2cHFxwerVqwG8u5f2+PHjCAwMLFR248aNSExMlHrJy8uL6zMyMqChoQF1dXWYmpri7du32LZtW5nurZ0yZQqOHz+Ov/76q9C6pKQkNGjQABKJRFzWsGFDZGZm4t69e+KyGjVqSG1nbGyMR48eAQD69OkjldADgLy8PCIiInDv3j3MnDkTlSpVwtSpU1G9enWkpaUViiMoKAgVKlTAjBkzityHzp07Iz8/Hxs3bhSPlyAI6NSpU7H7HRYWBm1tbfFlampabFkiIiIiIiIqm3I/rikoKAhbt27FixcvEBERgSpVqsDDw6NQOVNTU1hbW0u93qepqYnExERcvnwZmZmZOHfuHOrWrVumGKysrNCrVy+MHDmy0ARWgiBIJcUFywBILX//fueCdQU9ypMmTZJK6N9nYmKC7t27Y8mSJbh69Sqys7OxbNmyQjEqKChgypQpWLBgAR48eFBovba2Nvz8/MTh1BEREfDz84OWllax+z1q1ChkZGSIr7t37xZbloiIiIiIiMqm3Ilxhw4dIC8vj/Xr1yMqKgo9e/YslIiWqWE5OVhbW8PS0hLq6url3n78+PG4ceMGoqOjpZZXq1YNcXFxUglzXFwcNDU1YWJiUqa6DQwMik3o36erqwtjY2NkZWUVub59+/aoXr06Jk6cWOT6oKAgnDx5Ert27cLJkydLnXRLWVkZWlpaUi8iIiIiIiL6NArl3UBDQwMdO3bE6NGjkZGRgYCAgCLLpaen4+HDh1LLdHR0oKKi8lGBfsjQ0BBDhw7FrFmzpJb37dsX8+fPx4ABA9C/f39cv34doaGhGDp0KOTkyv07gGj58uVITExE27ZtYWVlhezsbKxZswZ///03Fi1aVOx206dPh6enZ5HrXF1dYW1tjR49esDa2hpNmjT56PiIiIiIiIjo43xUphgUFIRnz56hWbNmMDMzK7JMs2bNYGxsLPWKiYn5lFgLGT58uHgPcAETExPs2bMHZ8+eRc2aNdGnTx8EBQWV+oin0tSrVw+ZmZno06cPqlevDldXV5w+fRoxMTFwdXUtdrumTZuiadOmUo+Lel9gYCCePXtW5H3aRERERERE9OVJhA9v0qX/jBcvXrybhGvwJsgpq33rcL6ZVJUu36xtR4uifxgqzaawon8o+RIOuy35am2VRfazud86hO9SR4sR3zoEoh/WSpXinzpBJMsaN1n7RerNysqHb+tUZGRk8NY/+s/4+LHFRERERERERD8AJsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTJMIgiB86yDo47x48QLa2trIyMiAlpbWtw6HiIiIiIh/o9J/EnuMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIi
IiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYpfOsA6NM5hO6HnLLatw6DiMogVaXLZ6/T0cLss9f5oU1huV+8DQA47Lbkq7TzMbKfzf3WIfwndbQY8cl1rFQ59BkiIaKyaNxk7SfXkZWV/xkiIfq62GNMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERERERHJNCbGREREREREJNOYGBMREREREZFMY2JMREREREREMo2JMREREREREck0JsZEREREREQk05gYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxh8hMjISOjo64vsJEybAycnpm8VDREREREREH++LJcY+Pj5o1qxZketOnToFiUSC8+fPIzU1FRKJpMjX6dOnAbxLRN9fbmxsjA4dOiAlJaXEGF68eIExY8bAzs4OKioqMDIyQrNmzbBt2zYIgvDZ9nXYsGE4dOjQZ6uPiIiIiIiIvh6FL1VxUFAQfvnlF9y+fRtVqlSRWrd69Wo4OTmhdu3aSE1NBQAcPHgQ1atXlyqnr68v/r+WlhauX78OQRBw7do1/Pbbb2jdujUSExMhLy9fqP3nz5+jUaNGyMjIwJQpU1C3bl0oKCjg6NGjCAkJQdOmTaV6fT+FhoYGNDQ0PktdRERERERE9HV9sR7jVq1awcDAAJGRkVLLX716hY0bNyIoKEhqub6+PoyMjKReioqK4nqJRAIjIyMYGxvD3d0doaGhuHLlCm7dulVk+6NHj0ZqairOnDkDf39/VKtWDba2tujVqxcSExPFRPbZs2fo0aMHdHV1oaamhpYtW+LmzZtSdUVGRsLMzAxqampo27Yt0tPTpdZ/OJQ6ICAAbdq0wezZs2FsbAx9fX3069cPOTk5Ypm0tDR4e3tDVVUVFhYWWL9+PczNzTF//vyyHmIiIiIiIiL6DL5YYq
ygoIAePXogMjJSatjy5s2b8fbtW3Tt2vWT6ldVVQUAqWSzQH5+PqKjo9G1a1dUqlSp0HoNDQ0oKLzrLA8ICEBCQgJ27NiBU6dOQRAEeHl5ifWeOXMGgYGB6Nu3LxITE+Hu7o4pU6aUGl9sbCySk5MRGxuLqKgoREZGSv1I0KNHDzx48ABHjhzB1q1b8fvvv+PRo0cfcyiIiIiIiIjoE3yxodQAEBgYiFmzZuHIkSNwd3cH8G4Y9S+//AJdXV2psi4uLpCTk87TMzIyihwmfe/ePcyaNQuVK1eGra1tofVPnjzBs2fPYGdnV2J8N2/exI4dO3Dy5Em4uLgAANatWwdTU1PExMSgffv2WLBgATw9PTFy5EgAgK2tLeLi4rBv374S69bV1cXixYshLy8POzs7eHt749ChQ+jVqxeuXbuGgwcPIj4+Hs7OzgCAlStXwsbGpsQ637x5gzdv3ojvX7x4UWJ5IiIiIiIiKt0XnZXazs4OLi4uWL16NQAgOTkZx48fR2BgYKGyGzduRGJiotTr/aQ4IyMDGhoaUFdXh6mpKd6+fYtt27ZBSUmpUF0FPdQSiaTE+JKSkqCgoID69euLy/T19VG1alUkJSWJZRo0aCC13Yfvi1K9enWp+I2NjcUe4evXr0NBQQG1a9cW11tbWxf6seBDYWFh0NbWFl+mpqalxkFEREREREQl+6I9xsC7Sbj69++PJUuWICIiAlWqVIGHh0ehcqamprC2ti62Hk1NTZw/fx5ycnIwNDSEurp6sWUrVqwIXV1dMbktTnEzUwuCICbVHzt79fv3RwPvkvT8/PxS2y3JqFGjMHToUPH9ixcvmBwTERERERF9oi+eGHfo0AGDBg3C+vXrERUVhV69epXak1sUOTm5EhPnD8t27NgRa9euRWhoaKH7jLOysqCsrIxq1aohNzcXZ86cEYdSp6en48aNG7C3twcAVKtWTXxsVIEP35eXnZ0dcnNzceHCBdSpUwcAcOvWLTx//rzE7ZSVlaGsrPxJbRMRERF9DHl5eaioqHzU33H03yEnV3h+nvJSUMhHlSoC3r59i+zs7M8QFdHHUVRULPLW3KJ88cRYQ0MDHTt2xOjRo5GRkYGAgIAiy6Wnp+Phw4dSy3R0dKCiovJR7U6bNg1HjhxB/fr1MXXqVDg7O0NRURHHjx9HWFgY4uPjYWNjA19fX/Tq1QvLly+HpqYmRo4cCRMTE/j6+gIABg4cCBcXF8ycORNt2rTBX3/9Ver9xaWxs7NDs2bN0Lt3b4SHh0NRURHBwcFQVVXlPzZERET03bGxsUGVKlXK/Acm/XepqDh9ch1amsCyZbn4999/8fjx408PiugT6OjowMjIqNQ864snxsC74dSrVq3Czz//DDMzsyLLNGvWrNCyDRs2oFOnTh/Vpq6uLk6fPo3p06djypQpuH37NnR1deHo6IhZs2ZBW1sbABAREYFBgwahVatWePv2LZo0aYI9e/aIQ6F/+uknrFy5EqGhoZgwYQKaNWuGsWPHYvLkyR8VV4E1a9YgKCgITZo0gZGREcLCwvD3339/9A8BRERERF+CjY0NbGxsoKenB0VFRf6I/4NTU3v+yXXk5wsQhByYm5vzxxT6ZgRBwKtXr8R5noyNjUssLxE+9iZa+qzu3bsHU1NTHDx4sMh7sIvy4sWLd5NwDd4EOWW1LxwhEX0OqSpdPnudjhZF/+D4OW0Ky/3ibQDAYbclX6Wdj5H9bO63DuE/qaPFiE+uY6XKoc8QCX0MBQUFuLm5wcjICGpq/FtDFmhopH9yHfn5Am7deotatWoxMaZvLj09HY8ePYKtrW2J5+NX6TGmwg4fPozMzEw4OjoiLS0NISEhMDc3R5MmTb51aEREREQA3s1vIi8vX2hSUSKi/4qCH/VycnKYGH+PcnJyMHr0aPzzzz/Q1NSEi4sL1q1bx394iIiI6LtRMGyaw6eJ6L+qrNcvJsbfiKenJzw9Pb91GERERERERDJP7lsHQE
RERET0X1W/fn2sWLHiW4dBRJ+IPcZEREREVC4uCy981fbiBtYq9zaDBw/G5s2bxfc6OjpwcnLCmDFjUK1atc8W2549ezgxGdEPgD3GRERERPRDcnd3x4ULF3DhwgVs3LgR8vLy8Pf3/6xt6OvrQ1VV9bPWSURfHxNjIiIiIvohKSkpwcDAAAYGBnBwcEC/fv3w4MEDpKe/eyRRWloa+vTpg2rVqqF69ero2bMn7t69K24/ePBgBAYGYtmyZahVqxaqV6+O0aNHIycnRyzz4VDqW7duoU2bNrC0tISbmxuOHTsGExMT7Nu3DwBw9+5dmJiYYM+ePfDz84OVlRWaNWuGhISEr3RUiKgoTIyJiIiI6IeXlZWFbdu2wdzcHLq6unj9+jXat28PdXV1bN26FTExMVBXV0fXrl3x9u1bcbu4uDikpqZi8+bNmD9/PjZt2oRNmzYV2UZ+fj4CAwOhqqqKnTt3YubMmZg5c2aRZWfMmIE+ffrgr7/+gqWlJfr164fc3K/zzHgiKoz3GBMRERHRD+ngwYOwsbEBALx69QqGhoaIioqCnJwc/vzzT8jJyWH27Nni41zmzp0Le3t7nDp1Cq6urgAAbW1tTJ06FfLy8rC2toaHhwdOnDiBrl27Fmrv6NGjuH37NrZs2QIDAwMAQEhICDp37lyobJ8+fdCsWTMAwLBhw+Du7o7U1FRYW1t/kWNBRCVjjzERERER/ZBcXFzw119/4a+//sKuXbvQpEkTdOvWDffu3cOlS5eQmpoKW1tb2NjYwMbGBtWrV8ebN2+Qmpoq1mFrawt5eXnxvaGhIZ48eVJke8nJyahUqZKYFANArVpFTxxmb28v/n9B+eLqJaIvjz3GRERERPRDUlNTg4WFhfi+Ro0asLOzw7p165Cfn48aNWpg0aJFhbbT19cX/19RUbHQekEQimxPEASx97k0Cgr//2d4wTb5+fll2paIPj8mxkREREQkEyQSCeTk5JCdnQ1HR0fs3LkTFSpUgKam5mep39raGvfv38fjx49RsWJFAEBiYuJnqZuIviwOpSYiIiKiH9Lbt2/x6NEjPHr0CDdv3sTYsWORlZWF5s2b45dffoGuri569uyJM2fO4M6dOzh16hTGjx+PBw8efFR7TZo0QZUqVTB48GBcvXoV8fHxmDFjBgCUuSeZiL4N9hgTERER0Q8pNjZWvMdXQ0MD1tbWWL58OVxcXAAA27Ztw9SpU/Hrr78iKysLRkZGaNSo0Uf3IMvLy2P16tUYNmwYvL29YWZmhrFjxyIgIADKysqfbb+I6PNjYkxERERE5RI3sOgJpb4n8+fPx/z580ssY2BggAULFpRYx4cmTZok9f7MmTNS762trRETEyO+j4+PBwCYm5sDAExNTXH//n2pbbS1tQstI6Kvi4kxEREREdFnsnfvXqirq8PCwgIpKSkIDQ1F3bp1xcSYiL5PTIyJiIiIiD6TzMxMTJkyBWlpadDV1UXjxo0xfvz4bx0WEZWCiTERERER0WfSvn17tG/f/luHQUTlxFmpiYiIiIiISKYxMSYiIiIiIiKZxsSYiIiIiIiIZBoTYyIiIiIiIpJpTIyJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiI6BPVr18fK1asEN+bmJhg37593zCiz+v48Xhoa9fA8+cvvnUoMmPChAlwcnL6rHUeOXIEEokEz58//yz1paamQiKRIDEx8bPU9y3xOcZEREREVC6Vfrf/qu096J1U7m0ePXqEhQsX4tChQ3j48CH09fVRvXp1/Prrr2jcuPEXiFLahQsXoK2t/cXb+VHcunUL/fr1w6lTp6CgoID69etjzZo1qFixYonbRUZGYvDgwUUmehKJBNu3b0ebNm2+TNBlNGHCBMTExHwXyaOLiwvS0tJ4bhaBiTERERER/VDu3r2LNm3aQEtLC2PGjIG9vT1yc3Nx5MgRjBkzBseOHfuoevPy8iCRSCAnV/qgSwMDg49q40f29u1bKCkpFb
mud+/eyMjIwNGjR6GmpoZTp05BEISvHOGPT0lJCUZGRt86jO8Sh1ITERER0Q9l9OjRAIDdu3ejVatWsLKyQtWqVfHbb79h586dYrnly5fDw8MD1tbWcHZ2xqhRo5CVlSWu37hxI+zt7XHgwAG4ubnBwsIC9+7dw5MnT+Dv7w8rKyv89NNP2LZtW6EYPhxKnZSUhPbt28PKygrVq1dHSEiI2NaRI0dgaWmJjIwMqTrGjRuHdu3aAQCePn2Kvn37ok6dOrCysoKHhwdiYmKkyu/atQseHh5iGx07dsSrV6/E9dHR0XB3d4eFhQVq1aqFMWPGAHj3Q4KJiQmuXLkiln3+/AW0tWvg+PH4Io/x06fPERgYAnv7ZjAyqocGDX7Bli17pMo0bdoU/fv3x9ChQ1GhQgU0b968yLoAQE5ODp6enqhVqxaqVq2KgICAz/7jwv3799GxY0fo6upCX18fvr6+SE1NFdcHBASgTZs2mDZtGgwNDaGjo4OJEyciNzcXw4cPh56eHipXrozVq1dL1TtixAjY2tpCTU0NlpaWGDduHHJycgC869GeOHEiLl68CIlEAolEgsjISADAnTt34OvrCw0NDWhpaaFDhw74999/i40/Pz8fkyZNQuXKlaGsrAwnJ6dCw/Xj4uLg5OQEFRUVODs7IyYmRmqoc1FDqU+ePAlXV1eoqalBV1cXnp6eePbsGQBg3759aNSoEXR0dKCvr49WrVohOTn5Iz+B7xsTYyIiIiL6YTx79gyxsbEICAiAmppaofXvDyGVk5PDpEmTcPjwYcyfPx8nT57ElClTpMq/fv0aixcvxqxZs3D48GFUqFABQ4YMwb1797Bx40b8/vvviIqKwpMnT4qN6fXr1+jWrRt0dHSwe/duLF++HMePHxcT08aNG0NLSwt79vx/YpmXl4edO3eibdu2AIA3b96gRo0aiIqKwuHDh9G1a1cMHDgQ58+fBwD8+++/6NevHzp27IgjR45gy5YtaNmypdjrGhUVhTFjxqBr1644ePAgIiIiYG5u/nEHGUB29hs4OVXDxo2LcerUNgQE+KF37zFISLgkVS4qKgoKCgo4efIkli9fXmx9vr6+WLp0qbg/n9urV6/g7u4ODQ0NHDt2DCdOnICGhgZatGiBt2/fiuUOHz6MBw8e4NixY5g7dy4mTJiAVq1aQVdXF2fOnEGfPn3Qp08f3L17V9xGU1MTkZGRuHr1KhYsWIAVK1Zg3rx5AICOHTsiODgY1atXR1paGtLS0tCxY0cIgoA2bdrg6dOnOHr0KA4cOIDk5GR07Nix2H1YsGAB5syZg9mzZ+PSpUvw9PRE69atcfPmTQDAy5cv4ePjA0dHR5w/fx6TJ0/GiBEjSjwuiYmJ8PDwQPXq1XHq1CmcOHECPj4+yMvLAwBkZWVh6NChiI+Px6FDhyAnJ4e2bdsiPz//oz+L7xWHUhMRERHRDyM1NRWCIMDa2rrUsr169RL/38zMDMOHD8eoUaMQFhYmLs/JycG0adNQvXp1AEBycjIOHz6MnTt3onbt2gCAOXPmwNXVtdh2tm3bhuzsbCxYsEBM1qdMmYKAgACMGTMGFStWROvWrbF9+3Z07twZAHDixAlkZGSgVatWAABjY2P06dNHrDMwMBCxsbHYtWsXateujUePHiE3NxdeXl6oXLkyAMDe/v/vBV+4cCF69+6NX3/9VVz2KRM7VapkiIEDA8T3v/3WBQcPnsT27X+hdm1Hcbm1tTVmzpxZYl2HDx/GyJEjMXHiRLRq1QrR0dFo0qQJAGDLli3o2bMnXr58Wez2GRkZ0NDQKLGN6OhoyMnJYeXKlZBIJACAiIgI6Ojo4MiRI/j5558BAHp6eli4cCHk5ORQtWpVzJw5E69evRJHIYwaNQrTp0/HyZMn0alTJwDA2LFjxXbMzc0RHByMjRs3IiQkBKqqqtDQ0ICCgoLUEOYDBw7g0qVLSElJgampKQBg7dq1qF69OuLj41G3bt1C+zB79m
yMGDFCbHfGjBmIjY3F/PnzsWTJEqxbtw4SiQQrVqyAiooKqlWrhvv370ud5x+aOXMmnJ2dsXTpUnFZwbkOQByxUGDVqlUwMDDA1atX4eDgUOIx/69hYvwDuDLRE1paWt86DCIqk4zSi5TT5c9eYxH8v0YjwNedzqe8mn7rAGTWBHz5iZKoaNnZ2UhJSYGBgQFUVFS+WRyVKlUqc9mCnjx9ff1St4uNjcW0adNw9epVvHjxArm5ucjOzoa2tjbU1dWhq6sLJSUlNGvWTEym4uPjoaCggJYtW0JeXl6MT0dHB9ra2lJt6unpoVKlSkhLS4OTk5NUst66dWvk5+cjIyMDNWvWRO/evdGgQQOxvn379sHLywvVqlUD8K4Hefr06di4cSPu37+PN2/e4M2bN+J+GhoawsPDA82aNYOnpyd+/vln+Pn5QVdXF48ePcLDhw/Rpk2bIo9JQY+pgYEBtLTeJbX5+c8BAOrqltDScoS6ejoAQEurOrS0dIqNR1vbCJqa1QFcAAA4OzuX+pmNHDkS/fr1w7Bhw+Dg4AAfHx+sXbsWrVu3xpUrV9CoUaMSt9fU1Cyyp9nGxkb8/3PnzuHWrVvQ1NSUKpOdnS01NLh69epS95AbGhpKJYDy8vLQ19fHo0ePxGVbtmzB/PnzcevWLWRmZiI3N7fUv82TkpJgamoqJsUAUK1aNejo6CApKalQYvzixQs8ePAADRs2lFresGFDXLx4EQBw/fp11KhRQ+q7Wq9evRLjSExMRPv27Ytdn5ycjHHjxuH06dN48uSJ2FN8586dHy4x5lBqIiIiIvph2NjYQCKRICmp5Jmsb9++DS8vLzg4OGDr1q04d+4clixZAgDi/aEAoKqqKibFAMShye8vK40gCMWWL1her149WFlZITo6Gq9fv8b27dvRrVs3sdycOXMwb948hISE4PDhw0hMTISnp6eY1MrLy+PAgQPYu3cvqlWrhkWLFqFq1apISUmBqqpqifEVJILvT3b1/jEoSmnxFFBXVy+xHgC4dOkSatWqBQBo0aIFVq9ejQ4dOmDlypWIiIhAz549S43f2tq60Ot9+fn5qFOnDhITE6VeN27cQJcuXcRyioqKUttJJJIilxUkiKdPn0anTp3QsmVL7Nq1CxcuXMCYMWMKHYcPFXdOlHSuFLRdXPmiti1tArPSzg0fHx+kp6djxYoVOHPmDM6cOQMApe7ffxETYyIiIiL6Yejp6cHT0xNLliyRmkirQMGkQwkJCcjNzcWcOXPw008/wdbWFg8ePCi1/oIZrhMSEsRl169fL/G5sNWqVUNiYqJUPCdPnoScnBxsbW3FZV26dMG6deuwc+dOyMnJwdvbW1x3/Phx+Pr6olu3bqhZsyYsLS3Fe0sLSCQSNGzYEBMnTsSFCxegpKSE7du3Q1NTE+bm5jh06FCR8RU8EiktLU1cVtqjhcoST1mZmJhIzRTerl07LF++HL1794aurm6JPZplVbt2bdy8eRMGBgaFEuhPeXTRyZMnUaVKFYwZMwbOzs6wsbHB7du3pcooKSmJ9+wWqFatGu7cuSN1r/LVq1eRkZEhNQS+gJaWFipVqoQTJ05ILY+LixPL29nZ4dKlS3jz5o24/v3ztCg1atQo9rxIT09HUlISxo4dCw8PD9jb24uTcv2ImBgTERER0Q9l6dKlyMvLQ7169bB161bcvHkTSUlJWLhwoThc2crKCrm5uVi0aBH++ecfrF27FsuWLSu17qpVq6JFixbo1asXzpw5g3PnzuHXX38tseeta9euUFFRgb+/P65cuYLY2FgMGDAA3bt3h6GhoVS58+fPY+rUqfDz85MaEmttbY0DBw4gLi4OSUlJ+O233/Dw4UNx/ZkzZzBt2jQkJCTgzp072LZtGx4/fiwmTRMmTMCcOXOwcOFC3Lx5E+fPn8eiRYsAvOs1/OmnnzB9+nRcvXoVx44dk7pvtiilxVMeISEh+P333zFx4kRcv34dZ8+exaFDh6Cmpo
Zr164VSgY/RteuXVGhQgX4+vri+PHjSElJwdGjRzFo0CDcu3fvo+u1trbGnTt3EB0djeTkZCxcuBDbt2+XKmNubo6UlBQkJibiyZMnePPmDZo1a4YaNWqIn/nZs2fRo0cPuLq6Fjv8fPjw4ZgxYwY2btyI69evY+TIkUhMTMSgQYMAvPthJT8/H71790ZSUhL279+P2bNnAyh+hMOoUaMQHx+Pvn374tKlS7h27RrCw8Px5MkTcfbu33//Hbdu3cLhw4cxdOjQjz5W3zsmxkRERET0Q7GwsMD58+fh7u6O4OBgODg4oHnz5jh06BDCw8MBvJt4au7cuZgxYwYcHBywbt06qUm3ShIREQFTU1O4urril19+Qe/evUt8tJCamhr279+Pp0+fom7duvDz84OHhwcWL14sVc7GxgZ169bFpUuX0LVrV6l148aNQ+3ateHp6Qk3NzcYGRmhTZs24notLS0cO3YMXl5esLW1xdixYzFnzhy0bNkSAODv74/58+dj6dKlqF69Olq1aiXVw7t69Wrk5OTA2dkZgwYNKjQ794dKi6c8fvvtN2zcuFGc0MzHxwcKCgq4du0aunfvjrZt2350b3QBNTU1HDt2DGZmZvjll19gb2+PwMBAvH79+pPm6vH19cWQIUPQv39/ODk5IS4uDuPGjZMq065dO7Ro0QLu7u6oWLEiNmzYAIlEgpiYGOjq6qJJkyZo1qwZLC0tsXHjxmLbGjhwIIKDgxEcHAxHR0fs27cPO3bsEO+l1tLSws6dO5GYmAgnJyeMGTMG48ePB4Bi5wiwtbXFX3/9hYsXL6JevXpo0KAB/vzzTygoKEBOTg7R0dE4d+4cHBwcMGTIEMyaNeujj9X3TiLwydn/WS9evIC2tjYyMjI4+RYRERF9dgWTb1lYWHzTybfovyUvLw8XLlxArVq1xAnK6NtYt24devbsiYyMjFLvJ/5RlfU6xlmpiYiIiIiIfgBr1qyBpaUlTExMcPHiRYwYMQIdOnSQ2aS4PJgYExERERER/QAePnyI8ePH4+HDhzA2Nkb79u0xderUbx3WfwITYyIiIiIioh9ASEgIQkJCvnUY/0mcfIuIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIj+Y9zc3DB48ODPWueECRPg5OT02eqLioqCjo7OZ6uPyicgIABt2rT55Hq+xLn2PeJzjImIiIioXByjHL9qe5f9L5erfEBAAJ4/f46YmJgvE9APatiwYRgwYMBXb3fOnDlYtGgR/v33X5iZmSE4OBi9e/cudTtzc3MMHjy4UNI2YcIExMTEIDEx8csEXA4SiQTbt2//LAlqeS1YsACCIJS5/JEjR+Du7o5nz55J/aCxbds2KCoqfoEIvy9MjImIiIiICBoaGtDQ0PiqbR47dgzDhg3DwoUL4ePjg7t37+LJkydfNYYflba29mepR09P77PU873jUGoiIiIi+qG5ublhwIABGDx4MHR1dWFoaIjff/8dWVlZ6NmzJzQ1NWFlZYW9e/eK2xw5cgQSiQT79+9HrVq1oKqqiqZNm+LRo0fYu3cv7O3toaWlhc6dO+PVq1fidubm5pg/f75U+05OTpgwYYL4XiKRYOXKlWjbti3U1NRgY2ODHTt2SG1z9OhR1KtXD8rKyjA2NsbIkSORm5tb7D4+e/YMPXr0gK6uLtTU1NCyZUvcvHlTqsyKFStgamoKNTU1tG3bFnPnzpXqGSxqKPXq1atRvXp1MY7+/fuL6+bOnQtHR0eoq6vD1NQUffv2RWZmZrExFkVOTg7y8vIICgqCubk5GjdujLZt25arjrKIiIiAvb09VFRUYGdnh6VLl4rrUlNTIZFIsGnTJjRu3BiqqqqoW7cubty4gfj4eDg7O0NDQwMtWrTA48ePxe3i4+PRvHlzVKhQAdra2nB1dcX58+fF9ebm5gCAtm3bQiKRiO
8BIDw8HFZWVlBSUkLVqlWxdu1aqXglEgnCw8PRsmVLqKqqwsLCAps3b5Yqc/nyZTRt2hSqqqrQ19dH7969pY7/h0OpBUHAzJkzYWlpCVVVVdSsWRNbtmwRj4G7uzsAQFdXFxKJBAEBAQAKD6V+8+YNQkJCYGpqCmVlZdjY2GDVqlVl/zC+U0yMiYiIiOiHFxUVhQoVKuDs2bMYMGAA/ve//6F9+/ZwcXHB+fPn4enpie7du0slucC7ZHHx4sWIi4vD3bt30aFDB8yfPx/r16/H7t27ceDAASxatKjc8UycOBEdOnTApUuX4OXlha5du+Lp06cAgPv378PLywt169bFxYsXER4ejlWrVmHKlCnF1hcQEICEhATs2LEDp06dgiAI8PLyQk5ODgDg5MmT6NOnDwYNGoTExEQ0b94cU6dOLTHG8PBw9OvXD71798bly5exY8cOWFtbi+vl5OSwcOFCXLlyBVFRUTh8+DBCQkLKdRxq1aoFExMT9O3bF/n5+eXatqxWrFiBMWPGYOrUqUhKSsK0adMwbtw4REVFSZULDQ3F2LFjcf78eSgoKKBz584ICQnBggULcPz4cSQnJ2P8+PFi+ZcvX8Lf3x/Hjx/H6dOnYWNjAy8vL7x8+RLAu8QZeJeUp6Wlie+3b9+OQYMGITg4GFeuXMFvv/2Gnj17IjY2ViqecePGoV27drh48SK6deuGzp07IykpCQDw6tUrtGjRArq6uoiPj8fmzZtx8OBBqR8uPjR27FhEREQgPDwcf//9N4YMGYJu3brh6NGjMDU1xdatWwEA169fR1paGhYsWFBkPT169EB0dDQWLlyIpKQkLFu27KuPNPgiBPrPysjIEAAIGRkZ3zoUIiIi+gG9fv1auHr1qvD69Wup5Q6RDl/1VV7+/v6Cr6+v+N7V1VVo1KiR+D43N1dQV1cXunfvLi5LS0sTAAinTp0SBEEQYmNjBQDCwYMHxTJhYWECACE5OVlc9ttvvwmenp7i+ypVqgjz5s2TiqdmzZpCaGio+B6AMHbsWPF9ZmamIJFIhL179wqCIAijR48WqlatKuTn54tllixZImhoaAh5eXniPg0aNEgQBEG4ceOGAEA4efKkWP7JkyeCqqqqsGnTJkEQBKFjx46Ct7e3VFxdu3YVtLW1xfehoaFCzZo1xfeVKlUSxowZI5TVpk2bBH19fSE3N1eIj48XVq1aJVX/h/Ly8gQPDw/Bx8dH8PX1FTp27Ci8efNGXF+9enVh9uzZxW5fpUoVQUlJSVBXV5d6KSoqSu2HqampsH79eqltJ0+eLDRo0EAQBEFISUkRAAgrV64U12/YsEEAIBw6dEhcFhYWJlStWrXYeHJzcwVNTU1h586d4jIAwvbt26XKubi4CL169ZJa1r59e8HLy0tquz59+kiVqV+/vvC///1PEARB+P333wVdXV0hMzNTXL97925BTk5OePjwoSAI0t+DzMxMQUVFRYiLi5OqMygoSOjcubMgCP9/zj979kyqzPvn2vXr1wUAwoEDB4o9Dt+b4q5jH2KPMRERERH98GrUqCH+v7y8PPT19eHo+P+TiBkaGgIAHj16VOx2hoaGUFNTg6WlpdSyD7cpbzzq6urQ1NQU60lKSkKDBg0gkUjEMg0bNkRmZibu3btXqK6kpCQoKCigfv364jJ9fX1UrVpV7GG8fv066tWrJ7Xdh+/f9+jRIzx48AAeHh7FlomNjUXz5s1hYmICTU1N9OjRA+np6cjKyipl79/Zt28fTp48icjISGzcuBHp6enw8fFBVlYWsrOzkZycjEaNGpVYx/Dhw5GYmCj16tOnj7j+8ePHuHv3LoKCgsR7qDU0NDBlyhQkJydL1fXhZw2g0Dny/mf96NEj9OnTB7a2ttDW1oa2tjYyMzNx586dEmNOSkpCw4YNpZY1bNhQ/KwKNGjQoND7gjJJSUmoWbMm1NXVperIz8/H9evXC7V59epVZGdno3nz5lLHYc2aNYWOQ0kSEx
MhLy8PV1fXMm/zX8HJt4iIiIjoh/fhrLoSiURqWUES+uFw3g/LFFXP+9vIyckVmgm4YDhzafEU1CMIglRSXLDs/TiLWlfU8oLyJdVZFFVV1WLXAcDt27fh5eWFPn36YPLkydDT08OJEycQFBRU5P4W5dKlSzAzMxMnd4qJicHPP/8MDw8PtGnTBpaWliUm7wBQoUIFqeHdgPRkUQXHdMWKFVI/HADvfiB5X1Hnw4fL3v+sAwIC8PjxY8yfPx9VqlSBsrIyGjRogLdv35a670V9FkV9tsVtV1L5opYXxL17926YmJhIrVNWVi613QKlnRf/ZewxJiIiIiL6TCpWrIi0tDTx/YsXL5CSklKuOqpVq4a4uDipxDUuLg6ampqFkpqC8rm5uThz5oy4LD09HTdu3IC9vT0AwM7ODmfPnpXaLiEhodgYNDU1YW5ujkOHDhW5PiEhAbm5uZgzZw5++ukn2Nra4sGDB+XaTxMTE6SkpIi94Orq6tizZw/evn2LUaNGYcqUKWVKFktiaGgIExMT/PPPP7C2tpZ6WVhYfFLdx48fx8CBA+Hl5SVOUPbhjNqKiorIy8uTWmZvb48TJ05ILYuLixM/qwKnT58u9N7Ozg7Au888MTFRqnf+5MmTkJOTg62tbaFYq1WrBmVlZdy5c6fQcTA1NQUAKCkpAUCheN/n6OiI/Px8HD16tNgy/1VMjImIiIiIPpOmTZti7dq1OH78OK5cuQJ/f/9CPZOl6du3L+7evYsBAwbg2rVr+PPPPxEaGoqhQ4dCTq7wn+82Njbw9fVFr169cOLECXGyJhMTE/j6+gIABgwYgD179mDu3Lm4efMmli9fjr1795aYeE6YMAFz5szBwoULcfPmTZw/f16caMzKygq5ublYtGgR/vnnH6xduxbLli0r1362a9cOZmZm8Pb2xsGDB3Hr1i3s3LkTaWlpUFdXx+rVqz/LhFwTJkxAWFgYFixYgBs3buDy5cuIiIjA3LlzP6lea2trrF27FklJSThz5gy6du1aqEe14MeFhw8f4tmzZwDeDf+OjIzEsmXLcPPmTcydOxfbtm3DsGHDpLbdvHkzVq9ejRs3biA0NBRnz54VJ9fq2rUrVFRU4O/vjytXriA2NhYDBgxA9+7dxWHg79PU1MSwYcMwZMgQREVFITk5GRcuXMCSJUvESciqVKkCiUSCXbt24fHjx0XOMG5ubg5/f38EBgYiJiYGKSkpOHLkCDZt2vRJx/J7wMSYiIiIiOgzGTVqFJo0aYJWrVrBy8sLbdq0gZWVVbnqMDExwZ49e3D27FnUrFkTffr0QVBQEMaOHVvsNhEREahTpw5atWqFBg0aQBAE7NmzRxwK3LBhQyxbtgxz585FzZo1sW/fPgwZMgQqKirF1unv74/58+dj6dKlqF69Olq1aiU+AsrJyQlz587FjBkz4ODggHXr1iEsLKxc+6mmpoa4uDg4OzujZ8+ecHBwwLx58zBz5kzEx8fj6NGjUo8J+li//vorVq5cicjISDg6OsLV1RWRkZGf3GO8evVqPHv2DLVq1UL37t0xcOBAGBgYSJWZM2cODhw4AFNTU9SqVQsA0KZNGyxYsACzZs1C9erVsXz5ckRERMDNzU1q24kTJyI6Oho1atRAVFQU1q1bh2rVqgF4d+z279+Pp0+fom7duvDz84OHhwcWL15cbLyTJ0/G+PHjERYWBnt7e3h6emLnzp3icTAxMcHEiRMxcuRIGBoaFjvDdXh4OPz8/NC3b1/Y2dmhV69eZb6v/HsmEUq6uYC+ay9evIC2tjYyMjKgpaX1rcMhIiKiH0x2djZSUlJgYWFRYgJF/029evXCtWvXcPz48c9ab15eHi5cuIBatWqVu7ec3pFIJNi+fbvUc4jp45T1OsbJt4iIiIiIZMDs2bPRvHlzqKurY+/evYiKisLSpUu/dVhE3wUmxkREREREMuDs2bOYOXMmXr58CUtLSyxcuBC//vrrtw6L6LvAxJiIiI
iISAb8CBMkyQre7fr1cfItIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimcbEmIiIiIiIiGQaE2MiIiIiIiKSaUyMiYiIiIiISKYxMSYiIiIi+gJSU1MhkUiQmJj4SfW4ublh8ODB4ntzc3PMnz//k+r83CQSCWJiYr5oGxMmTICTk9MXbeNT2y7vZxMZGQkdHZ2Pjos+Hz7HmIiIiIjKJcnO/qu2Z38tqVzlAwICEBUVhd9++w3Lli2TWte3b1+Eh4fD398fkZGRZarvyJEjcHd3x7Nnz76LJCY+Ph7q6uqfXI+FhQXCw8PRokWLQuvMzc1x+/ZtbNiwAZ06dZJaV716dVy9ehUREREICAgAAKSlpUFXVxfAux8E6tati4SEBNSpU+eT4/wv+VyfDX197DEmIiIioh+OqakpoqOj8fr1a3FZdnY2NmzYADMzs28Y2aerWLEi1NTUPqmOS5cuIT09He7u7sWWMTU1RUREhNSy06dP4+HDh4WSPyMjIygrK39STP9lb9++BfB5Phv6NpgYExEREdEPp3bt2jAzM8O2bdvEZdu2bYOpqSlq1aolVVYQBMycOROWlpZQVVVFzZo1sWXLFgDvej8LkkddXV1IJBKxl3Tfvn1o1KgRdHR0oK+vj1atWiE5OblQLNeuXYOLiwtUVFRQvXp1HDlyRGr90aNHUa9ePSgrK8PY2BgjR45Ebm5usfv24XDd58+fo3fv3jA0NISKigocHBywa9euEo/Pn3/+CU9PzxKT2a5du+Lo0aO4e/euuGz16tXo2rUrFBSkB56+P5Ta2toaAODs7AyJRAI3NzcAQH5+PiZNmoTKlStDWVkZTk5O2Ldvn1Q99+7dQ6dOnaCnpwd1dXU4OzvjzJkzUmXWrl0Lc3NzaGtro1OnTnj58qW47s2bNxg4cCAMDAygoqKCRo0aIT4+Xlx/5MgRSCQSHDp0CM7OzlBTU4OLiwuuX78u1cb06dNhaGgITU1NBAUFITs7W2p9QEAA2rRpg7CwMFSqVAm2trYACn82c+fOhaOjI9TV1WFqaoq+ffsiMzOz2GNO3w4TYyIiIiL6IfXs2VOqx3P16tUIDAwsVG7s2LGIiIhAeHg4/v77bwwZMgTdunXD0aNHYWpqiq1btwIArl+/jrS0NCxYsAAAkJWVhaFDhyI+Ph6HDh2CnJwc2rZti/z8fKn6hw8fjuDgYFy4cAEuLi5o3bo10tPTAQD379+Hl5cX6tati4sXLyI8PByrVq3ClClTyrSP+fn5aNmyJeLi4vDHH3/g6tWrmD59OuTl5UvcbseOHfD19S2xjKGhITw9PREVFQUAePXqFTZu3FjkMXzfqVOnAAD79+9HWlqa+OPEggULMGfOHMyePRuXLl2Cp6cnWrdujZs3bwIAMjMz4erqigcPHmDHjh24ePEiQkJCpI5ncnIyYmJisGvXLuzatQtHjx7F9OnTxfUhISHYunUroqKicP78eVhbW8PT0xNPnz6VinHMmDGYM2cOEhISoKCgILVPmzZtQmhoKKZOnYqEhAQYGxtj6dKlhfbz0KFDSEpKwoEDB4r9IUJOTg4LFy7ElStXEBUVhcOHDyMkJKTE40ffBu8xJiIiIqIfUvfu3TFq1ChxEqyTJ08iOjpaqsc2KysLc+fOxeHDh9GgQQMAgKWlJU6cOIHly5fD1dUVenp6AAADAwOpe4zbtWsn1d6qVatgYGCAq1evwsHBQVzev39/sWx4eDj27duHVatWISQkBEuXLoWpqSkWL14MiUQCOzs7PHjwACNGjMD48eMhJ1dyP9bBgwdx9uxZJCUlib2WlpaWJW5z//59XLx4EV5eXiUfQACBgYEIDg7GmDFjsGXLFlhZWZU6CVXFihUBAPr6+jAyMhKXz549GyNGjBDvWZ4xYwZiY2Mxf/58LFmyBOvXr8fjx48RHx8vHvOC3ucC+fn5iIyMhKamJoB3n/GhQ4cwdepUZGVlITw8HJGRkWjZsiUAYMWKFT
hw4ABWrVqF4cOHi/VMnToVrq6uAICRI0fC29sb2dnZUFFRwfz58xEYGIhff/0VADBlyhQcPHiwUK+xuro6Vq5cCSUlpWKPxfuTpllYWGDy5Mn43//+V2SiTd8We4yJiIiI6IdUoUIFeHt7IyoqChEREfD29kaFChWkyly9ehXZ2dlo3rw5NDQ0xNeaNWuKHBb9vuTkZHTp0gWWlpbQ0tKChYUFAODOnTtS5QoSbgBQUFCAs7MzkpLeTSiWlJSEBg0aQCKRiGUaNmyIzMxM3Lt3r9R9TExMROXKlcWkuCx27NiBhg0bislnSby9vZGZmYljx44V2+NeFi9evMCDBw/QsGFDqeUNGzYUj0ViYiJq1apVYlzm5uZiUgwAxsbGePToEYB3n0dOTo5UG4qKiqhXr57YRoEaNWpI1QFArKfgM3nfh+8BwNHRscSkGABiY2PRvHlzmJiYQFNTEz169EB6ejqysrJK3I6+PvYYExEREdEPKzAwEP379wcALFmypND6gmG6u3fvhomJidS60iaT8vHxgampKVasWIFKlSohPz8fDg4O4kRMJSlIhAVBkEqKC5a9X6YkqqqqpZb5UFmGURdQUFBA9+7dERoaijNnzmD79u3lbu99Re1rwbKy7IuiomKh+go+w+KOW1HH+P16CtZ9OAS+NKXNPn379m14eXmhT58+mDx5MvT09HDixAkEBQUhJyenXG3Rl8ceYyIiIiL6YbVo0QJv377F27dv4enpWWh9tWrVoKysjDt37sDa2lrqZWpqCgBir2BeXp64XXp6OpKSkjB27Fh4eHjA3t4ez549KzKG06dPi/+fm5uLc+fOwc7OTmw/Li5OTOoAIC4uDpqamoUS9aLUqFED9+7dw40bN8pwNN7dxxsbG4vWrVuXqTzw7seFo0ePwtfXV3wkU0mKOl5aWlqoVKkSTpw4IVU2Li4O9vbvHv9Vo0YNJCYmFrofuKysra2hpKQk1UZOTg4SEhLENsrC3t5e6jMDUOh9WSQkJCA3Nxdz5szBTz/9BFtbWzx48KDc9dDXwR5jIiIiIvphycvLi8Noi5qQSlNTE8OGDcOQIUOQn5+PRo0a4cWLF4iLi4OGhgb8/f1RpUoVSCQS7Nq1C15eXlBVVYWuri709fXx+++/w9jYGHfu3MHIkSOLjGHJkiWwsbGBvb095s2bh2fPnolDkvv27Yv58+djwIAB6N+/P65fv47Q0FAMHTq01PuLAcDV1RVNmjRBu3btMHfuXFhbW+PatWuQSCRFPp943759sLGxKfU+5PfZ29vjyZMnZX4MkYGBAZSVlbF//35UqVIFKioq0NbWxvDhwxEaGirepxwREYHExESsW7cOANC5c2dMmzZNnO3Z2NgYFy5cQKVKlYocyvwhdXV1/O9//8Pw4cOhp6cHMzMzzJw5E69evUJQUFCZ93fQoEHw9/eHs7MzGjVqhHXr1uHvv/8u1zEDACsrK+Tm5mLRokXw8fHByZMnCz1Xm74f7DEmIiIioh+alpYWtLS0il0/efJkjB8/HmFhYbC3t4enpyd27twp3jNsYmKCiRMnYuTIkTA0NET//v0hJyeH6OhonDt3Dg4ODhgyZAhmzZpVZP3Tp0/HjBkzULNmTRw/fhx//vmneK+ziYkJ9uzZg7Nnz6JmzZro06cPgoKCMHbs2DLv39atW1G3bl107twZ1apVQ0hIiFRv7fv+/PPPMg+jfp++vn6Zh20rKChg2LBh4hDzgvYGDhyI4OBgBAcHw9HREfv27cOOHTtgY2MD4F1P819//QUDAwN4eXnB0dGxTDNsv2/69Olo164dunfvjtq1a+PWrVvYv39/mXq6C3Ts2BHjx4/HiBEjUKdOHdy+fRv/+9//yrx9AScnJ8ydOxczZsyAg4MD1q1bh7CwsHLXQ1+HRHh/3Ab9p7x48QLa2trIyMgo8WJPRERE9DGys7ORkpICCwsLqKiofOtw6BPl5eXBwMAAe/fuRb169b5oOxcuXE
CtWrXKldQSfQllvY6xx5iIiIiISAakp6djyJAhqFu37rcOhei7w3uMiYiIiIhkgIGBQbmGaBPJEvYYExERERERkUxjYkxEREREREQyjYkxERERERERyTQmxkRERERERCTTmBgTERERERGRTGNiTERERERERDKNiTERERER0Tc0YcIEODk5fbb6JBIJYmJiPnp7c3NzzJ8//7PFU5zIyEjo6Oh88XZ+ZAEBAWjTps03jeFH+RyZGBMRERHRDyUgIAASiQR9+vQptK5v376QSCQICAj4+oH9oD53Yv+hOXPmwNzcHKqqqqhatSp+//33Mm974cIFtG/fHoaGhlBRUYGtrS169eqFGzdufLF4v4TU1FRIJBIkJiZKLV+wYAEiIyO/WLvm5uaQSCTFvtzc3NCxY8f/3PEsisK3DoCIiIiI/luW9Dn8Vdvrt6xpubcxNTVFdHQ05s2bB1VVVQBAdnY2NmzYADMzs88dIn0hx44dw7Bhw7Bw4UL4+Pjg7t27ePLkSZm23bVrF9q1awdPT0+sW7cOVlZWePToETZv3oxx48Zh48aNXzj6L09bW/uL1h8fH4+8vDwAQFxcHNq1a4fr169DS0sLAKCkpARVVVXxO/Zfxh5jIiIiIvrh1K5dG2ZmZti2bZu4bNu2bTA1NUWtWrWkyhY1dNjJyQkTJkwQ30+YMAFmZmZQVlZGpUqVMHDgQHHdmzdvEBISAlNTUygrK8PGxgarVq0CUPQw05iYGEgkkmJjz8/Px6RJk1C5cmUoKyvDyckJ+/btE9e/ffsW/fv3h7GxMVRUVGBubo6wsLBi65s0aRIMDQ3F3sa4uDg0adIEqqqqMDU1xcCBA5GVlVXs9hkZGejduzcMDAygpaWFpk2b4uLFi+L+TZw4ERcvXhR7EaOiogAA8+bNg6OjI9TV1WFqaoq+ffsiMzOz2HaKIicnB3l5eQQFBcHc3ByNGzdG27ZtS93u1atX6NmzJ7y8vLBjxw40a9YMFhYWqF+/PmbPno3ly5eLZY8ePYp69epBWVkZxsbGGDlyJHJzc8X1bm5uGDhwIEJCQqCnpwcjIyOpcwN4N3x95cqVaNu2LdTU1GBjY4MdO3ZIlbl69Sq8vLygoaEBQ0NDdO/eXSrJz8/Px4wZM2BtbQ1lZWWYmZlh6tSpAAALCwsAQK1atcSeWkB6KPXy5cthYmKC/Px8qXZbt24Nf39/8f3OnTtRp04dqKiowNLSEhMnTpTa3/dVrFgRRkZGMDIygp6eHgDAwMBAatmH5/jFixfh7u4OTU1NaGlpoU6dOkhISAAA3L59Gz4+PtDV1YW6ujqqV6+OPXv2ACj7d6U88ZcHE2MiIiIi+iH17NkTERER4vvVq1cjMDCw3PVs2bIF8+bNw/Lly3Hz5k3ExMTA0dFRXN+jRw9ER0dj4cKFSEpKwrJly6ChofHRcS9YsABz5szB7NmzcenSJXh6eqJ169a4efMmAGDhwoXYsWMHNm3ahOvXr+OPP/6Aubl5oXoEQcCgQYOwatUqnDhxAk5OTrh8+TI8PT3xyy+/4NKlS9i4cSNOnDiB/v37FxmLIAjw9vbGw4cPsWfPHpw7dw61a9eGh4cHnj59io4dOyI4OBjVq1dHWloa0tLS0KFDBwDvktqFCxfiypUriIqKwuHDhxESElKuY1GrVi2YmJigb9++hRK+kuzfvx9Pnjwptr2CBOz+/fvw8vJC3bp1cfHiRYSHh2PVqlWYMmWKVPmoqCioq6vjzJkzmDlzJiZNmoQDBw5IlZk4cSI6dOiAS5cuwcvLC127dsXTp08BAGlpaXB1dYWTkxMSEhKwb98+/Pvvv+KxAoBRo0ZhxowZGDduHK5evYr169fD0NAQAHD27FkAwMGDB5GWlib1g0+B9u3b48mTJ4iNjRWXPXv2DPv370fXrl3F49KtWzcMHDgQV69exfLlyxEZGSkm4J9D165dUblyZcTHx+PcuXMYOXIkFBUVAQD9+vXDmzdvcOzYMV
y+fBkzZswo13flS8bPodRERERE9EPq3r07Ro0aJd6fefLkSURHR+PIkSPlqufOnTswMjJCs2bNoKioCDMzM9SrVw8AcOPGDWzatAkHDhxAs2bNAACWlpafFPfs2bMxYsQIdOrUCQAwY8YMxMbGYv78+ViyZAnu3LkDGxsbNGrUCBKJBFWqVClUR25uLnr06IGEhAScPHkSlStXBgDMmjULXbp0weDBgwEANjY2WLhwIVxdXREeHg4VFRWpemJjY3H58mU8evQIysrKYnwxMTHYsmULevfuDQ0NDSgoKMDIyAgAxKG3gwYNgry8PIB3PZ6TJ0/G//73PyxdurRMxyE/Px++vr6oWbMmnj9/ji5dumDNmjVQUlICADg4OKBnz54IDg4utG3Bjwh2dnYltrF06VKYmppi8eLFkEgksLOzw4MHDzBixAiMHz8ecnLv+hFr1KiB0NBQ8ZgtXrwYhw4dQvPmzcW6AgIC0LlzZwDAtGnTsGjRIpw9exYtWrRAeHg4ateujWnTponlV69eDVNTU9y4cQPGxsZYsGABFi9eLPbuWllZoVGjRgDe9dwCgL6+vnicP6Snp4cWLVpg/fr18PDwAABs3rwZenp64vupU6di5MiRYhuWlpaYPHkyQkJCxP37VHfu3MHw4cPFY29jYyO1rl27duIPS+X9rnzJ+JkYExEREdEPqUKFCvD29kZUVJTY81mhQoVy19O+fXvMnz8flpaWaNGiBby8vODj4wMFBQUkJiZCXl4erq6unyXmFy9e4MGDB2jYsKHU8oYNG4rDlwMCAtC8eXNUrVoVLVq0QKtWrfDzzz9LlR8yZAiUlZVx+vRpqX0+d+4cbt26hXXr1onLBEFAfn4+UlJSYG9vL1XPuXPnkJmZCX19fanlr1+/RnJycon7EhsbixkzZuDq1at48eIFcnNzkZ2djaysLKirq5d6LPbt24eTJ0/i/v37UFdXR6tWreDj44Nt27ZBXl4eycnJYuL4IUEQSq0fAJKSktCgQQOp4boNGzZEZmYm7t27J96PXqNGDantjI2N8ejRI6ll75dRV1eHpqamWObcuXOIjY0tsnc0OTkZz58/x5s3b8QE9mN17doVvXv3xtKlS6GsrIx169ahU6dO4g8U586dQ3x8vFQPa15eHrKzs/Hq1Suoqal9UvsAMHToUPz6669Yu3YtmjVrhvbt28PKygoAMHDgQPzvf//DX3/9hWbNmqFdu3aFjm1JvmT8HEpNRERERD+swMBAREZGIioqqthh1HJycoUSqZycHPH/TU1Ncf36dSxZsgSqqqro27cvmjRpgpycnFInHSqt7uJ8eF+lIAjistq1ayMlJQWTJ0/G69ev0aFDB/j5+UmVb968Oe7fv4/9+/dLLc/Pz8dvv/2GxMRE8XXx4kXcvHlTTF4+LG9sbCxVPjExEdevX8fw4cOLjT8tLQ0+Pj5wcHDA1q1bce7cOSxZsqTM+w8Aly5dgpmZGfT09KCsrIyYmBhkZmbCw8ND/KGioOf+Q7a2tgCAa9euldjG+8f1/WWA9GdQMBS4gEQiKTS0u6Qy+fn58PHxKXQcb968Kd7v/Tn4+PggPz8fu3fvxt27d3H8+HF069ZNXJ+fn4+JEydKxXD58mXcvHmz0GiBjzVhwgT8/fff8Pb2xuHDh1GtWjVs374dAPDrr7/in3/+Qffu3XH58mU4Oztj0aJFAMr2XfmS8bPHmIiIiIh+WC1atMDbt28BAJ6enkWWqVixItLS0sT3L168QEpKilQZVVVVtG7dGq1bt0a/fv1gZ2eHy5cvw9HREfn5+Th69Kg4lPrDul++fCnVS/rhI3fep6WlhUqVKuHEiRNo0qSJuDwuLk4qCdTS0kLHjh3RsWNH+Pn5oUWLFnj69Kk4QVLr1q3h4+ODLl26QF5eXhyWXbt2bfz999+wtrYu6bCJateujYcPH0JBQaHI+5iBdzMTFwyfLpCUlITc3FzMmTNHHI68adOmMrVZwMTEBC
kpKbh37x4qV64MdXV17NmzB+7u7hg1ahS2bdtW7CRmP//8MypUqICZM2eKSdn7nj9/Dh0dHVSrVg1bt26VSpDj4uKgqakJExOTcsVbktq1a2Pr1q0wNzeHgkLhFMzGxgaqqqo4dOgQfv3110LrC4aPf3icP6SqqopffvkF69atw61bt2Bra4s6depIxXH9+vUyf/4fy9bWFra2thgyZAg6d+6MiIgIcdI0U1NT9OnTB3369MGoUaOwYsUKDBgwoEzflS8ZP3uMiYiIiOiHJS8vj6SkJCQlJYnDST/UtGlTrF27FsePH8eVK1fg7+8vVTYyMhKrVq3ClStX8M8//2Dt2rVQVVVFlSpVYG5uDn9/fwQGBiImJgYpKSk4cuSImATWr18fampqGD16NG7duoX169eX+tzZ4cOHY8aMGdi4cSOuX7+OkSNHIjExEYMGDQLwbrbn6OhoXLt2DTdu3MDmzZthZGRUaEbftm3bYu3atejZsye2bNkCABgxYgROnTqFfv36iT2WO3bswIABA4qMpVmzZmjQoAHatGmD/fv3IzU1FXFxcRg7dqw407C5uTlSUlKQmJiIJ0+e4M2bNzAxMUFubi4WLVokHrNly5aV+nm9r127djAzM4O3tzcOHjyIW7duYefOnUhLS4O6ujpWr15d7IRc6urqWLlyJXbv3o3WrVvj4MGDSE1NRUJCAkJCQsRnXPft2xd3797FgAEDcO3aNfz5558IDQ3F0KFDxYT+c+jXrx+ePn2Kzp074+zZs/jnn3/w119/ITAwEHl5eVBRUcGIESMQEhKCNWvWIDk5GadPnxZnNzcwMICqqqo4aVdGRkaxbXXt2hW7d+/G6tWrpXqLAWD8+PFYs2aN2KublJSEjRs3YuzYsZ9lP1+/fo3+/fvjyJEjuH37Nk6ePIn4+HhxiP7gwYOxf/9+pKSk4Pz58zh8+LC4rizflS8ZPxNjIiIiIvqhaWlpic9dLcqoUaPQpEkTtGrVCl5eXmjTpo3UsGIdHR2sWLECDRs2RI0aNXDo0CHs3LlTvO82PDwcfn5+6Nu3L+zs7NCrVy/x8Ud6enr4448/sGfPHjg6OmLDhg2FHvXzoYEDByI4OBjBwcFwdHTEvn37sGPHDnESIw0NDcyYMQPOzs6oW7cuUlNTsWfPniITOT8/P0RFRaF79+7Ytm0batSogaNHj+LmzZto3LgxatWqhXHjxsHY2LjIWCQSCfbs2YMmTZogMDAQtra26NSpE1JTU8UZk9u1a4cWLVrA3d0dFStWRHR0NKpWrYrZs2djxowZcHBwwLp160p8pFRR1NTUEBcXB2dnZ/Ts2RMODg6YN28eZs6cifj4eBw9elScRKwovr6+iIuLg6KiIrp06QI7Ozt07twZGRkZ4qzTJiYm2LNnD86ePYuaNWuiT58+CAoK+myJYoFKlSrh5MmTyMvLg6enJxwcHDBo0CBoa2uLn9u4ceMQHByM8ePHw97eHh07dhTvUVZQUMDChQuxfPlyVKpUCb6+vsW21bRpU+jp6eH69evo0qWL1DpPT0/s2rULBw4cQN26dfHTTz9h7ty5RU7g9jHk5eWRnp6OHj16wNbWFh06dEDLli0xceJEAO96vPv16wd7e3u0aNECVatWFSdjK8t35UvGLxHKemc6fXdevHgBbW1tZGRklHixJyIiIvoY2dnZSElJgYWFxWe7/5B+fHl5ebhw4QJq1apVbC890ddS1usYe4yJiIiIiIhIpjExJiIiIiIiIpnGxJiIiIiIiIhkGhNjIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIiIimabwrQOgT/fT+p8gr1r0VPibwnK/cjRf356aVqUXIiKir6qjxYhvHQJ9BrmaEuS6q+OtahbkFHK+aSxP5F5+0/ap7PLz8791CETlxh5jIiIiIiIikmlMjImIiIiIiEimMTEmIiIiIvrK7t69CxMTE1y5cuWz1lu/fn2sWLHis9Xn5+eH8ePHf7b6fh
Rubm4YPHjwV20zICAAbdq0+aptfo8mTJgAJyenz14vE2MiIiIi+qEMHjwYJiYmWLx4sdTyffv2wcTE5BtF9XXs2bMH3bp1+6ptpqeno1evXqhWrRrs7OzQvn17pKamlrrdkSNHIJFI4ODggLy8PKl1Ojo6iIyM/DIB03fhe0v0OfkWEREREZXLouDOX7W9AXM2lHsbFRUVLF26FN26dYOOjs7nD6oEOTk5UFRU/KptFtDX1//qbU6dOhWXLl1CVFQUKlSogEuXLpVr++TkZKxZswY9e/b8QhEW7e3bt1BSUvqqbdL3iz3GRERERPTDadSoESpWrFio1/hD8fHx+OWXX2BlZQVnZ2eMGzcOr169EtebmJhg3759UtvY29tj48aNAP5/SPSOHTvg5+cHS0tLbNu2Dfn5+Zg3bx7q1KkDCwsLNG/eHLGxsSXGcurUKXh7e8PCwgK1atXCtGnTkJv7/08YyczMRP/+/WFtbY1atWrh999/LzTU+cOh1BkZGQgJCUHNmjVhaWmJpk2b4sCBAwCAp0+fom/fvqhTpw6srKzg4eGBmJiYkg9sEeTk5ODs7Iy6devCwsICPj4+MDc3L/P2AwYMQGhoKLKzs4stk5GRgd69e8PAwABaWlpo2rQpLl68KK4vqvdx8ODBcHNzE9+7ubmhf//+GDp0KCpUqIDmzZsDAI4ePYp69epBWVkZxsbGGDlypNRxz8rKQo8ePaChoQFjY2PMmTOnUHxv375FSEgITExMoK6ujvr16+PIkSNi7KqqqoXOo23btkFdXR2ZmZkAgPv376Njx47Q1dWFvr4+fH19S+x5f/PmDQYOHAgDAwOoqKigUaNGiI+PF9cX9Mjv3r0bNWvWhIqKCurXr4/Lly+LZSIjI6Gjo4Ndu3ahatWqUFNTg5+fH7KyshAVFQVzc3Po6upiwIABUr36Je3v+/Xu378f9vb20NDQQIsWLZCWlgbg3XDoqKgo/Pnnn5BIJJBIJOL2I0aMgK2tLdTU1GBpaYlx48YhJ+fLz4rPxJiIiIiIfjjy8vIYOXIkIiIi8ODBgyLLJCUloWvXrmjZsiUOHDiA8PBwnD17FmPGjCl3e9OmTUNgYCCOHDkCV1dXrFy5EsuXL8f48eNx4MABuLm5oWfPnvjnn3+K3D4tLQ3du3dHzZo1ceDAAYSFhWHDhg1YsGCBWGbixImIj49HREQENmzYgLNnz0olOR/Kz89Ht27dkJCQgEWLFiE2NhajRo2CvPy7x3y+efMGNWrUQFRUFA4fPoyuXbti4MCBOH/+fLn2/eeff8bu3btLTfyLM3jwYOTm5hb7I4YgCPD29sbDhw+xZ88enDt3DrVr14aHhweePn1arraioqKgoKCAkydPYvny5bh//z68vLxQt25dXLx4EeHh4Vi1ahWmTJkibjN8+HDExsZi+/bt+Ouvv3DkyBGcO3dOqt6ePXvi5MmTiI6OxqVLl9C+fXu0aNECN2/ehLa2Nry9vbFu3TqpbdavXw9fX19oaGjg1atXcHd3h4aGBo4dO4YTJ06IyeTbt2+L3JeQkBBs3boVUVFROH/+PKytreHp6VnomAwfPhyzZ89GfHw8DAwM0Lp1a6lE89WrV1i4cCGio6Oxb98+HDlyBL/88gv27NmDPXv2YO3atfj999+xZcuWMu3v+/XOnj0ba9euxbFjx3Dnzh0MGzYMADBs2DB06NBBTJbT0tLg4uICANDU1ERkZCSuXr2KBQsWYMWKFZg3b155PuaPwsSYiIiIiH5ILVu2RLVq1Yrs4QOA8PBwtGnTBr169YKlpSXq1q2LyZMnY8uWLSX2Xhbl119/hZeXF8zMzGBkZITly5ejb9++8PX1hbW1NcaMGYPq1atj5cqVRW4fFRWFSpUqYerUqbC2tkaLFi0QHByM5cuXIz8/H5mZmdi8eTPGjRuHxo0bw87ODnPnzi10b+77jh8/jsTERKxYsQJNmjRBlSpV0Lx5czRt2hQAYGxsjD59+s
DBwQFVqlRBYGAgXF1dsWvXrjLv940bN9C/f38MGzYMw4YNw86dO8V1CQkJkEgkSE9PL7EONTU1hIaGIiwsDBkZGYXWx8bG4vLly9i8eTOcnZ1hY2OD2bNnQ0dHRypZKwtra2vMnDkTVatWhZ2dHZYuXQpTU1MsXrwYdnZ2aNOmDSZOnIg5c+aIx33VqlWYPXs2mjdvDkdHR0RFRUkd9+TkZGzYsAGbN29G48aNYWVlhWHDhqFRo0aIiIgAAHTt2hUxMTHiaIQXL15g9+7d4v3g0dHRkJOTw8qVK+Ho6Ah7e3tERETgzp07Uj2xBbKyshAeHo5Zs2aJ5/mKFSugqqqKVatWSZUNDQ2Viv3ff//F9u3bxfU5OTkIDw9HrVq10KRJE/j5+eHEiRNYtWoVqlWrhlatWsHd3V384aMs+1tQ77Jly+Ds7IzatWujf//+OHToEABAQ0MDqqqqUFZWhpGREYyMjMRh7WPHjoWLiwvMzc3h4+OD4OBgbNq0qVyf88fgPcZERERE9MMaM2YMOnTogN9++63QusuXLyM1NVUqSRAEAfn5+bh79y5sbGzK3E7Nmv/X3n2H93T+jx9/vrOnhBAZZElE7EiMGBnEqJXUCqJE1KpNUVSoGo3WqNq04dOaVa1RsyQxUkUqZqwI2lQaFLFCxvn94Zfz9ZYhOgR5Pa4r19X3ue9zn9c5ua/U633f575rqf999+5dUlNTqVu3rlYdb29vzpw5k+/5Fy9exMvLC41Gox6rW7cu9+/f59q1a9y+fZvMzEw8PT3V8lKlSlGpUqUCYzp9+jS2trYF1snOzmb+/Pls2bKFa9eu8fjxYx4/foyJiUmR7hlg1qxZBAQEMHjwYPz9/QkJCeGvv/6iefPmnD59mipVqhTpvec+ffowe/ZsIiMjmT59ulZZfHw89+7dy9POw4cPSUpKKnKs8OR38LTExER8fHy0nnujRo24d+8ev//+O7du3eLx48f4+Pio5WXKlMHd3V39/Ouvv6IoCpUrV9Zq+9GjR2rMbdq0QU9Pj82bN9O1a1e+++47zM3NadGihXqPFy9exNzcXKuNjIyMfO8xKSmJzMxMGjVqpB7T19enXr16JCYmatXNL/an65iYmGj1kfLly+Pk5ISZmZnWsbS0tCLfb37t2traqm0UZsOGDcydO5eLFy9y7949srKyKFWq1HPP+6ckMRZCCCGEEG+sBg0a4OfnxyeffEKXLl20ynKnGoeHh+c5L3f1ao1Gg6IoWmX5ve9obGyc59jTyRY8SbqfPVZY2dPXfTaG5x2HJwuQFWbJkiUsW7aMjz76iCpVqqgjty/yPmdiYiKdO3cGoHr16qxYsYLu3btz9epVjh8/XuQFtfT09Jg6dSphYWEMHjxYqywnJwdbW9t8R05zF1bT0dEp0u/J1NRU63Nhzz2/331+cnJy0NXVJT4+Xp2mnis3uTQwMKBTp06sXr2arl27snr1akJCQtDT01Pb8PLyyjPdGqBcuXJ5jj0d4/PuJz9P13l2oTiNRpPvsZycnCLfb0HtPu95Hjp0iK5du/LRRx/RsmVLLCwsWLt2bYGzPv5NkhgLIYQQQog32vjx42nRogUuLi5ax2vUqMG5c+dwdnYu8FwrKyv+/PNP9fOlS5d4+PBhodczNzfHxsaGw4cP06BBA/V4fHx8gfuvurm5sW3bNq3E5ujRo+qCTxYWFujr65OQkKAm7Xfv3iU5OVnrGk/z8PDg2rVrJCUl5Ttq/Msvv9CyZUs6duwIPEl4kpOTX2ikPPc+c9WtW5dly5YRFhaGlZVVniS3MJ07d+bTTz/lo48+0jpep04dUlNT0dPTK3BRr3LlyuXZEzohIeG5q4NXrVqV7777Tuu5x8XFYW5ujr29PaVLl0ZfX59Dhw7h4OAAwK1btzh//jx+fn4AeHp6kp2dTVpaGk2aNCnwWqGhobRo0YLTp08THR3Nxx9/rHWP69atUxcXex5XV1cMDAw4cOAA3bt3B558EX
D06NE8+yvnF3uVKlWee42CFPV+n8fAwCDPqwAHDx7E0dFR6z3/K1eu/O1rvAh5x1gIIYQQQrzRPDw8ePvtt7XefwR47733iI+PZ/z48Zw6dYpLly6xa9cuPvzwQ7VOo0aNWLFiBSdPnuT48eN88MEHRdqKacCAASxcuJBNmzZx8eJFpk+fzunTp+nTp0++9Xv16sUff/zBhx9+yMWLF9m5cyezZs2iX79+6OjoYGZmRufOnZk6dSoHDx7k3LlzjBw5Eh0dnQJHCH18fKhfvz79+vVTFz/au3ev+q6ok5MT+/bt48iRI1y4cIGxY8dy/fr1oj5WAAYOHEh0dDTjx4/n7NmznDp1ipiYGPT09Lh+/brWO8dF8cknn/DVV19x//599VhgYCA+Pj4EBwezc+dOLl++TFxcHB9++CFHjx4FoGnTphw9epT//e9/XLhwgUmTJuVJlPPz3nvv8dtvvzFkyBDOnj3Lpk2bmDRpkvpszczM6NOnD6NHj2bPnj2cOnWKsLAwdHT+L42qXLkyoaGh9OzZk40bN5KcnMyRI0eIjIxk27Ztaj0/Pz/Kly9PaGgoTk5OWl9ohIaGUrZsWYKCgti/fz/JycnExsYybNgwfv/99zxxm5qaMnDgQEaPHs2OHTs4c+YMffv25cGDB3n62JQpU7RiL1u27D/aP7io9/s8Tk5OnDhxgnPnznHjxg0yMzNxdXXl6tWrrF27lqSkJObNm6f1qsN/SRJjIYQQQgjxxhszZkyeaZy5o4XJycl06NCBli1bMnPmTKytrdU6ERER2NnZ0aFDBwYNGsSAAQPynTb9rD59+tC/f3+mTJlCYGAg0dHRREVF5Rm1zmVra8vXX39NQkICzZs354MPPqBbt24MGzZMrTNp0iS8vLzo1asXXbt2pW7duri5uRU6ZXrZsmXUqlWL9957j4CAAKZNm6aO0g0fPpwaNWoQGhpKp06dKFeuHC1btnzuvT0tICCAdevWkZiYSFBQEF26dOGPP/5g5cqVTJo0ibCwMOLi4orcXtOmTWnatKnWdkkajYZt27bh6+tLeHg4lStXpmvXrly+fJny5csD0LJlSyZOnMiYMWOoW7cud+/epWfPns+9nr29Pdu2bePw4cPUqlWLAQMG0KdPH60vRz799FN8fX1p3749gYGBNG7cGC8vL612oqKi6NmzJ6NGjcLd3Z327dvzyy+/ULFiRa376NatG8ePHyc0NFTrfBMTE/bt24eDgwMdOnTAw8OD8PBwHj58WOAI8ieffELHjh155513qFOnjvqFSunSpfPUGzZsGF5eXly7do3Nmzf/4/2bi3K/z9O3b1/c3d3x9vamXLlyHDx4kKCgIEaMGMHgwYOpXbs2cXFxTJw48R/FWlQapSgT58UrKT09HQsLCzwWeaBrrJtvnfUzsvI9/ibZVqvgRSeEEEIUjxDnscUdgvgXZJlruBtgiqOdA0Z6/+wf0v/UDZ27xXr9V9GDBw/w8vIiIiKCbt26FXc4qpycHFJTU/H09MzzDqp4eWJiYggICODWrVvqu9glUUZGBsnJyTg7Oxf6JZK8YyyEEEIIIcRr4NSpU1y8eJHatWtz9+5ddW/XFx3lFULkJYmxEEIIIYQQr4nFixeTlJSEgYEBNWrUYOPGjZQpU6a4wxLitSeJsRBCCCGEEK+B6tWrs2PHjuIOQ7wm/P39i7TdlHhCFt8SQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJomxEEIIIYQQQogSTRJjIYQQQgghhBAlmiTGQgghhBBCCCFKNEmMhRBCCCGE+A/Z29v/69ssderUiYiIiH+tveHDhxMeHv6vtSfE60YSYyGEEEII8cZJSUlh1KhR1KlTBycnJ+rVq0dERAR//fVXcYf2r1i2bBljxox56deNi4vD09OzwP1xFUVh1apVhIWFYWFhgaWlJd7e3sydO5cHDx685GiFKDq94g5ACCGEEEK8XtLmJ7zU61kPrv
1C9a9cuUL79u1xcXFhwYIFODg4cO7cOaZOncrevXvZsmULpUuX/m+CfUmKK/5du3bRvHlzNBpNvuVDhw5l27ZthIeHs3z5cmxsbDh+/Dhz587FycmJ4ODglxvwC8jMzERfX7+4wxDFREaMhRBCCCHEG2XChAno6+uzevVqfHx8sLe3p2nTpqxdu5bU1FQiIyPVuvXr12fevHmMHDmSypUrU7duXb755hut9v744w8GDhxItWrVcHV15a233uLXX39Vy1euXEnDhg1xcnKiSZMmbNiwodD4EhMT6dy5M5UqVaJatWqMGTOG+/fvq+VZWVlMnDgRDw8PqlWrxrRp0xg2bJjWVOdnp1I/evSIqVOn4u3tjbOzM40aNWLNmjUAZGdnM2rUKBo0aEClSpVo0qQJy5cv/1vPdvfu3bRo0SLfss2bN7Nx40bmz59P7969qVu3Lk5OTgQFBbF3714CAgIAOHLkCM2bN6ds2bJYWFjg5+en9TwBNBoNy5cv5+2338bExAQ3Nzc2b96sVef06dO0adOGUqVKYW5uTpMmTUhKSlLLo6Ki8PDwwMjIiCpVqrBw4UK17PLly2g0GtavX4+/vz9GRkZ5fu+iZJHEWAghhBBCvDFu3bpFTEwMvXr1wtjYWKvM2tqaDh06sGXLFq2pwEuWLKFmzZrs3LmTXr16MW7cOC5evAjA/fv36dSpE3/++SdRUVHs3r2bgQMHkpOTA8D27duZNGkS/fr1Y8+ePfTo0YORI0dy8ODBfON7+PAhPXr0wNLSkh9//JElS5awf/9+JkyYoNZZsGABGzduZPbs2WzatIm7d++yc+fOQu972LBhbNq0iY8//piYmBg++eQTTExMAMjJycHW1pbFixcTHR3NiBEj+OSTT/Ikms9z7tw50tLSaNy4cb7l33//PZUqVaJly5Z5yjQaDRYWFgDcvXuXXr16sX//fg4dOoSbmxutW7fm7t27Wud89NFHdOnShRMnTtC6dWtCQ0PVqfApKSn4+vpiZGTE3r17iY+PJzw8nKysLODJVPMJEyYwbdo0EhMTmT59OhMnTmTlypVa1xg7dixDhw4lMTEx37hFySFTqYUQQgghxBsjOTkZRVFwc3PLt9zV1ZXbt29z8+ZNypYtC0DTpk0JCwsDYNCgQSxbtoy4uDhcXV35/vvvuXnzJj/++KM6fdnZ2Vltb/HixXTp0kU9v1KlSvz6668sXryYRo0a5bn+xo0bycjI4PPPP1cT16lTpxIWFsaECRMoV64cUVFRDBkyhLfeeguAadOmsXfv3gLvOSkpiS1btrBmzRp8fX0BcHR0VMv19fV5//331c8ODg4cPXqULVu20L59+0Kf59N27typjq7mJzk5mUqVKj23naZNm2p9XrJkCaVLlyY2Npa2bduqx8PCwujWrRsA06dP54svvuDw4cO0atWKBQsWYGFhwdq1a9Xpz5UrV1bP/fjjj5k1axYdOnQAnvzOzpw5w5IlS+jVq5dab/jw4WodUbJJYiyEEEIIIUqM3JHip9+RrVq1qvrfGo2GcuXKcfPmTeDJdN3q1asX+E7vxYsXCQ0N1TpWt25dvvzyy3zrX7hwAQ8PDzUpzq2fk5NDUlIShoaGXL9+ndq1a6vlurq61KxZUx2lftbp06fR1dXFx8enwPv+3//+x5o1a/j999/JyMggMzOTatWqFVg/P7t27aJnz54FliuKUuC7x09LS0sjIiKCvXv38ueff5Kdnc2DBw+4evWqVr2aNWuq/21qaoq5uTlpaWkAJCQk0KRJk3zfCb5+/Tq//fYbffr0oW/fvurxrKwsddQ6l7e393PjFSWDJMZCCCGEEOKN4eTkhEaj4fz587Rq1SpPeVJSEpaWlpQpU0Y9pqen/U9ijUajJqEFjY4+W/9phSWIhZU9fTy/NgvyvBg3b97MRx99xMSJE/H29sbU1JRFixZx7NixQs97WlpaGidPniQwMLDAOi4uLly4cOG5bYWFhXH9+nXmzp
2Lo6MjhoaG+Pj48PjxY616zya9T/9enp0m/7TcOsuWLaN+/fpaZbq6ulqfTU1NnxuvKBnkHWMhhBBCCPHGKFOmDL6+vqxcuZKHDx9qlaWlpbFx40batWtXpJFNAA8PD06fPs2tW7fyLXd1deXIkSNax44ePYqrq2u+9StXrsyZM2e0ti46cuQIOjo6uLi4UKpUKcqVK6eVtGZnZ3Pq1KlCY8zJyeHnn3/Ot/zw4cN4eXkRFhZG9erVcXZ25sqVKwW2l59du3bh5eWl9YXCs4KDg7l06VK+70MrisKdO3cA2L9/P0OHDqV169ZUq1YNQ0NDbty48ULx1KxZk/3795OZmZmnrHz58tjb23Pp0iVcXV21fp6eBi/E00p8YhwWFpZn2fgNGzZgZGTEzJkzmTx5MhqNJs9PlSpVtM65ePEi4eHhODg4YGhoiL29Pc2aNWPVqlXqIgC5oqOjadu2LeXKlcPIyIhKlSoREhLCvn37/uvbFUIIIYR4402dOpXHjx8TGhrKoUOHSElJITo6mm7dumFjY8PYsWOL3FZwcDDlypWjT58+HDlyhCtXrvDjjz9y9OhRAAYOHMj69ev53//+x6VLl1iyZAnbt29nwIAB+bbXoUMHDA0NGTZsGGfPnuXgwYNMnDiRjh07Uq5cOQB69+7N/Pnz2blzJxcvXiQiIoI7d+4UmMxXrFiRzp07M2rUKHbs2MHVq1eJi4tTF9dycnLixIkTxMTEkJSUxMyZMzl+/PiLPFJ27dpV4GrUudq3b0/79u0ZPHgwUVFRHD16lCtXrrB161YCAwOJjo4GnnyZ8PXXX5OYmMgvv/xCaGhooSPA+Rk8eDDp6el07dqVo0ePcuHCBb7++mvOnTsHwOTJk5kxYwaff/4558+f5+TJk0RFRTF79uwXuo4oOUp8Yvys5cuXExoayvz589VN06tVq8a1a9e0fg4cOKCec/jwYerUqUNiYiILFizg1KlTbN26lfDwcBYvXszp06fVugsXLqRZs2ZYWVmxbt06EhMT+frrr2nYsCEjRox46fcrhBBCCPGmcXFxYfv27Tg6OjJw4EAaNWrEmDFjaNiwIZs3b36hPYANDAxYs2YNVlZWvPPOOzRr1owFCxaoU3JbtWrFRx99xOLFi2natCnffPMNs2fPpmHDhvm2Z2xszKpVq7h9+zZt2rShX79+NG7cmGnTpql1Bg0aRHBwMMOGDSMoKAhTU1P8/PwwNDQsMM4ZM2bQpk0bxo8fj5+fH6NHj1ZHzN955x3eeustBg4cSLt27bh165bWAlTP8+DBAw4ePPjcxFij0bBgwQIiIiKIiYmhadOm1KxZk8mTJxMUFKSu+vzVV19x69YtPD09eeeddxg6dCjW1tZFjgfAysqKvXv3cu/ePfz8/PDy8mLZsmXq9Ot3332X5cuXs2LFCmrUqIGfnx8rVqyQEWNRII1S2AsLJUBYWBi3b9/mhx9+YObMmURERLBq1So6duwIPPm26YcffiAhISHf8xVFoVq1apiYmHD48GF0dPJ+15D7LsnVq1dxdXVl8ODB+X5bVdQFC3Klp6djYWGBxyIPdI11862zfkZWvsffJNtqPX/1QyGEEC9XiHPRR+TEqyvLXMPdAFMc7Rww0jMo1lhu6Nx9fqU3VE5ODn5+frRr104duHmZtm3bxsyZM4mJiSlS/ZycHFJTU/H09MzzTq8QL1tGRgbJyck4OzsX+j6+LL71/33wwQcsWLBAnepRVAkJCSQmJrJmzZp8k2L4v8UTvvvuOzIzMwv8g/a8pPjRo0c8evRI/Zyenl7kOIUQQgghxOvh999/JzY2lgYNGvD48WOioqL47bffePvtt4slHlNTU8aPH18s1xbiZZGp1DzZmD0yMpJNmzblmxSfPHkSMzMzrZ93330XgPPnzwPg7u6u1k9LS9Oqu3DhQrVuqVKlsLGxUet+9913WnVPnjxZYJwzZszAwsJC/alYseK/cv9CCCGEEOLVodFoWL9+PW
3atCE4OJizZ8+ydu3aAvdm/q/5+fk9dxq1EK87GTHmyap2N27cICIigrp162Jubq5V7u7uri5ekOvZOk+P9lpZWalTr/39/bWWnn92VLhly5YkJCSQkpKCv78/2dnZBcY5btw4Ro4cqX5OT0+X5FgIIYQQ4g1jb2/Ppk2bijsMIUoUSYx58sfnu+++IyAggFatWrFjxw6txNfAwKDAJfdzv7k7e/asuhG7rq6uWv/pffHc3Ny4c+cOqamp6qixmZkZrq6uefbPy4+hoWGhiy4IIYQQQgghhHhxMpX6/3NwcCA2Npa0tDRatGhR5Pd3PT09qVKlCp999pm6mXhBOnXqhL6+PpGRkf9GyEIIIYQQQggh/gUyYvyUChUqEBMTQ0BAAC1atFA3J8/KyiI1NVWrrkajoXz58mg0GqKiomjevDmNGjVi3LhxeHh4kJmZyb59+7h+/bq6Gp+DgwOzZs1i2LBh/PXXX4SFheHs7Mxff/3FN998AyAr9wkhhBBCCCHESyaJ8TPs7e2JjY0lICCA5s2b07BhQ06fPo2tra1WPUNDQzIyMgBo0KAB8fHxTJ8+nUGDBpGamoqpqSm1atVizpw5hIeHq+cNGTIEDw8PZs+eTadOnUhPT8fKygofHx927NhBjRo1Xur9CiGEEEIIIURJV+L3MX6dyT7GT8g+xkII8eqRfYzfDLKPsfg7ZB9j8Sop6j7G8o6xEEIIIYQQQogSTRJjIYQQQgghnrJu3To8PDyKO4z/VFxcHPb29ty5c6fQevXr12fZsmUvKSpRECcnJ+bOnVss1548ebK6+86bTBJjIYQQQgjxxklJSWHUqFHUqVMHJycn6tWrR0REBH/99ZdWvVcp8WvQoAHR0dH5ltWvX7/A/Y0DAgKwt7dn3bp1f/vaBX0ZsG3bNnr06PG32y2qVyX5mjx5Ml27ds237P79+4wdOxYXFxeMjIwoV64c/v7+bN269SVH+XK9//777Nmzp7jD+M/J4ltCCCGEEOKFTF8+66Veb/y7o16o/pUrV2jfvj0uLi4sWLAABwcHzp07x9SpU9m7dy9btmyhdOnS/1G0BcvMzERfXz/fsjNnznDr1i0aNmxY4Pl2dnasW7eOoKAg9Vh8fDxpaWmYmJj86/ECWFlZ/Sftvqo2b97M6NGj8y0bMGAAhw8fZv78+VStWpWbN28SFxfHzZs3X3KUeRXWt/4pMzMzzMzM/pO2XyUyYiyEEEIIId4oEyZMQF9fn9WrV+Pj44O9vT1NmzZl7dq1pKamEhkZCUCnTp34/fffmTx5Mvb29tjb22u1ExMTg5+fH25uboSGhvLnn39qla9btw4/Pz9cXFzw9fVlxYoVatlvv/2Gvb09mzdvplOnTri4uLBx48YCY965cyd+fn4YGhoWWKdDhw4cOnSIlJQUrRg6dOiAnt7/jXflXvvUqVPqsTt37mBvb09cXFyeduPi4hg5ciTp6enqc5g168mXH8+OqM+aNYu6devi7OxMnTp1mDhxIgBz5syhWbNmedr28vIiIiICePI869Wrh6mpKZaWljRq1IgrV66wYsUKPvroI44fP45Go0Gj0ajP8s6dO/Tr1w9ra2tKlSpF06ZNOX78uNp+7kjzV199hYODA2ZmZgwcOJDs7GxmzpyJjY0N1tbWTJs2rcDn+vRzO3XqFG+99Va+5Vu2bGH8+PG0bt0aJycnvLy8GDJkCL169QJgypQp+e4w8/QzCAsLIzg4mM8++wxbW1usrKwYNGgQmZmZav20tDTatWuHsbExzs7OrFq1Kk+bGo2GxYsXExQUhKmpKVOnTgVg0aJFVKpUCQMDA9zd3fn666/znLdkyRLatm2LiYkJHh4e/Pzzz1y8eBF/f39MTU3x8fEhKSkpzzN+2ldffUW1atUwNDTE1taWwYMHP/f5vuokMRZCCCGEEG+MW7duERMTQ69evTA2NtYqs7a2pkOHDmzZsgVFUVi2bBm2tra8//77HDt2jGPHjql1Hz
58yOLFi5k3bx4bN24kJSWFjz/+WC1ftWoVkZGRjB07lpiYGD744AM+/fRT1q9fr3XN6dOnEx4eribZBdm9ezctW7Ys9N7Kli2Ln58f3377rRrj5s2bCQkJKfLzyY+3tzcfffQR5ubm6nMYMGBAnnpbt25l2bJlREZGcuDAAb788kuqVKkCQEhICOfPnychIUGtf+LECY4dO0ZYWBhZWVkEBwfj5+fHiRMn+Pnnn+nXrx8ajYaQkBBGjRpFtWrVuHbtGteuXSMkJARFUWjTpg2pqals27aN+Ph46tSpQ7NmzbSmxCclJbF9+3Z27NjBmjVr+Oqrr2jTpg2///47sbGxREZG8uGHH3Lo0KFCn8PmzZvx9fXF0tIy33IbGxu2bdvG3bv5r5AeHh7OmTNnOHLkSL7PIFd0dDRJSUlER0ezcuVKVqxYofWlSlhYGJcvX2bv3r1s2LCBhQsXkpaWlud6kyZNIigoiJMnTxIeHs7333/PsGHDGDVqFKdOnaJ///707t07z/T8jz/+mJ49e5KQkECVKlXo3r07/fv3Z9y4cRw9ehSg0ER30aJFDBo0iH79+nHy5Ek2b96Mq6trgfVfFzKVWgghhBBCvDGSk5NRFAU3N7d8y11dXbl9+zY3b96kbNmy6OrqYmZmhrW1tVa9zMxMPvnkE5ycnIAnycrTix/NnTuXiIgIWrduDYCDgwPnz5/nm2++oUuXLmq9d999V61TkGvXrnHmzBmaNm363Pvr2rUrU6ZMYdiwYWzduhVHR0eqV6/+3PMKY2BggLm5ORqNJs9zeFpKSgrlypWjSZMm6OvrY29vj6enJ/Bkmre/vz/r1q2jZs2aAKxYsUIdUf/rr7+4c+cObdu2pVKlJ1ttPv1Os5mZGXp6etjY2KjH9u7dy8mTJ0lLS1NH0j/77DN++OEHNmzYQL9+/YAn20N99dVXmJubU7VqVQICAjh37hzbtm1DR0cHd3d3IiMjiYmJoUGDBgXe36ZNm7SmqT9r6dKlhIaGYmVlRa1atWjcuDGdOnWiUaNGAFSoUIGWLVsSFRVF3bp1AYiKilKfQa7SpUszf/58dHV1qVKlCm3atGHPnj307duX8+fPs337dg4dOkT9+vUB+PLLL/N9/7t79+6Eh4drfQ4LC+O9994DYOTIkRw6dIjPPvuMgIAAtV7v3r3VPjp27Fh8fHyYOHGi+sXMsGHD6N27d4HPYerUqYwaNYphw4apx3Lv93UmI8ZCCCGEEKLEUBQFeDKltDDGxsZqUgxQvnx5bty4AcDNmzf5448/GDVqFG5uburPvHnzuHLlilY7tWrVem5Mu3btwtvbu0jvPTdr1oz79+9z6NAh1q1bV+BCUf+Ftm3bkpGRgY+PD6NHj2b79u1kZWWp5d27d2fTpk1kZGSQlZXFmjVr1MStTJkyhIWF0bJlS9q1a8fnn3/OtWvXCr1efHw89+7dw8rKSn3P1czMjOTkZK2pvk5OTpibm6ufy5cvT9WqVdHR0dE6lt+oa6709HRiY2Np3759gXV8fX25dOkSe/bsoWPHjpw+fZomTZpozSTo27cva9asISMjg8zMTFatWqWVvAJUq1ZNa39nW1tbNbbExET09PTw9vZWy6tUqZLvKPbTdXLPzU3SczVq1IjExEStY7lfXMCT5wJoTQEvX748GRkZpKen57lmWloaf/zxR77T5l93MmIshBBCCCHeGE5OTmg0Gs6fP0+rVq3ylCclJWFpaUmZMmUKbefZhYw0Go2aVOfk5ADw6aefqiOmuZ5OeIA807nzU5Rp1Ln09PTo2LEjs2bN4tixYyxfvjxPnacTwlxPJ7B/l729Pfv27WP//v3s37+f8ePHs2jRIr777jv09fVp3rw5BgYG7NixgwcPHvDo0SM6duyonh8VFcXQoUPZsWMH69at48MPP2T37t0FjuLm5ORga2tLTExMnrKnE8X8flf5Hcv9veVn+/bteHh44OjoWOgz0NfXp0mTJjRp0oQPPv
iAqVOnMmXKFMaOHYuBgQHt2rXD0NCQ77//HkNDwzzPoKB4c2Mr6hc3AKampnmOPXueoih5jj19/dyy/I7l97yK0p9fVzJiLIQQQggh3hhlypTB19eXlStX8vDhQ62ytLQ0Nm7cSLt27bQSguzs7Be6Rrly5bCxseHKlSs4Oztr/Tg4OLxQW/fv3ycuLo4WLVoU+ZyuXbvy888/06JFi3xHEnOT/qcXCzt9+nShbRoYGBTpORgbG9OiRQs+/vhjvv32W+Lj4zl79izwJGnv3Lkz69evZ8uWLYSEhORZLdvT05Nx48YRFxdH9erVWb16dYHXr1OnDqmpqejp6eHq6qr1U7Zs2efG+iI2bdpU6GhxQapWrUpWVhYZGRnAk2fQq1cvoqKiiIqKomvXri+0YriHhwdZWVnqu74A586d4/bt20U698CBA1rH4uLi/tU9uc3NzXFycnojt2+SEWMhhBBCCPFGmTp1KkFBQYSGhjJmzBgqVqzI+fPnmTp1KjY2NowdO1atW7FiRX755ReCgoIwNDR87khyrlGjRjFx4kTMzc0JCAjg8ePHnDhxgtu3b9O/f/8ixxodHY2zs/NzRyqf5ubmxsmTJwscvTM2NqZOnTosWLCAihUr8tdffzFz5sxC26xQoQL3799n//79VKtWDWNj4zztr1u3jpycHDw9PTE2Nua7777DyMhIazXvbt26sWTJEgB19W948u730qVLad++PXZ2dpw7d47z58/Ts2dP4MlIf3JyMgkJCVSoUAFzc3MCAwPx8fEhODiYyMhI3N3d+eOPP9i2bRvBwcF5phL/XVlZWWzfvp2ffvqp0Hr+/v5069YNb29vrKysOHPmDOPHjycgIIBSpUqp9d599101GT148OALxeLu7k6rVq3o27cvS5cuRU9Pj+HDhxdppHb06NF06dJFXaBsy5YtbNy48bn39aImT57MgAEDsLa25q233uLu3bscPHiQIUOG/KvXedlkxFgIIYQQQrxRXFxc2L59O46OjgwcOJBGjRoxZswYGjZsyObNm7Xe5X3//ff57bffaNSoUb5b7RSke/fufPbZZ6xfv57AwEA6derE+vXrX3jEeOfOnS80WpyrTJkyhSZLs2fPJjMzk7feeouIiAjGjBlTaHt169blnXfeYeDAgdSoUYOFCxfmqWNhYcGqVasIDg4mMDCQAwcOsGLFCq0vE1xcXPDy8sLBwUFdPArAxMSEs2fP0rFjRypXrky/fv0YPHiw+iVCx44dadWqFQEBAZQrV441a9ag0WjYtm0bvr6+hIeHU7lyZbp27crly5fVd2P/DbGxsZiZmeHl5VVovZYtW7Jy5UpatGiBh4cHQ4YMoWXLlnlWIndzc6Nhw4a4u7trPYOiioqKomLFivj5+dGhQwd1u6rnCQ4O5vPPP+fTTz+lWrVqLFmyhKioKPz9/V84hsL06tWLuXPnsnDhQqpVq0bbtm25cOHCv3qN4qBRcieyi9dOeno6FhYWeCzyQNdYN98662f88/dJXnXbalUq7hCEEEI8I8R57PMriVdelrmGuwGmONo5YKRnUKyx3NDJf4uc11l2djY1a9bkm2++yfOu8utKURR8fX1p3749n332WZ53rl9FQ4cOJSsrK98vA/4ORVGoUqUK/fv3Z+TIkf9Km+Lvy8jIIDk5GWdnZ4yMjAqsJ1OphRBCCCGEKAa3bt2ib9++1K5du7hD+VfcuHGDDRs2kJqaSrt27Yo7nCKrXr06Pj4+/0pbaWlpfP3116SkpBS65ZF49UhiLIQQQgghRDEoW7Ysw4cPL+4w/jW1atWiTJkyfPLJJ1rv3L7qcvdD/jeUL1+esmXLsnTp0iJtvyVeHZIYCyGEEEIIIf6xlJQU4Mk2P6mpqcUcTfGQt1RfX7L4lhBCCCGEEEKIEk0SYyGEEEIIIYQQJZokxkIIIYQQQgghSjRJjIUQQgghhBBClGiSGAshhBBCCCGEKNEkMRZCCCGEEEIIUaJJYiyEEE
IIIUQJYW9vz44dO176dTUaDT/88MNLv64QRSX7GAshhBBCiDfKjRs3mDlzJtHR0dy4cQMLCwuqVq3KyJEj8fb2Lu7w/nXDhw/n22+/BUBPTw9LS0s8PDwIDg6mS5cu6Oj831jYsWPHsLCwKK5QhXhlSWIshBBCCCFeyP7ztV/q9ZpUTnih+n379iUzM5O5c+fi6OjI9evXOXDgALdv3/5P4vu3ZGZmoq+v/7fODQgIYPbs2WRnZ3Pjxg2io6OJiIjgxx9/JCoqCj29J//st7a2/jdD1vJP4n+ex48fY2Bg8J+0LQTIVGohhBBCCPEGuXPnDocPH2bChAk0atSIChUq4OnpyZAhQwgMDFTrpaenM2bMGGrWrIm7uzudO3fm9OnTavmsWbNo3rw5GzZsoH79+lSpUoWBAwdy7949tc7WrVtp1qwZlSpVolq1aoSEhPDgwQMAcnJymDNnDl5eXjg7O9O8eXOio6PVc3/77Tfs7e3ZvHkznTp1wsXFhVWrVuHu7s7WrVu17mnXrl24urpqXftZBgYGWFtbY2trS40aNRg6dChfffUVe/fuZf369Wq9p6dSP378mAkTJuDp6YmLiwv169fniy++UOumpKTQu3dv3NzccHd3p3///ly/fj3PM1q7di0+Pj44OzujKArJycn069cPU1NTqlatyu7du/PEm5KSQkhICKVLl8bKyoqgoCAuX76sloeFhREcHMyMGTOws7OjcuXKACxcuBA3NzeMjIwoX748nTp1KvCZCPEiJDEWQgghhBBvDFNTU0xNTdmxYwePHj3Kt46iKPTs2ZO0tDS+/vprtm/fTo0aNQgJCeHWrVtqvStXrrBz505WrlzJypUrOXToEPPnzwfgzz//ZNCgQYSEhBATE8OGDRt46623UBQFgOXLl7NkyRIiIiLYvXs3/v7+9O7dm0uXLmnFMn36dMLDw4mJiaFVq1YEBQWxbt06rTrr16+nTZs2mJmZvdCzaNy4MVWrVmX79u35ln/11Vfs2rWLxYsXs2/fPr744gsqVqyoPqPw8HBu377Nd999x5o1a7hy5QoDBw7UauPy5cts2bKFZcuWsWvXLnJycujbty86OjocPHiQxYsXM3bsWK1zHjx4QEBAAGZmZuzbt48DBw5gZmZGq1atePz4sVpvz549JCYmsnv3brZu3crRo0cZOnQoU6ZM4dy5c+zYsQNfX98XeiZCFESmUgshhBBCiDeGnp4ec+bMYcyYMXzzzTdUr16dBg0aEBQURNWqVQE4ePAgZ8+e5fjx4xgaGgIQERHBzp07+fHHH+nRowfwf6O+uQlpx44dOXDgAABpaWlkZWXRunVrKlSoAICHh4cax5IlS3jvvfcICgoCYMKECcTFxbF8+XKmT5+u1nv33Xdp3bq1+rlbt24EBQWRmpqKjY0Nf/31Fz/99BNr1qz5W8/D1dWVxMTEfMtSUlJwdnamXr16aDQa9T4A9u/fT2JiIj///DP29vYAzJs3j4CAABISEqhduzbwZPr0vHnzsLKyAiA2NpaLFy+yadMmateuja6uLtOnT+ett95S2167di06OjosX74cjUYDQFRUFJaWlsTExNCiRQvgyZccy5cvV6dQb9y4EVNTU9q2bYu5uTmOjo54enr+recixLNkxFgIIYQQQrxR2rRpQ3x8PFFRUfj7+/Pzzz/TqlUrdST25MmT3L9/n+rVq+Pm5qb+XL16lStXrqjtVKxYUWuU1tramps3bwJQtWpVGjduTLNmzejXrx+rVq1S32G+e/cuqamp1K1bVysub29vLly4oHWsVq1aWp89PT2pXLkyGzZsAGDDhg3Y29vToEGDv/UsFEVRk89ndenShdOnT9OkSRMmTpxIbGysWnbhwgXs7OzUpBigcuXKWFhYaN2Dvb29mhTnnmdvb0/58uXVYz4+PlrXjY+P5+LFi5ibm2NmZoaZmRllypQhIyODpKQktV6NGjW03itu3rw5jo6OuLi48M4777Bq1S
p16roQ/5SMGAshhBBCiDeOkZERvr6++Pr6MmLECN5//31mzZpFSEgIOTk5WFtbq8nn055esTl3wapcGo2GnJwcAHR1dVm7di1Hjx4lNjaWqKgoIiMj2bp1K6VLl1brPy2/JNXY2DhPDN27dycqKorBgwezfv16unTpUmBy+zwXL15Up0c/q0aNGhw6dIi9e/dy4MABBgwYQOPGjVm2bFmBCfWzx01MTPKUP+vZdnJycvDy8mLVqlV56pYrV079b1NTU60yc3Nzfv31V2JiYti1axcRERFMnjyZI0eOYGlpme89ClFUkhi/AQ51P0SpUqXyL+z1cmMpDh7PryKEEEKIvyEjI4OHyckY2JhiYGT0fwXnX24cBhXMscP8H7Xh7e3Nrl27sLOzw9/fn8jISOzt7XFycsq3vrm5Ofr6+tjZ2anHLCws0NXV1Tpmb29PUFAQn376KY6OjsTFxTFy5Ejs7Ow4e/YsHTp0UOueOHGCevXqYWdnp75La21trdUewKBBg5g2bRobNmzg3LlzatwFMTEx4fHjx3nq7N27l8TEREaPHq1VVqZMGa3P7u7uDBw4kJ07d9KqVSuMjIzw8fFhypQpZGdnq4n1mTNnSE9Pp2HDhtjZ2eX7jHx8fPj444+1Fun6+eefteKqU6cO69atw9rauuB/wxZAT0+PwMBAAgMDmTRpEpaWluzdu1frOQvxd0hiLIQQQggh3hg3b96kc+fOhIeHU7NmTczNzTl69CgzZ85U3/cNDAzEx8eH4OBgIiMjcXd3548//mDbtm0EBwcXaa/jX375hT179tCiRQusra355ZdfuH79uvqe8ejRo5k0aRKVKlWidu3aREVFkZCQkO8o6bNKly5Nhw4dGD16NC1atNB697cgjx49IjU1lezsbP7880927NjBjBkzaNu2LT179sz3nDlz5mBra0vt2rXR0dHh22+/xcbGBktLSwIDA6lZsyahoaHMnTuXrKws3nvvPfz8/Ap9PoGBgbi7uzNp0iTKly/P/fv3mTBhglad0NBQPv30U4KCgpgyZQoVKlTg6tWrbNy4kdGjRxd4v1u3buXSpUv4+vpSunRptm3bRk5ODu7u7s99PkI8jyTGQgghhBDijWFmZkb9+vWZM2cOSUlJZGZmUrFiRfr27cv48eOBJ1N7t23bxoQJEwgPD+f69evY2Njg6+ur9W5sYUqVKsW+ffuYO3cu6enpODo6MmvWLHWRqaFDh5Kens6oUaNIS0ujatWqbN68GTc3tyK136dPH1avXk14eHiR6u/YsQNbW1v09PQoXbo0tWrVYt68efTq1QsdnfyXFTIzMyMyMpILFy6gq6tL3bp12bZtm1r/hx9+YMiQIfj6+qKjo0OrVq20tnPKj46ODhs2bKB79+74+Pjg5OTEvHnzaNWqlVrHxMSEffv2MXbsWDp06MDdu3ext7enWbNmhY4gW1pasnHjRiZPnkxGRgZubm6sWbOGatWqFekZCVEYjZLfiwDitZCeno6FhQV37tx54WkoQgghhBDPk5GRQXJyMs7Ozhg9PZVa/OdWrVrFsGHD+OOPP7QWoHodZGdnc+zYMTw9PdHV1S3ucEQJV9S/YzJiLIQQQgghxCviwYMHJCcnM2PGDPr37//aJcVCvK5kuyYhhBBCCCFeETNnzqR27dqUL1+ecePGFXc4QpQYkhgLIYQQQgjxipg8eTKZmZns2bNHaw9lIcR/SxJjIYQQQgghhBAlmiTGQgghhBBCCCFKNEmMhRBCCCFEoWQTEyHE66qof78kMRZCCCGEEPnS19cHnqyULIQQr6Pcv1+5f88KIts1CSGEEEKIfOnq6mJpaUlaWhoAJiYmaDSaYo5KvOqys7OBJ/vHyj7GorgoisKDBw9IS0vD0tLyuX1REmMhhBBCCFEgGxsbADU5FuJ5cnJyuHHjBpcvX0ZHRyaoiuJlaWmp/h0rjEaRl0ZeW+np6VhYWHDnzh1KlSpV3OEIIYQQ4g2WnZ1NZm
ZmcYchXgP37t3D29ubo0ePypZToljp6+sXedaCjBgLIYQQQojn0tXVlWmxokgeP37MlStXMDAwwMjIqLjDEaJIZG6DEEIIIYQQQogSTRJjIYQQQgghhBAlmiTGQgghhBBCCCFKNHnH+DWWu25aenp6MUcihBBCCCHEE7n/NpU1fsXrRBLj19jNmzcBqFixYjFHIoQQQgghhLabN29iYWFR3GEIUSSSGL/GypQpA8DVq1flj47IIz09nYoVK/Lbb7/Jdl4iD+kf4nmkj4jCSP8Qhblz5w4ODg7qv1WFeB1IYvway90w3cLCQv6nJApUqlQp6R+iQNI/xPNIHxGFkf4hCpP7b1UhXgfSW4UQQgghhBBClGiSGAshhBBCCCGEKNEkMX6NGRoaMmnSJAwNDYs7FPEKkv4hCiP9QzyP9BFRGOkfojDSP8TrSKPIOupCCCGEEEIIIUowGTEWQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJonxa2rhwoU4OztjZGSEl5cX+/fvL+6QRDGYMWMGdevWxdzcHGtra4KDgzl37pxWHUVRmDx5MnZ2dhgbG+Pv78/p06eLKWJRnGbMmIFGo2H48OHqMekfIiUlhR49emBlZYWJiQm1a9cmPj5eLZc+UnJlZWXx4Ycf4uzsjLGxMS4uLkyZMoWcnBy1jvSPkmPfvn20a9cOOzs7NBoNP/zwg1Z5UfrCo0ePGDJkCGXLlsXU1JT27dvz+++/v8S7EKJgkhi/htatW8fw4cOZMGECx44do0mTJrz11ltcvXq1uEMTL1lsbCyDBg3i0KFD7N69m6ysLFq0aMH9+/fVOjNnzmT27NnMnz+fI0eOYGNjQ/Pmzbl7924xRi5etiNHjrB06VJq1qypdVz6R8l269YtGjVqhL6+Ptu3b+fMmTPMmjULS0tLtY70kZIrMjKSxYsXM3/+fBITE5k5cyaffvopX3zxhVpH+kfJcf/+fWrVqsX8+fPzLS9KXxg+fDjff/89a9eu5cCBA9y7d4+2bduSnZ39sm5DiIIp4rVTr149ZcCAAVrHqlSponzwwQfFFJF4VaSlpSmAEhsbqyiKouTk5Cg2NjbKJ598otbJyMhQLCwslMWLFxdXmOIlu3v3ruLm5qbs3r1b8fPzU4YNG6YoivQPoShjx45VGjduXGC59JGSrU2bNkp4eLjWsQ4dOig9evRQFEX6R0kGKN9//736uSh94fbt24q+vr6ydu1atU5KSoqio6Oj7Nix46XFLkRBZMT4NfP48WPi4+Np0aKF1vEWLVoQFxdXTFGJV8WdO3cAKFOmDADJycmkpqZq9RdDQ0P8/Pykv5QggwYNok2bNgQGBmodl/4hNm/ejLe3N507d8ba2hpPT0+WLVumlksfKdkaN27Mnj17OH/+PADHjx/nwIEDtG7dGpD+If5PUfpCfHw8mZmZWnXs7OyoXr269BfxStAr7gDEi7lx4wbZ2dmUL19e63j58uVJTU0tpqjEq0BRFEaOHEnjxo2pXr06gNon8usvV65ceekxipdv7dq1/Prrrxw5ciRPmfQPcenSJRYtWsTIkSMZP348hw8fZujQoRgaGtKzZ0/pIyXc2LFjuXPnDlWqVEFXV5fs7GymTZtGt27dAPkbIv5PUfpCamoqBgYGlC5dOk8d+TeseBVIYvya0mg0Wp8VRclzTJQsgwcP5sSJExw4cCBPmfSXkum3335j2LBh7Nq1CyMjowLrSf8ouXJycvD29mb69OkAeHp6cvr0aRYtWkTPnj3VetJHSqZ169bxzTffsHr1aqpVq0ZCQgLDhw/Hzs6OXr16qfWkf4hcf6cvSH8RrwqZSv2aKVu2LLq6unm+WUtLS8vzLZ0oOYYMGcLmzZuJjo6mQoUK6nEbGxsA6S8lVHx8PGlpaXh5eaGnp4eenh6xsbHMmzcPPT09tQ9I/yi5bG1tqVq1qtYxDw8PdTFH+RtSso0ePZoPPviArl27UqNGDd
555x1GjBjBjBkzAOkf4v8UpS/Y2Njw+PFjbt26VWAdIYqTJMavGQMDA7y8vNi9e7fW8d27d9OwYcNiikoUF0VRGDx4MBs3bmTv3r04OztrlTs7O2NjY6PVXx4/fkxsbKz0lxKgWbNmnDx5koSEBPXH29ub0NBQEhIScHFxkf5RwjVq1CjPFm/nz5/H0dERkL8hJd2DBw/Q0dH+p6Kurq66XZP0D5GrKH3By8sLfX19rTrXrl3j1KlT0l/EK0GmUr+GRo4cyTvvvIO3tzc+Pj4sXbqUq1evMmDAgOIOTbxkgwYNYvXq1WzatAlzc3P1m1oLCwuMjY3VPWunT5+Om5sbbm5uTJ8+HRMTE7p3717M0Yv/mrm5ufq+eS5TU1OsrKzU49I/SrYRI0bQsGFDpk+fTpcuXTh8+DBLly5l6dKlAPI3pIRr164d06ZNw8HBgWrVqnHs2DFmz55NeHg4IP2jpLl37x4XL15UPycnJ5OQkECZMmVwcHB4bl+wsLCgT58+jBo1CisrK8qUKcP7779PjRo18iwOKUSxKLb1sMU/smDBAsXR0VExMDBQ6tSpo27PI0oWIN+fqKgotU5OTo4yadIkxcbGRjE0NFR8fX2VkydPFl/Qolg9vV2Tokj/EIqyZcsWpXr16oqhoaFSpUoVZenSpVrl0kdKrvT0dGXYsGGKg4ODYmRkpLi4uCgTJkxQHj16pNaR/lFyREdH5/tvjl69eimKUrS+8PDhQ2Xw4MFKmTJlFGNjY6Vt27bK1atXi+FuhMhLoyiKUkw5uRBCCCGEEEIIUezkHWMhhBBCCCGEECWaJMZCCCGEEEIIIUo0SYyFEEIIIYQQQpRokhgLIYQQQgghhCjRJDEWQgghhBBCCFGiSWIshBBCCCGEEKJEk8RYCCGEEEIIIUSJJomxEEIIIYQQQogSTRJjIYQQ4h9aunQpFStWREdHh7lz577Ua69YsQJLS8uXek0hhBDiTSOJsRBCvIHCwsLQaDRoNBr09fVxcXHh/fff5/79+8Ud2nM5OTm99OTyn0hPT2fw4MGMHTuWlJQU+vXrp1X+559/oq+vzzfffJPv+f3796dmzZp/+/ohISGcP3/+b5+fn8uXL6PRaEhISPhX2xVCCCFeVZIYCyHEG6pVq1Zcu3aNS5cuMXXqVBYuXMj777//t9pSFIWsrKx/OcI3w9WrV8nMzKRNmzbY2tpiYmKiVV6+fHnatGlDVFRUnnMfPnzI2rVr6dOnz9+6dmZmJsbGxlhbW/+t84UQQgjxhCTGQgjxhjI0NMTGxoaKFSvSvXt3QkND+eGHH4Anie7MmTNxcXHB2NiYWrVqsWHDBvXcmJgYNBoNO3fuxNvbG0NDQ/bv309OTg6RkZG4urpiaGiIg4MD06ZNU89LSUkhJCSE0qVLY2VlRVBQEJcvX1bLw8LCCA4O5rPPPsPW1hYrKysGDRpEZmYmAP7+/ly5coURI0aoI94AN2/epFu3blSoUAETExNq1KjBmjVrtO737t27hIaGYmpqiq2tLXPmzMHf35/hw4erdR4/fsyYMWOwt7fH1NSU+vXrExMTU+hzvHr1KkFBQZiZmVGqVCm6dOnCn3/+CTyZxlyjRg0AXFxc0Gg0Wvebq0+fPkRHR+cp27BhAxkZGfTo0YMdO3bQuHFjLC0tsbKyom3btiQlJal1c0dx169fj7+/P0ZGRnzzzTd5plInJSURFBRE+fLlMTMzo27duvz0009a13VycmL69OmEh4djbm6Og4MDS5cuVcudnZ0B8PT0RKPR4O/vDzzpF/Xq1cPU1BRLS0saNWrElStXCn1+QgghxOtAEmMhhCghjI2N1QT0ww8/JCoqikWLFnH69GlGjBhBjx49iI2N1TpnzJgxzJgxg8TERGrWrMm4ceOIjIxk4sSJnDlzhtWrV1O+fHkAHjx4QEBAAGZmZuzbt48DBw5gZmZGq1atePz4sdpmdHQ0SUlJREdHs3LlSlasWMGKFSsA2LhxIx
UqVGDKlClcu3aNa9euAZCRkYGXlxdbt27l1KlT9OvXj3feeYdffvlFbXfkyJEcPHiQzZs3s3v3bvbv38+vv/6qdT+9e/fm4MGDrF27lhMnTtC5c2datWrFhQsX8n1miqIQHBzMX3/9RWxsLLt37yYpKYmQkBDgyTTm3KTz8OHDXLt2jYoVK+Zpp3Xr1tjY2Kj3meurr74iODgYKysr7t+/z8iRIzly5Ah79uxBR0eHt99+m5ycHK1zxo4dy9ChQ0lMTKRly5Z5rnXv3j1at27NTz/9xLFjx2jZsiXt2rXj6tWrWvVmzZqFt7c3x44d47333mPgwIGcPXtWvReAn376iWvXrrFx40aysrIIDg7Gz8+PEydO8PPPP9OvXz/1ywshhBDitaYIIYR44/Tq1UsJCgpSP//yyy+KlZWV0qVLF+XevXuKkZGREhcXp3VOnz59lG7duimKoijR0dEKoPzwww9qeXp6umJoaKgsW7Ys32t++eWXiru7u5KTk6Mee/TokWJsbKzs3LlTjcvR0VHJyspS63Tu3FkJCQlRPzs6Oipz5sx57j22bt1aGTVqlBqbvr6+8u2336rlt2/fVkxMTJRhw4YpiqIoFy9eVDQajZKSkqLVTrNmzZRx48ble41du3Ypurq6ytWrV9Vjp0+fVgDl8OHDiqIoyrFjxxRASU5OLjTesWPHKo6OjurzuXTpkqLRaNRn86y0tDQFUE6ePKkoiqIkJycrgDJ37lytelFRUYqFhUWh165ataryxRdfqJ8dHR2VHj16qJ9zcnIUa2trZdGiRVrXOnbsmFrn5s2bCqDExMQUei0hhBDidSQjxkII8YbaunUrZmZmGBkZ4ePjg6+vL1988QVnzpwhIyOD5s2bY2Zmpv7873//05q6C+Dt7a3+d2JiIo8ePaJZs2b5Xi8+Pp6LFy9ibm6utlmmTBkyMjK02q1WrRq6urrqZ1tbW9LS0gq9l+zsbKZNm0bNmjWxsrLCzMyMXbt2qaOgly5dIjMzk3r16qnnWFhY4O7urn7+9ddfURSFypUra913bGxsnvt++p4rVqyoNQpctWpVLC0tSUxMLDTmZ/Xp04crV66wd+9e4MlocYUKFQgMDASeTIHu3r07Li4ulCpVSp3O/OxI79O/k/zcv3+fMWPGqHGamZlx9uzZPO08veCXRqPBxsam0N9DmTJlCAsLU0egP//8c3VEXwghhHjd6RV3AEIIIf4bAQEBLFq0CH19fezs7NDX1wcgOTkZgB9//BF7e3utcwwNDbU+m5qaqv9tbGxc6PVycnLw8vJi1apVecrKlSun/nduHLk0Gk2e6cLPmjVrFnPmzGHu3LnUqFEDU1NThg8frk7RVhRFbetpucdz49PV1SU+Pl4rMQcwMzPL97qKouQ7Vbig44Vxc3OjSZMmREVFERAQwMqVK+nduzc6Ok++o27Xrh0VK1Zk2bJl2NnZkZOTQ/Xq1bWmoYP27yQ/o0ePZufOnXz22We4urpibGxMp06d8rTzd34PUVFRDB06lB07drBu3To+/PBDdu/eTYMGDYr6GIQQQohXkiTGQgjxhjI1NcXV1TXP8apVq2JoaMjVq1fx8/Mrcntubm4YGxuzZ88e3n333TzlderUYd26dVhbW1OqVKm/HbeBgQHZ2dlax/bv309QUBA9evQAniS5Fy5cwMPDA4BKlSqhr6/P4cOH1dHd9PR0Lly4oN6jp6cn2dnZpKWl0aRJkyLFUrVqVa5evcpvv/2mtnvmzBnu3LmjXvtF9OnTh4EDBxIUFMTvv/9O7969gSeLiyUmJrJkyRI1tgMHDrxw+/DkWYWFhfH2228DT945zm9BsMIYGBgA5Pk9wJPn6Onpybhx4/Dx8WH16tWSGAshhHjtyVRqIYQoYczNzXn//fcZMWIEK1euJCkpiWPHjrFgwQJWrlxZ4HlGRkaMHTuWMWPGqNOuDx06xJdffglAaGgoZcuWJSgoiP3795OcnExsbCzDhg3j99
9/L3J8Tk5O7Nu3j5SUFG7cuAGAq6sru3fvJi4ujsTERPr3709qaqrWPfXq1YvRo0cTHR3N6dOnCQ8PR0dHRx3ZrVy5MqGhofTs2ZONGzeSnJzMkSNHiIyMZNu2bfnGEhgYSM2aNQkNDeXXX3/l8OHD9OzZEz8/v+dOac5P586d0dfXp3///jRr1gwnJycAdRXvpUuXcvHiRfbu3cvIkSNfuH148qw2btxIQkICx48fp3v37s8dCX6WtbU1xsbG7Nixgz///JM7d+6QnJzMuHHj+Pnnn7ly5Qq7du3i/Pnzf+sLAiGEEOJVI4mxEEKUQB9//DERERHMmDEDDw8PWrZsyZYtW9T3WgsyceJERo0aRUREBB4eHoSEhKjvpZqYmLBv3z4cHBzo0KEDHh4ehIeH8/DhwxcaQZ4yZQqXL1+mUqVK6hTsiRMnUqdOHVq2bIm/vz82NjYEBwdrnTd79mx8fHxo27YtgYGBNGrUCA8PD4yMjNQ6UVFR9OzZk1GjRuHu7k779u355Zdf8l1JGp5ML/7hhx8oXbo0vr6+BAYG4uLiwrp164p8P08zMTGha9eu3Lp1i/DwcPW4jo4Oa9euJT4+nurVqzNixAg+/fTTv3WNOXPmULp0aRo2bEi7du1o2bIlderUeaE29PT0mDdvHkuWLMHOzo6goCBMTEw4e/YsHTt2pHLlyvTr14/BgwfTv3//vxWnEEII8SrRKE+/gCWEEEK8Ie7fv4+9vT2zZs2iT58+xR2OEEIIIV5h8o6xEEKIN8KxY8c4e/Ys9erV486dO0yZMgWAoKCgYo5MCCGEEK86SYyFEEK8MT777DPOnTuHgYEBXl5e7N+/n7JlyxZ3WEIIIYR4xclUaiGEEEIIIYQQJZosviWEEEIIIYQQokSTxFgIIYQQQgghRIkmibEQQgghhBBCiBJNEmMhhBBCCCGEECWaJMZCCCGEEEIIIUo0SYyFEEIIIYQQQpRokhgLIYQQQgghhCjRJDEWQgghhBBCCFGiSWIshBBCCCGEEKJE+380kQSRJegeQgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# Disease categories\n", + "categories = [\n", + " \"Benign\",\n", + " \"Cardiovascular & Hematological\",\n", + " \"Immunological & Hematopoietic\",\n", + " \"Metabolic / Mitochondrial\",\n", + " \"Musculoskeletal & Connective Tissue\",\n", + " \"Neurological & Neurodevelopmental\",\n", + " \"Oncological / Cancer\",\n", + " \"Other / Multisystem / Syndromic\",\n", + " \"Sensory Disorders\"\n", + "]\n", + "\n", + "# Raw counts per dataset\n", + "kegg_counts = [0, 0, 17, 121, 0, 764, 316, 231, 0]\n", + "vep_coding_counts = [17398, 1550, 1876, 1863, 2887, 5715, 1254, 16199, 1341]\n", + "vep_non_snv_counts = [6559, 3145, 1439, 2781, 3605, 7147, 21932, 20197, 2320]\n", + "\n", + "datasets = [kegg_counts, vep_coding_counts, vep_non_snv_counts]\n", + "dataset_labels = ['KEGG', 'VEP Coding', 'VEP Non-SNV']\n", + "\n", + "# Convert counts to percentages\n", + "datasets_perc = []\n", + "for data in datasets:\n", + " total = sum(data)\n", + " perc = [val / total * 100 if total > 0 else 0 for val in data]\n", + " datasets_perc.append(perc)\n", + "\n", + "# Plotting\n", + "fig, ax = plt.subplots(figsize=(10, 5))\n", + "y = np.arange(len(datasets_perc))\n", + "bar_height = 0.5\n", + "\n", + "for i, category in enumerate(categories):\n", + " values = [d[i] for d in datasets_perc]\n", + " left = np.sum([d[:i] for d in datasets_perc], axis=1) if i > 0 else np.zeros(len(datasets_perc))\n", + " ax.barh(y, values, left=left, height=bar_height, label=category)\n", + "\n", + "# Axes and formatting\n", + "ax.set_yticks(y)\n", + "ax.set_yticklabels(dataset_labels)\n", + "ax.set_xlabel(\"Percentage of Variants\")\n", + "ax.set_title(\"Percent Stacked Bar Plot of Disease Categories Across Datasets\")\n", + "ax.legend(loc='lower right', bbox_to_anchor=(1.25, 0))\n", + "\n", + "plt.tight_layout()\n", + 
"plt.savefig(\"three_stacked_bar_plots_percent.svg\", format=\"svg\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ac53b01-53dc-4889-bed8-b254053f0d65", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason/data/KEGG_Data_1.ipynb b/BioReason/data/KEGG_Data_1.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a96c3d000b75f9b30a86d5dab4f96526c7373a8b --- /dev/null +++ b/BioReason/data/KEGG_Data_1.ipynb @@ -0,0 +1,11483 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5077734e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Create and navigate to kegg_data directory\n", + "data_dir = Path('kegg_data')\n", + "data_dir.mkdir(exist_ok=True)\n", + "os.chdir(data_dir)\n", + "\n", + "# Configuration parameters\n", + "CONFIG = {\n", + " # Output directories\n", + " 'network_dir': 'kegg_network',\n", + " 'variant_network_dir': 'network_variant', \n", + " 'variant_info_dir': 'variant_info',\n", + " \n", + " # Reference data paths (update these to point to your reference files)\n", + " 'cosmic_fusion_data': 'data/Cosmic_Fusion_v101_GRCh38.tsv', # Update path as needed\n", + " 'reference_genome': 'data/GRCh38_genomic.fna', # Update path as needed\n", + " \n", + " # Processing parameters\n", + " 'num_threads': 4, # Adjust based on your system\n", + " 'batch_size': 1000\n", + "}\n", + "\n", + "# Create 
required directories\n", + "for dir_name in [CONFIG['network_dir'], CONFIG['variant_network_dir'], CONFIG['variant_info_dir']]:\n", + " Path(dir_name).mkdir(exist_ok=True)\n", + "\n", + "print(f\"Working directory: {os.getcwd()}\")\n", + "print(\"Configuration loaded. Directory structure created.\")\n", + "print(\"\\n📝 Update CONFIG dictionary above with your actual file paths for reference data\")" + ] + }, + { + "cell_type": "markdown", + "id": "b77a0f2c", + "metadata": {}, + "source": [ + "# KEGG Data Processing Pipeline - Part 1: Data Retrieval and Network Analysis\n", + "\n", + "## Overview\n", + "\n", + "This notebook is the first part of a comprehensive KEGG (Kyoto Encyclopedia of Genes and Genomes) data processing pipeline for genetic variant analysis. It focuses on downloading and processing KEGG network data, disease associations, and variant information.\n", + "\n", + "## What This Notebook Does\n", + "\n", + "1. **KEGG Data Retrieval**: Downloads disease lists, network data, and pathway information from KEGG REST API\n", + "2. **Network Analysis**: Processes KEGG network files to identify reference vs disease networks\n", + "3. **Variant Extraction**: Identifies and extracts genetic variants from network data\n", + "4. **Data Filtering**: Cleans and filters variant information for downstream analysis\n", + "5. 
**Reference Data**: Processes genomic reference sequences and chromosome data\n", + "\n", + "## Prerequisites\n", + "\n", + "- Python 3.7+ with required packages (see requirements below)\n", + "- `kegg_pull` package for KEGG data retrieval\n", + "- `seqkit` for sequence processing\n", + "- Internet connection for KEGG API access\n", + "- Sufficient storage space (several GB for full dataset)\n", + "\n", + "## Required Packages\n", + "\n", + "```bash\n", + "pip install kegg-pull biopython pandas\n", + "```\n", + "\n", + "## Directory Structure\n", + "\n", + "This notebook expects and creates the following structure:\n", + "```\n", + "kegg_data/\n", + "├── kegg_diseases.txt\n", + "├── network_pathway.tsv\n", + "├── network_disease.tsv\n", + "├── kegg_network/\n", + "├── network_variant/\n", + "├── variant_info/\n", + "└── output files...\n", + "```\n", + "\n", + "## Important Notes\n", + "\n", + "- **Processing Time**: Full dataset processing can take several hours\n", + "- **Storage Requirements**: ~5-10GB of storage needed for complete dataset\n", + "- **API Limits**: KEGG REST API has rate limits; process may need pausing\n", + "- **Network Access**: Requires stable internet connection for data downloads\n", + "\n", + "## Next Steps\n", + "\n", + "After completing this notebook:\n", + "1. Run `KEGG_Data_2.ipynb` for variant information parsing\n", + "2. 
Run `KEGG_Data_3.ipynb` for final dataset creation with sequences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set up paths and parameters for the data processing pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4297e63d-0309-45c4-920b-7a5cc1f42771", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d48693e3", + "metadata": {}, + "outputs": [], + "source": [ + "curl -s \"https://rest.kegg.jp/list/disease\" > kegg_diseases.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6e489c3f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KEGG_data.ipynb\t\tclassify.py\t\tmodel.py\n", + "LICENSE\t\t\tdataset.py\t\tmodel_decoder.py\n", + "README.md\t\tdna_classifier.py\tplayground.ipynb\n", + "baseline.py\t\tfinetune.py\t\trequirements.txt\n", + "baseline_model.py\tkegg_diseases.txt\n" + ] + } + ], + "source": [ + "ls" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0cfda653", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1593\n" + ] + } + ], + "source": [ + "curl -s \"https://rest.kegg.jp/list/network\" | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "4b2c1ed0-90a5-4005-bdb0-41be43070a8b", + "metadata": {}, + "source": [ + "Use kegg_pull for retrieving KEGG data https://github.com/MoseleyBioinformaticsLab/kegg_pull" + ] + }, + { + "cell_type": "markdown", + "id": "998a046f-604c-4378-8563-3df7de0f85c3", + "metadata": {}, + "source": [ + "```python3 -m pip install kegg-pull```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "65894de2-27f1-46c1-9eab-b54c7630fe86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.1.0\n" + ] + } + ], + "source": [ + "kegg_pull -v" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 18, + "id": "63e1de4e-ee4a-4cba-aabe-9a801735e643", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Usage:\n", + " kegg_pull -h | --help Show this help message.\n", + " kegg_pull -v | --version Displays the package version.\n", + " kegg_pull --full-help Show the help message of all sub commands.\n", + " kegg_pull pull ... Pull, separate, and store an arbitrary number of KEGG entries to the local file system.\n", + " kegg_pull entry-ids ... Obtain a list of KEGG entry IDs.\n", + " kegg_pull map ... Obtain a mapping of entry IDs (KEGG or outside databases) to the IDs of related entries.\n", + " kegg_pull pathway-organizer ... Creates a flattened version of a pathways Brite hierarchy.\n", + " kegg_pull rest ... Executes one of the KEGG REST API operations.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull pull -h | --help\n", + " kegg_pull pull database [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=]\n", + " kegg_pull pull entry-ids [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " database Pulls all the entries in a KEGG database.\n", + " The KEGG database from which to pull entries.\n", + " --force-single-entry Forces pulling only one entry at a time for every request to the KEGG web API. This flag is automatically set if is \"brite\".\n", + " --multi-process If set, the entries are pulled across multiple processes to increase speed. Otherwise, the entries are pulled sequentially in a single process.\n", + " --n-workers= The number of sub-processes to create when pulling. 
Defaults to the number of cores available. Ignored if --multi-process is not set.\n", + " --output= The directory where the pulled KEGG entries will be stored. Defaults to the current working directory. If ends in \".zip\", entries are saved to a ZIP archive instead of a directory. Ignored if --print is set.\n", + " --print If set, prints the entries to the screen rather than saving them to the file system. Separates entries by the --sep option if set.\n", + " --sep= The string that separates the entries which are printed to the screen when the --print option is set. Ignored if the --print option is not set. Defaults to printing the entry id, followed by the entry, followed by a newline.\n", + " --entry-field= Optional field to extract from the entries pulled rather than the standard flat file format (or \"htext\" in the case of brite entries).\n", + " --n-tries= The number of times to attempt a KEGG request before marking it as timed out or failed. Defaults to 3.\n", + " --time-out= The number of seconds to wait for a KEGG request before marking it as timed out. Defaults to 60.\n", + " --sleep-time= The amount of time to wait after a KEGG request times out (or potentially blacklists with a 403 error code) before attempting it again. Defaults to 5.0.\n", + " --ut= If set, the ratio of unsuccessful entry IDs (failed or timed out) to total entry IDs at which kegg_pull quits. Valid values are between 0.0 and 1.0 non-inclusive.\n", + " entry-ids Pulls entries specified by a comma separated list. Or from standard input: one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull pull entry-ids - ...).\n", + " Comma separated list of entry IDs to pull (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input. 
Will likely need to set --force-single-entry if any of the entries are from the brite database.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull entry-ids -h | --help\n", + " kegg_pull entry-ids database [--output=]\n", + " kegg_pull entry-ids keywords [--output=]\n", + " kegg_pull entry-ids molec-attr (--formula=|--em=...|--mw=...) [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " database Pulls all the entry IDs within a given database.\n", + " The KEGG database from which to pull a list of entry IDs.\n", + " --output= Path to the file (either in a directory or ZIP archive) to store the output (1 entry ID per line). Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt).\n", + " keywords Searches for entries within a database based on provided keywords.\n", + " Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to \"-\", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest find brite - ...).\n", + " molec-attr Searches a database of molecule-type KEGG entries by molecular attributes.\n", + " --formula= Sequence of atoms in a chemical formula format to search for (e.g. \"O5C7\" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms).\n", + " --em= Either a single number (e.g. \"--em=155.5\") or two numbers (e.g. \"--em=155.5 --em=244.4\"). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. 
If two numbers, searches for molecule entries with an exact mass within the two values (a range).\n", + " --mw= Same as \"--em=\" but searches based on the molecular weight.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull map -h | --help\n", + " kegg_pull map conv [--reverse] [--output=]\n", + " kegg_pull map link [--deduplicate] [--add-glycans] [--add-drugs] [--output=]\n", + " kegg_pull map (link|conv) entry-ids [--reverse] [--output=]\n", + " kegg_pull map link [--deduplicate] [--add-glycans] [--add-drugs] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " conv Converts the output of the KEGG \"conv\" operation into a JSON mapping.\n", + " The name of the KEGG database with entry IDs mapped to the outside database.\n", + " The name of the outside database with entry IDs mapped from the KEGG database.\n", + " --reverse Reverses the mapping with the target becoming the source and the source becoming the target.\n", + " --output= The location (either a directory or ZIP archive) of the JSON file to store the mapping. If not set, prints a JSON representation of the mapping to the console. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json).\n", + " link Converts the output of the KEGG \"link\" operation into a JSON mapping.\n", + " The name of the database with entry IDs mapped to the target database.\n", + " The name of the database with entry IDs mapped from the source database.\n", + " --deduplicate Some mappings including pathway entry IDs result in half beginning with the normal \"path:map\" prefix but the other half with a different prefix. If set, removes the IDs corresponding to identical entries but with a different prefix. 
Raises an exception if neither the source nor the target database are \"pathway\".\n", + " --add-glycans Whether to add the corresponding compound IDs of equivalent glycan entries. Logs a warning if neither the source nor the target database are \"compound\".\n", + " --add-drugs Whether to add the corresponding compound IDs of equivalent drug entries. Logs a warning if neither the source nor the target database are \"compound\".\n", + " entry-ids Create a mapping to a target database from a list of specific entry IDs.\n", + " Comma separated list of entry IDs (e.g. Id1,Id2,Id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull map entry-ids drug - ...).\n", + " The name of an intermediate KEGG database with which to find cross-references to cross-references e.g. \"kegg_pull map link ko reaction compound\" creates a mapping from ko-to-compound via ko-to-reaction cross-references connected to reaction-to-compound cross-references.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull pathway-organizer [--tln=] [--fn=] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " --tln= Node names in the highest level of the hierarchy to select from. If not set, all top level nodes are traversed to create the mapping of node key to node info. Either a comma separated list (e.g. node1,node2,node3 etc.) or if equal to \"-\", read from standard input one node per line; Press CTRL+D to finalize input or pipe (e.g. cat nodes.txt | kegg_pull pathway-organizer --tln=- ...). 
If both \"--tln\" and \"--fn\" are set as \"-\", one of the lines must be the delimiter \"---\" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second.\n", + " --fn= Names (not keys) of nodes to exclude from the mapping of node key to node info. Neither these nodes nor any of their children will be included. If not set, no nodes will be excluded. Either a comma separated list (e.g. node1,node2,node3 etc.) or if equal to \"-\", read from standard input one node per line; Press CTRL+D to finalize input or pipe (e.g. cat nodes.txt | kegg_pull pathway-organizer --fn=- ...). If both \"--tln\" and \"--fn\" are set as \"-\", one of the lines must be the delimiter \"---\" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second.\n", + " --output= The file to store the flattened Brite hierarchy as a JSON structure with node keys mapping to node info, either a JSON file or ZIP archive. Prints to the console if not set. If saving to a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json).\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + "Usage:\n", + " kegg_pull rest -h | --help\n", + " kegg_pull rest info [--test] [--output=]\n", + " kegg_pull rest list [--test] [--output=]\n", + " kegg_pull rest get [--entry-field=] [--test] [--output=]\n", + " kegg_pull rest find [--test] [--output=]\n", + " kegg_pull rest find (--formula=|--em=...|--mw=...) 
[--test] [--output=]\n", + " kegg_pull rest conv [--test] [--output=]\n", + " kegg_pull rest conv entry-ids [--test] [--output=]\n", + " kegg_pull rest link [--test] [--output=]\n", + " kegg_pull rest link entry-ids [--test] [--output=]\n", + " kegg_pull rest ddi [--test] [--output=]\n", + "\n", + "Options:\n", + " -h --help Show this help message.\n", + " info Executes the \"info\" KEGG API operation, pulling information about a KEGG database.\n", + " The name of the database to pull information about or entry IDs from.\n", + " --test If set, test the request to ensure it works rather than sending it. Print True if the request would succeed and False if the request would fail. Ignores --output if this options is set along with --test.\n", + " --output= Path to the file (either in a directory or ZIP archive) to store the response body from the KEGG web API operation. Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt).\n", + " list Executes the \"list\" KEGG API operation, pulling the entry IDs of the provided database.\n", + " get Executes the \"get\" KEGG API operation, pulling the entries of the provided entry IDs.\n", + " Comma separated list of entry IDs (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest get - ...).\n", + " --entry-field= Optional field to extract from an entry instead of the default entry info (i.e. flat file or htext in the case of brite entries).\n", + " find Executes the \"find\" KEGG API operation, finding entry IDs based on provided queries.\n", + " Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to \"-\", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. 
cat file.txt | kegg_pull rest find brite - ...).\n", + " --formula= Sequence of atoms in a chemical formula format to search for (e.g. \"O5C7\" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms).\n", + " --em= Either a single number (e.g. --em=155.5) or two numbers (e.g. --em=155.5 --em=244.4). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. If two numbers, searches for molecule entries with an exact mass within the two values (a range).\n", + " --mw= Same as --em but searches based on the molecular weight.\n", + " conv Executes the \"conv\" KEGG API operation, converting entry IDs from an outside database to those of a KEGG database and vice versa.\n", + " The name of the KEGG database from which to view equivalent outside database entry IDs.\n", + " The name of the non-KEGG database from which to view equivalent KEGG database entry IDs.\n", + " entry-ids Perform the \"conv\" or \"link\" operation of the form that maps specific provided entry IDs to a target database.\n", + " link Executes the \"link\" KEGG API operation, showing the IDs of entries that are connected/related to entries of other databases.\n", + " The name of the database that the entry IDs of the source database or provided entry IDs are mapped to.\n", + " The name of the database from which cross-references are found in the target database.\n", + " ddi Executes the \"ddi\" KEGG API operation, searching for drug to drug interactions. Providing one entry ID reports all known interactions, while providing multiple checks if any drug pair in a given set of drugs is CI or P. If providing multiple, all entries must belong to the same database.\n", + " Comma separated list of drug entry IDs from the following databases: drug, ndc, or yj (e.g. id1,id2,id3 etc.). Or if equal to \"-\", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. 
cat file.txt | kegg_pull rest ddi - ...).\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull --full-help" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "12e35258-92f8-4ece-9d18-177263d1e97c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N00001\tEGF-EGFR-RAS-ERK signaling pathway\n", + "N00002\tBCR-ABL fusion kinase to RAS-ERK signaling pathway\n", + "N00003\tMutation-activated KIT to RAS-ERK signaling pathway\n", + "N00004\tDuplication or mutation-activated FLT3 to RAS-ERK signaling pathway\n", + "N00005\tMutation-activated MET to RAS-ERK signaling pathway\n", + "N00006\tAmplified EGFR to RAS-ERK signaling pathway\n", + "N00007\tEML4-ALK fusion kinase to RAS-ERK signaling pathway\n", + "N00008\tRET fusion kinase to RAS-ERK signaling pathway\n", + "N00009\tTRK fusion kinase to RAS-ERK signaling pathway\n", + "N00010\tMutation-inactivated PTCH1 to Hedgehog signaling pathway\n", + "N00011\tMutation-activated FGFR3 to RAS-ERK signaling pathway\n", + "N00012\tMutation-activated KRAS/NRAS to ERK signaling pathway\n", + "N00013\tMutation-activated BRAF to ERK signaling pathway\n", + "N00014\tMutation-activated EGFR to RAS-ERK signaling pathway\n", + "N00015\tPDGF-PDGFR-RAS-ERK signaling pathway\n", + "N00016\tPDGF-overexpression to RAS-ERK signaling pathway\n", + "N00017\tMutation-activated SMO to Hedgehog signaling pathway\n", + "N00018\tAmplified PDGFR to RAS-ERK signaling pathway\n", + "N00019\tFGF-FGFR-RAS-ERK signaling pathway\n", + "N00020\tAmplified FGFR to RAS-ERK signaling pathway\n", + "N00021\tEGF-ERBB2-RAS-ERK signaling pathway\n", + "N00022\tERBB2-overexpression to RAS-ERK signaling pathway\n", + "N00023\tEGF-EGFR-PLCG-ERK signaling pathway\n", + "N00024\tMutation-activated EGFR to PLCG-ERK signaling pathway\n", + "N00025\tEML4-ALK fusion kinase to PLCG-ERK signaling pathway\n", + "N00026\tEGF-EGFR-PLCG-CAMK signaling pathway\n", + "N00027\tAmplified EGFR to 
PLCG-CAMK signaling pathway\n", + "N00028\tPDGF-PDGFR-PLCG-CAMK signaling pathway\n", + "N00029\tAmplified PDGFR to PLCG-CAMK signaling pathway\n", + "N00030\tEGF-EGFR-RAS-PI3K signaling pathway\n", + "N00031\tDuplication or mutation-activated FLT3 to RAS-PI3K signaling pathway\n", + "N00032\tMutation-activated KRAS/NRAS to PI3K signaling pathway\n", + "N00033\tEGF-EGFR-PI3K signaling pathway\n", + "N00034\tERBB2-overexpression to PI3K signaling pathway\n", + "N00035\tAmplified EGFR to PI3K signaling pathway\n", + "N00036\tMutation-activated EGFR to PI3K signaling pathway\n", + "N00037\tFGF-FGFR-PI3K signaling pathway\n", + "N00038\tAmplified FGFR to PI3K signaling pathway\n", + "N00039\tPDGF-PDGFR-PI3K signaling pathway\n", + "N00040\tAmplified PDGFR to PI3K signaling pathway\n", + "N00041\tEGFR-overexpression to RAS-ERK signaling pathway\n", + "N00042\tEGFR-overexpression to PI3K signaling pathway\n", + "N00043\tHGF-MET-PI3K signaling pathway\n", + "N00044\tMutation-activated MET to PI3K signaling pathway\n", + "N00045\tKITLG-KIT-PI3K signaling pathway\n", + "N00046\tMutation-activated KIT to PI3K signaling pathway\n", + "N00047\tEML4-ALK fusion kinase to PI3K signaling pathway\n", + "N00048\tBCR-ABL fusion kinase to PI3K signaling pathway\n", + "N00049\tMutation-activated PI3K to PI3K signaling pathway\n", + "N00050\tAmplified PI3K to PI3K signaling pathway\n", + "N00051\tDeleted PTEN to PI3K signaling pathway\n", + "N00052\tMutation-inactivated PTEN to PI3K signaling pathway\n", + "N00053\tCytokine-Jak-STAT signaling pathway\n", + "N00054\tDuplication or mutation-activated FLT3 to Jak-STAT signaling pathway\n", + "N00055\tBCR-ABL fusion kinase to Jak-STAT signaling pathway\n", + "N00056\tWnt signaling pathway\n", + "N00057\tMutation-inactivated APC to Wnt signaling pathway\n", + "N00058\tMutation-activated CTNNB1 to Wnt signaling pathway\n", + "N00059\tFZD7-overexpression to Wnt signaling pathway\n", + "N00060\tLRP6-overexpression to Wnt signaling pathway\n", + 
"N00061\tCDH1-reduced expression to beta-catenin signaling pathway\n", + "N00062\tHedgehog signaling pathway\n", + "N00063\tTGF-beta signaling pathway\n", + "N00064\tMutation-inactivated TGFBR2 to TGF-beta signaling pathway\n", + "N00065\tMutation-inactivated SMAD2 to TGF-beta signaling pathway\n", + "N00066\tMDM2-p21-Cell cycle G1/S\n", + "N00067\tDeleted p14(ARF) to p21-cell cycle G1/S\n", + "N00068\tAmplified MDM2 to p21-cell cycle G1/S\n", + "N00069\tp16-Cell cycle G1/S\n", + "N00070\tMutation-inactivated p16(INK4a) to p16-cell cycle G1/S\n", + "N00071\tDeleted p16(INK4a) to p16-cell cycle G1/S\n", + "N00072\tAmplified CDK4 to cell cycle G1/S\n", + "N00073\tMutation-activated CDK4 to cell cycle G1/S\n", + "N00074\tLoss of RB1 to cell cycle G1/S\n", + "N00075\tMutation-inactivated RB1 to cell cycle G1/S\n", + "N00076\tMutation-inactivated p14(ARF) to p21-cell cycle G1/S\n", + "N00077\tHRAS-overexpression to ERK signaling pathway\n", + "N00078\tMutation-activated HRAS to ERK signaling pathway\n", + "N00079\tHIF-1 signaling pathway\n", + "N00080\tLoss of VHL to HIF-1 signaling pathway\n", + "N00081\tMutation-inactivated VHL to HIF-1 signaling pathway\n", + "N00082\tLoss of NKX3-1 to PI3K signaling pathway\n", + "N00083\tAndrogen receptor signaling pathway\n", + "N00084\tAmplified AR to androgen receptor signaling pathway\n", + "N00085\tMutation-activated AR to androgen receptor signaling pathway\n", + "N00086\tNotch signaling pathway\n", + "N00087\tNOTCH-overexpression to Notch signaling pathway\n", + "N00088\tAmplified MYC to p15-cell cycle G1/S\n", + "N00089\tAmplified MYC to cell cycle G1/S\n", + "N00090\tp15-Cell cycle G1/S\n", + "N00091\tp27-Cell cycle G1/S\n", + "N00092\tAmplified MYC to p27-cell cycle G1/S\n", + "N00093\tLoss of CDKN1B to p27-cell cycle G1/S\n", + "N00094\tEGF-Jak-STAT signaling pathway\n", + "N00095\tERBB2-overexpression to EGF-Jak-STAT signaling pathway\n", + "N00096\tEGF-EGFR-RAS-RASSF1 signaling pathway\n", + "N00097\tLoss of RASSF1 to 
RAS-RASSF1 signaling pathway\n", + "N00098\tIntrinsic apoptotic pathway\n", + "N00099\tMutation-inactivated BAX to apoptotic pathway\n", + "N00100\tBCL2-overexpression to intrinsic apoptotic pathway\n", + "N00101\tDCC-apoptotic pathway\n", + "N00102\tLoss of DCC to DCC-apoptotic pathway\n", + "N00103\tEGF-EGFR-RAS-RalGDS signaling pathway\n", + "N00104\tMutation-activated KRAS to RalGDS signaling pathway\n", + "N00105\tEML4-ALK fusion kinase to Jak-STAT signaling pathway\n", + "N00106\tAML1-EVI1 fusion to TGF-beta signaling pathway\n", + "N00107\tEVI-1 overexpression to TGF-beta signaling pathway\n", + "N00108\tAML1-ETO fusion to transcriptional activtion\n", + "N00109\tPML-RARA fusion to transcriptional activtion\n", + "N00110\tPLZF-RARA fusion to transcriptional activtion\n", + "N00111\tAML1-ETO fusion to CEBPA-mediated transcription\n", + "N00112\tAML1-ETO fusion to PU.1-mediated transcription\n", + "N00113\tPML-RARA fusion to transcriptional repression\n", + "N00114\tPLZF-RARA fusion to transcriptional repression\n", + "N00115\tMutation-inactivated TP53 to transcription\n", + "N00116\tMutation-inactivated RUNX1 to transcription\n", + "N00117\tE2A-PBX1 fusion to transcriptional activation\n", + "N00118\tTEL-AML1 fusion to transcriptional repression\n", + "N00119\tMLL-AF4 fusion to transcriptional activation\n", + "N00120\tMLL-ENL fusion to transcriptional activation\n", + "N00121\tLMO2-rearrangement to transcriptional activation\n", + "N00122\tLMO2-rearrangement to transcriptional repression\n", + "N00123\tAmplified REL to transcription\n", + "N00124\tIGH-MAF fusion to transcriptional activation\n", + "N00125\tIGH-MMSET fusion to transcriptional activation\n", + "N00126\tPAX8-PPARG fusion to PPARG-mediated transcription\n", + "N00127\tPRCC-TFE3 fusion to transcriptional activation\n", + "N00128\tTMPRSS2-ERG fusion to transcriptional activation\n", + "N00129\tTMPRSS2-ERG fusion to transcriptional repression\n", + "N00130\tTMPRSS2-ETV5 fusion to transcriptional 
activation\n", + "N00131\tAmplified MYCN to transcriptional activation\n", + "N00132\tAmplified MYCN to transcriptional repression\n", + "N00133\tEWSR1-FLI1 fusion to transcriptional activation\n", + "N00134\tEWSR1-FLI1 fusion to transcriptional repression\n", + "N00135\tEWSR1-ERG fusion to transcriptional activation\n", + "N00136\tEWSR1-ATF1 fusion to transcriptional activation\n", + "N00137\tEWSR1-WT1 fusion to transcriptional activation\n", + "N00138\tEWSR1-NR4A3\n", + "N00139\tFUS-DDIT3 fusion to CEBPB-mediated transcription\n", + "N00140\tFUS-DDIT3 fusion to NFKB-mediated transcription\n", + "N00141\tPAX3-FOXO1 fusion to transcriptional activation\n", + "N00142\tSYT-SSX fusion to transcriptional repression\n", + "N00143\tASPL-TFE3 fusion to transcriptional activation\n", + "N00144\tTLX1 rearrangement to transcriptional repression\n", + "N00145\tExtrinsic apoptotic pathway\n", + "N00146\tCrosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00147\tEGF-EGFR-PLCG-calcineurin signaling pathway\n", + "N00148\tTLR3-IRF7 signaling pathway\n", + "N00149\tTLR3-IRF3 signaling pathway\n", + "N00150\tType I IFN signaling pathway\n", + "N00151\tTNF-NFKB signaling pathway\n", + "N00152\tCXCR-GNB/G-ERK signaling pathway\n", + "N00153\tCCR/CXCR-GNB/G-PI3K-RAC signaling pathway\n", + "N00154\tCXCR-GNB/G-PI3K-AKT signaling pathway\n", + "N00155\tAutophagy-vesicle nucleation/elongation/maturation, mTORC1-PI3KC3-C1\n", + "N00156\tAutophagy-vesicle nucleation/elongation/maturation, LC3-II formation\n", + "N00157\tKSHV vGPCR to GNB/G-ERK signaling pathway\n", + "N00158\tKSHV vGPCR to GNB/G-PI3K-AKT signaling pathway\n", + "N00159\tKSHV K1 to PI3K signaling pathway\n", + "N00160\tKSHV K1 to RAS-ERK signaling pathway\n", + "N00161\tKSHV vIRF1/2 to TLR3-IRF3 signaling pathway\n", + "N00162\tKSHV vIRF3 to TLR3-IRF7 signaling pathway\n", + "N00163\tKSHV KIE1/2 to TLR3-IRF7 signaling pathway\n", + "N00164\tKSHV vBCL2 to crosstalk between extrinsic and intrinsic apoptotic 
pathways\n", + "N00165\tKSHV vIAP to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00166\tKSHV vFLIP to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00167\tKSHV vIRF1/3 to p21-cell cycle G1/S\n", + "N00168\tKSHV vCyclin to cell cycle G1/S\n", + "N00169\tKSHV LANA to p21-cell cycle G1/S\n", + "N00170\tKSHV LANA to cell cycle G1/S\n", + "N00171\tKSHV vFLIP to NFKB signaling pathway\n", + "N00172\tKSHV K15 to PLCG-calcineurin signaling pathway\n", + "N00173\tKSHV K15 to TNF-NFKB signaling pathway\n", + "N00174\tKSHV vFLIP to TNF-NFKB signaling pathway\n", + "N00175\tKSHV LANA to Wnt signaling pathway\n", + "N00176\tKSHV vFLIP to autophagy-vesicle elongation\n", + "N00177\tKSHV vBCL2 to autophagy-vesicle nucleation\n", + "N00178\tKSHV vGPCR to GNB/G-PI3K-JNK signaling pathway\n", + "N00179\tKSHV K1 to PI3K-NFKB signaling pathway\n", + "N00180\tKSHV K1 to PLCG-calcineurin signaling pathway\n", + "N00181\tKSHV vIL-6 to Jak-STAT signaling pathway\n", + "N00182\tIGF-IGFR-PI3K-NFKB signaling pathway\n", + "N00184\tKSHV MIR1/2 to antigen processing and presentation by MHC class I molecules\n", + "N00185\tKSHV MIR2 to cell surface molecule-endocytosis\n", + "N00186\tIL1-IL1R-p38 signaling pathway\n", + "N00187\tKSHV Kaposin B to p38 signaling pathway\n", + "N00188\tIL1-IL1R-JNK signaling pathway\n", + "N00189\tKSHV K15 to JNK signaling pathway\n", + "N00212\tKSHV vCCL2 to CCR signaling pathway\n", + "N00213\tKSHV Kaposin to alternative pathway of complement cascade\n", + "N00215\tKITLG-KIT-RAS-ERK signaling pathway\n", + "N00216\tHGF-MET-RAS-ERK signaling pathway\n", + "N00217\tFLT3LG-FLT3-RAS-ERK signaling pathway\n", + "N00218\tFLT3LG-FLT3-RAS-PI3K signaling pathway\n", + "N00219\tFLT3LG-FLT3-STAT5 signaling pathway\n", + "N00220\tPTEN-PIP3-AKT signaling pathway\n", + "N00221\tHTLV-1 Tax to spindle assembly checkpoint signaling\n", + "N00222\tHTLV-1 Tax to spindle assembly checkpoint signaling\n", + "N00223\tEBV EBNA1 to 
p53-mediated transcription\n", + "N00224\tEBV EBNALP RBP-Jk-mediated transcription\n", + "N00225\tEBV EBNA2 to RBP-Jk-mediated transcription\n", + "N00226\tEBV EBNA3A/3B/3C to RBP-Jk-mediated transcription\n", + "N00227\tTGFA-EGFR-PLCG-PKC signaling pathway\n", + "N00228\tTGFA-overexpression to PLCG-PKC signaling pathway\n", + "N00229\tTGFA-EGFR-RAS-ERK signaling pathway\n", + "N00230\tTGFA-overexpression to RAS-ERK signaling pathway\n", + "N00231\tTGFA-EGFR-PI3K signaling pathway\n", + "N00232\tTGFA-overexpression to PI3K signaling pathway\n", + "N00233\tIGF-IGF1R-RAS-ERK signaling pathway\n", + "N00234\tIGF2-IGF1R-PI3K signaling pathway\n", + "N00235\tIGF2-overexpression to RAS-ERK signaling pathway\n", + "N00236\tIGF2-overexpression to PI3K signaling pathway\n", + "N00237\tIGF1R-overexpression to RAS-ERK signaling pathway\n", + "N00238\tIGF1R-overexpression to PI3K signaling pathway\n", + "N00239\tTelomerase activity\n", + "N00240\tTERT-overexpression to telomerase activity\n", + "N00241\tTGFBR2-reduced expression to TGF-beta signaling pathway\n", + "N00242\tMutation-inactivated AXIN to Wnt signaling pathway\n", + "N00243\tKEAP1-NRF2 signaling pathway\n", + "N00244\tMutation-inactivated KEAP1 to KEAP1-NRF2 signaling pathway\n", + "N00245\tMutation-activated NRF2 to KEAP1-NRF2 signaling pathway\n", + "N00246\tHGF-overexpression to RAS-ERK signaling pathway\n", + "N00247\tHGF-overexpression to PI3K signaling pathway\n", + "N00248\tMET-overexpression to RAS-ERK signaling pathway\n", + "N00249\tMET-overexpression to PI3K signaling pathway\n", + "N00250\tCDX2-overexpression to transcriptional activation\n", + "N00251\tCDX2-overexpression to transcriptional repression\n", + "N00252\tAmplified ERBB2 to RAS-ERK signaling pathway\n", + "N00253\tAmplified ERBB2 to PI3K signaling pathway\n", + "N00254\tCDKN1B-reduced expression to p27-cell cycle G1/S\n", + "N00255\tAmplified CCNE to cell cycle G1/S\n", + "N00256\tTGFBR1-reduced expression to TGF-beta signaling pathway\n", 
+ "N00257\tLoss of CDH1 to beta-catenin signaling pathway\n", + "N00258\tMutation-inactivated CDH1 to beta-catenin signaling pathway\n", + "N00259\tAmplified MET to RAS-ERK signaling pathway\n", + "N00260\tAmplified MET to PI3K signaling pathway\n", + "N00261\tKSHV vIRF2 to IFN signaling pathway\n", + "N00262\tEBV EBNA3C to intrinsic apoptotic pathway\n", + "N00263\tEBV EBNA3C to p53-mediated transcription\n", + "N00264\tEBV EBNA3C to p27-Cell cycle G1/S\n", + "N00265\tEBV LMP1 to NFKB signaling pathway\n", + "N00266\tEBV LMP2A to PI3K signaling pathway\n", + "N00267\tHBV HBx to PI3K signaling pathway\n", + "N00268\tHBV HBx to RIG-I-like receptor signaling pathway\n", + "N00269\tHCV core to TNF-NFKB signaling pathway\n", + "N00270\tHCV Core to IFN signaling pathway\n", + "N00271\tHCV NS3/4A to RIG-I-like receptor signaling pathway\n", + "N00272\tHCV NS5A to PI3K signaling pathway\n", + "N00273\tHCV NS5A to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00274\tHCV NS5A to RAS-ERK signaling pathway\n", + "N00275\tAmplified CCND1 to cell cycle G1/S\n", + "N00276\tEGF-overexpression to RAS-ERK signaling pathway\n", + "N00277\tEREG-EGFR-RAS-ERK signaling pathway\n", + "N00278\tEREG-overexpression to RAS-ERK signaling pathway\n", + "N00279\tAREG-EGFR-RAS-ERK signaling pathway\n", + "N00280\tAREG-overexpression to RAS-ERK signaling pathway\n", + "N00281\tEGF-overexpression to PI3K signaling pathway\n", + "N00282\tEREG-EGFR-PI3K signaling pathway\n", + "N00283\tEREG-overexpression to PI3K signaling pathway\n", + "N00284\tAREG-EGFR-PI3K signaling pathway\n", + "N00285\tAREG-overexpression to PI3K signaling pathway\n", + "N00286\tNuclear-initiated estrogen signaling pathway\n", + "N00287\tESR1-positive to nuclear-initiated estrogen signaling pathway\n", + "N00288\tPTH-PTH1R-PKA signaling pathway\n", + "N00290\tMutation-inactivated MEN1 to transcription\n", + "N00291\tCaSR-PTH signaling pathway\n", + "N00293\tGCM2-mediated transcription\n", + "N00297\tACTH-cortisol 
signaling pathway\n", + "N00298\tCYP11B1-CYP11B2 fusion to ACTH-cortisol signaling pathway\n", + "N00301\tAngiotensin-aldosterone signaling pathway\n", + "N00302\tMutation-activated CACNA1D/H to angiotensin-aldosterone signaling pathway\n", + "N00303\tMutation-activated KCNJ5 to angiotensin-aldosterone signaling pathway\n", + "N00304\tMutation-inactivated ATP1A1 to angiotensin-aldosterone signaling pathway\n", + "N00305\tMutation-inactivated ATP2B3 to angiotensin-aldosterone signaling pathway\n", + "N00306\tSF-1-mediated transcription\n", + "N00309\tCortisone reduction\n", + "N00311\tNADPH generation\n", + "N00313\tTransport of cortisol\n", + "N00315\tMutation-inactivated AIP to AhR-mediated transcription\n", + "N00316\tMutation-inactivated CDKN1B to p27-cell cycle G1/S\n", + "N00317\tAhR signaling pathway\n", + "N00318\tEGFR-ERK-ACTH signaling pathway\n", + "N00319\tMutation-activated USP8 to EGFR-ERK-ACTH signaling pathway\n", + "N00320\tMutation-activated PRKACA to ACTH-cortisol signaling pathway\n", + "N00321\tMutation-activated GNAS to ACTH-cortisol signaling pathway\n", + "N00322\tMutation-inactivated PRKAR1A to ACTH-cortisol signaling pathway\n", + "N00323\tMutation-inactivated PDE11A/PDE8B to ACTH-cortisol signaling pathway\n", + "N00324\tCRHR-PKA-ACTH signaling pathway\n", + "N00325\tMutation-inactivated RASD1 to CRHR-PKA-ACTH signaling pathway\n", + "N00326\tMutation-activated GNAS to CRHR-PKA-ACTH signaling pathway\n", + "N00327\tMutation-inactivated PRKAR1A to CRHR-PKA-ACTH signaling pathway\n", + "N00332\tVesicular uptake of lipoproteins\n", + "N00336\tPCSK9-mediated LDLR degradation\n", + "N00338\tSteroid hormone biosynthesis, progesterone to cortisol/cortisone\n", + "N00339\tSteroid hormone biosynthesis, progesterone to aldosterone\n", + "N00340\tThe Scribble/Dlg/Lgl polarity module\n", + "N00341\tHPV E6 to the Scribble/Dlg/Lgl polarity module\n", + "N00342\tMAGI-PTEN signaling pathway\n", + "N00343\tHPV E6 to MAGI-PTEN signaling pathway\n", + 
"N00344\tCRB3-Pals1-PATJ complex\n", + "N00345\tHPV E6 to CRB3-Pals1-PATJ complex\n", + "N00346\tHPV E6 to TLR-IRF3 signaling pathway\n", + "N00347\tp300-p21-Cell cycle G1/S\n", + "N00348\tHPV E6 to p300-p21-Cell cycle G1/S\n", + "N00349\tHPV E6 to p300-p21-Cell cycle G1/S\n", + "N00350\tHPV E6 to extrinsic apoptotic pathway\n", + "N00351\tHPV E6 to extrinsic apoptotic pathway\n", + "N00352\tHPV E6 to extrinsic apoptotic pathway\n", + "N00353\tHPV E6 to PTEN-PIP3-AKT signaling pathway\n", + "N00354\tHPV E6 to PTEN-PIP3-AKT signaling pathway\n", + "N00355\tPP2A-AKT signaling pathway\n", + "N00356\tHPV E7 to PP2A-AKT signaling patyway\n", + "N00357\tHPV E6 to MTOR signaling pathway\n", + "N00358\tHPV E6 to p21-cell cycle G1/S\n", + "N00359\tHPV E7 to p27-cell cycle G1/S\n", + "N00360\tHPV E7 to p27-cell cycle G1/S\n", + "N00361\tHPV E7 to cell cycle G1/S\n", + "N00362\tHPV E5 to p21-cell cycle G1/S\n", + "N00363\tAntigen processing and presentation by MHC class I molecules\n", + "N00364\tHPV E5 to antigen processing and presentation by MHC class I molecules\n", + "N00365\tHPV E7 to cell cycle G1/S\n", + "N00366\tHPV E5 to EGFR-PI3K signaling pathway\n", + "N00367\tHPV E5 to EGFR-RAS-ERK signaling pathway\n", + "N00368\tHPV E5 to PDGFR-PI3K signaling pathway\n", + "N00369\tHPV E5 to PDGFR-RAS-ERK signaling pathway\n", + "N00370\tPyruvate generation\n", + "N00371\tHPV E7 to pyruvate generation\n", + "N00372\tHPV E7 to p300-p21-Cell cycle G1/S\n", + "N00373\tHPV E6 to NFX1-mediated transcription\n", + "N00374\tTNF-IRF1 signaling pathway\n", + "N00375\tHPV E7 to TNF-IRF1 signaling pathway\n", + "N00376\tHPV E7 to TBP1-mediated transcription\n", + "N00377\tHPV E6 to IFN signaling pathway\n", + "N00378\tHPV E6 to IFN signaling pathway\n", + "N00379\tHPV E7 to IFN signaling pathway\n", + "N00380\tHPV E6 to Notch signaling pathway\n", + "N00381\tHPV E6 to Notch signaling pathway\n", + "N00382\tHPV E6 to Notch signaling pathway\n", + "N00383\tHPV E6 to intrinsic apoptotic 
pathway\n", + "N00384\tHPV E6 to intrinsic apoptotic pathway\n", + "N00385\tHCMV gB to PDGFR-PI3K signaling pathway\n", + "N00386\tHCMV gB to PDGFR-RAS-ERK signaling pathway\n", + "N00387\tHCMV IE1-72/IE2-86 to PI3K signaling pathway\n", + "N00388\tHCMV UL38 to MTOR signaling pathway\n", + "N00389\tHCMV IE1-72 to transcription\n", + "N00390\tEGF-EGFR-PI3K-NFKB signaling pathway\n", + "N00391\tHCMV gB to EGFR-PI3K-NFKB signaling pathway\n", + "N00392\tHCMV gB to EGFR-RAS-ERK signaling pathway\n", + "N00393\tITGA/B-RhoGAP-RhoA signaling pathway\n", + "N00394\tHCMV gH to ITGA/B-RhoA signaling pathway\n", + "N00395\tcGAS-STING signaling pathway\n", + "N00396\tHCMV UL82 to cGAS-STING signaling pathway\n", + "N00397\tHCMV UL26 to NFKB signaling pathway\n", + "N00398\tHCMV IE2-86 to TNF-NFKB signaling pathway\n", + "N00399\tCCR2-GNB/G-PI3K-NFKB signaling pathway\n", + "N00400\tHCMV US28 to GNB/G-PI3K-NFKB signaling pathway\n", + "N00401\tCXCR4-GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00402\tHCMV US28 to GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00403\tCX3CR1-GNAI-AC-PKA signaling pathway\n", + "N00404\tHCMV US28 to GNAI-AC-PKA signaling pathway\n", + "N00405\tCXCR4-GNA12/13-Rho signaling pathway\n", + "N00406\tHCMV US28 to GNA12/13-Rho signaling pathway\n", + "N00407\tHCMV UL33 to GNAQ-PLCB/G-calcineurin signaling pathway\n", + "N00408\tLPAR-GNB/G-Rho signaling pathway\n", + "N00409\tHCMV UL33 to GNB/G-Rho signaling pathway\n", + "N00410\tDRD1-GNAS-AC-PKA signaling pathway\n", + "N00411\tHCMV UL33 to GNAS-AC-PKA signaling pathway\n", + "N00412\tHCMV UL33 to GNAI-AC-PKA signaling pathway\n", + "N00413\tCXCR4-GNB/G-PLCB-PKC signaling pathway\n", + "N00414\tHCMV US27 to CXCR4-GNB/G-PLCB-PKC signaling pathway\n", + "N00415\tIL10 family to Jak-STAT signaling pathway\n", + "N00416\tHCMV vIL10 to IL10-JAK-STAT signaling pathway\n", + "N00417\tHCMV US6 to antigen processing and presentation by MHC class I molecules\n", + "N00418\tHCMV US2/11 to antigen processing 
and presentation by MHC class I molecules\n", + "N00419\tHCMV US3/10 to antigen processing and presentation by MHC class I molecules\n", + "N00420\tHCMV IE2-86 to p21-cell cycle G1/S\n", + "N00421\tHCMV IE2-86 to p21-cell cycle G1/S\n", + "N00422\tHCMV IE2-86 to cell cycle G1/S\n", + "N00423\tHCMV IE1-72 to cell cycle G1/S\n", + "N00424\tHCMV pp71 to cell cycle G1/S\n", + "N00425\tHCMV UL36 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00426\tHCMV UL37x1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00427\tHCMV vCXCL to CXCR-GNB/G-PI3K-AKT signaling pathway\n", + "N00428\tCCR5-GNB/G-PLCB/G-PKC signaling pathway\n", + "N00429\tHCMV UL22A to CCR5-GNB/G-PLCB/G-PKC signaling pathway\n", + "N00430\tCXCR4-GNAI-PI3K-BAD signaling pathway\n", + "N00431\tHIV gp120 to CXCR4-GNAI-PI3K-BAD signaling pathway\n", + "N00432\tHIV gp120 to CXCR4-GNAQ-PLCB/G-calcineurin\n", + "N00433\tCXCR4-GNB/G-RAC signaling pathway\n", + "N00434\tHIV gp120 to CXCR4-GNB/G-RAC signaling pathway\n", + "N00435\tTLR1/2/4-NFKB signaling pathway\n", + "N00436\tHIV Tat to TLR2/4-NFKB signaling pathway\n", + "N00437\tHIV Vpu to TLR2/4-NFKB signaling pathway\n", + "N00438\tTLR2/4-MAPK signaling pathway\n", + "N00439\tHIV Nef to TLR2/4-MAPK signaling pathway\n", + "N00440\tHIV Vpu/Vif/Vpr to cGAS-STING signaling pathway\n", + "N00441\tHIV gp120 to TNF-NFKB signaling pathway\n", + "N00442\tHIV Nef to TNF-NFKB signaling pathway\n", + "N00443\tHIV Vpr/Nef/Tat to TNF-NFKB signaling pathway\n", + "N00444\tTNF-p38 signaling pathway\n", + "N00445\tHIV Tat/Nef to TNF-p38 signaling pathway\n", + "N00446\tTNF-JNK signaling pathway\n", + "N00447\tHIV Vpr/Tat to TNF-JNK signaling pathway\n", + "N00448\tHIV Tat/Nef to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00449\tHIV Tat/Nef to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00450\tHIV Tat to intrinsic apoptotic pathway\n", + "N00451\tHIV Tat to intrinsic apoptotic 
pathway\n", + "N00452\tHIV Nef to intrinsic apoptotic pathway\n", + "N00453\tHIV Vpr to intrinsic apoptotic pathway\n", + "N00454\tHIV Vpr to intrinsic apoptotic pathway\n", + "N00455\tCDC25-Cell cycle G2/M\n", + "N00456\tHIV Vpr to CDC25-cell cycle G2M\n", + "N00457\tHIV Vpr to cell cycle G2M\n", + "N00458\tHIV Vpr to CDC25-cell cycle G2M\n", + "N00459\tWEE1-Cell cycle G2/M\n", + "N00460\tHIV Vpr to WEE1-cell cycle G2M\n", + "N00461\tHIV Nef to antigen processing and presentation by MHC class I molecules\n", + "N00462\tKSHV vCCL1/2/3 to CCR signaling pathway\n", + "N00465\tDeleted DMD to dystrophin-associated protein complex\n", + "N00466\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00467\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00468\tEBV BPLF1 to TLR2/4-NFKB signaling pathway\n", + "N00469\tRIG-I-IRF7/3 signaling pathway\n", + "N00470\tEBV BGLF4 to RIG-I-like receptor signaling pathway\n", + "N00471\tEBV LMP2A/2B to IFN signaling pathway\n", + "N00472\tEBV LMP1 to IFN signaling pathway\n", + "N00473\tEBV BGLF4 to IFN signaling pathway\n", + "N00474\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00475\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00476\tEBV BHRF1 to intrinsic apoptotic pathway\n", + "N00477\tEBV BHRF1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00478\tEBV BARF1 to intrinsic apoptotic pathway\n", + "N00479\tEBV BNLF2a to antigen processing and presentation by MHC class I molecules\n", + "N00480\tEBV BILF1 to antigen processing and presentation by MHC class I molecules\n", + "N00481\tEBV BZLF1 to p53-mediated transcription\n", + "N00482\tEBV EBNA3C to p27-Cell cycle G1/S\n", + "N00483\tEBV EBNA3C to cell cycle G1/S\n", + "N00484\tEBV EBNA3C to cell cycle G1/S\n", + "N00485\tEBV LMP1 to PI3K signaling pathway\n", + "N00486\tEBV LMP1 to Jak-STAT signaling pathway\n", + "N00487\tBCR-PLCG-Calcineurin signaling pathway\n", + "N00488\tEBV LMP2A to BCR signaling pathway\n", + "N00489\tHTLV-1 p30II to 
c-myc-mediated transcription\n", + "N00490\tHTLV-1 p12 to calcineurin signaling pathway\n", + "N00491\tHTLV-1 p12 to Jak-STAT signaling pathway\n", + "N00492\tHTLV-1 p12 to antigen processing and presentation by MHC class I molecules\n", + "N00493\tSpindle assembly checkpoint signaling\n", + "N00494\tHTLV-1 Tax to p16-cell cycle G1/S\n", + "N00495\tHTLV-1 Tax to p15-cell cycle G1/S\n", + "N00497\tHTLV-1 Tax to p21-cell cycle G1/S\n", + "N00498\tHTLV-1 Tax to p21-cell cycle G1/S\n", + "N00499\tATR-p21-Cell cycle G2/M\n", + "N00500\tHTLV-1 Tax to p21-cell cycle G2/M\n", + "N00501\tHTLV-1 Tax to EGFR-PI3K-NFKB signaling pathway\n", + "N00502\tHTLV-1 Tax to PTEN-PIP3-AKT signaling pathway\n", + "N00503\tHTLV-1 Tax to TNF-JNK signaling pathway\n", + "N00504\tHTLV-1 Tax to NFKB signaling pathway\n", + "N00505\tCD40-NFKB signaling pathway\n", + "N00506\tHTLV-1 Tax to CD40-NFKB signaling pathway\n", + "N00507\tHTLV-1 Tax to TGF-beta signaling pathway\n", + "N00508\tHTLV-1 Tax to NFY-mediated transcription\n", + "N00509\tHTLV-1 Tax to SRF-mediated transcription\n", + "N00510\tHTLV-1 Tax to CREB-mediated transcription\n", + "N00511\tHTLV-1 Tax to E47-mediated transcription\n", + "N00512\tHTLV-1 Tax to c-myc-mediated transcription\n", + "N00513\tMutation-activated EGFR to RAS-ERK signaling pathway\n", + "N00514\tMutation-activated EGFR to PI3K signaling pathway\n", + "N00515\tOligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00516\tHCV NS3/4A to TLR3-IRF3 signaling pathway\n", + "N00517\tHCV NS3/4A to TLR3-IRF3 signaling pathway\n", + "N00518\tHCV Core to ERK signaling pathway\n", + "N00519\tHCV Core to ERK signaling pathway\n", + "N00520\tHCV NS5A to p21-cell cycle G1/S\n", + "N00521\tHCV Core to p21-cell cycle G1/S\n", + "N00522\tHCV NS3 to p21-cell cycle G1/S\n", + "N00523\tHCV Core to p21-cell cycle G1/S\n", + "N00524\tHCV NS5A to extrinsic apoptotic pathway\n", + "N00525\tHCV NS5A to TNF-NFKB signaling pathway\n", + "N00526\tHCV NS3 to crosstalk between extrinsic 
and intrinsic apoptotic pathways\n", + "N00527\tHCV Core to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00528\tHCV core to extrinsic apoptotic pathway\n", + "N00529\tHCV core to RXRA/PPARA-mediated transcription\n", + "N00530\tHCV core to RXRA/LXRA-mediated transcription\n", + "N00531\tHBV HBx to TGF-beta signaling pathway\n", + "N00532\tHBV HBx to Egr-mediated transcription\n", + "N00533\tHBV HBx to Crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00534\tHBV HBx to Crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00535\tHBV HBx to p53-mediated transcription\n", + "N00536\tMDM2-p21-Cell cycle G1/S\n", + "N00537\tHBV HBx to cell cycle G1/S\n", + "N00538\tCa2+-PYK2-RAS-ERK signaling pathway\n", + "N00539\tHBV HBx to Ca2+-PYK2-RAS-ERK signaling pathway\n", + "N00540\tHBV HBx to RAS-ERK signaling pathway\n", + "N00541\tHBV HBx to RAS-ERK signaling pathway\n", + "N00542\tEGF-EGFR-RAS-JNK signaling pathway\n", + "N00543\tHBV HBx to JNK signaling pathway\n", + "N00544\tHBV HBx to CREB-mediated transcription\n", + "N00545\tHBV HBx to ERK signaling pathway\n", + "N00546\tCXCL12-CXCR4-PKC-ERK signaling pathaway\n", + "N00547\tHBV LHBs to PKC-ERK signaling pathway\n", + "N00548\tHBV HBx to Jak-STAT signaling pathway\n", + "N00549\tHBV HBeAg to TLR2/4-NFKB signaling pathway\n", + "N00550\tHBV HBeAg to TLR2/4-NFKB signaling pathway\n", + "N00551\tHBV HBs to TLR2/4-MAPK signaling pathway\n", + "N00552\tHBV pol to TLR3-IRF3 signaling pathway\n", + "N00553\tTLR4-IRF3/7 signaling pathway\n", + "N00554\tHBV HBe to TLR4-IRF3/7 signaling pathway\n", + "N00555\tHBV HBe to TLR4-IRF3/7 signaling pathway\n", + "N00556\tHBV HBe to TLR2/4-NFKB signaling pathway\n", + "N00557\tHBV HBe to TLR2/4-NFKB signaling pathway\n", + "N00558\tHBV pol to IFN signaling pathway\n", + "N00559\tLIGHT-HVEM-NFKB signaling pathway\n", + "N00560\tHSV gD to HVEM-NFKB signaling pathway\n", + "N00561\tHSV ICP0 to TLR2/4-NFKB signaling pathway\n", + 
"N00562\tHSV US3 to TLR2/4-NFKB signaling pathway\n", + "N00563\tTLR3-NFKB signaling pathway\n", + "N00564\tHSV US3 to TLR3-NFKB signaling pathway\n", + "N00565\tHSV US11 to RIG-I-like receptor signaling pathway\n", + "N00566\tHSV UL36USP to RIG-I-like receptor signaling pathway\n", + "N00567\tHSV ICP34.5 to TBK1 signaling pathway\n", + "N00568\tHSV US3 to IRF3 signaling pathway\n", + "N00569\tHSV UL41 to cGAS-STING signaling pathway\n", + "N00570\tHSV ICP0 to cGAS-STING signaling pathway\n", + "N00571\tPKR-eIF2alpha signaling pathway\n", + "N00572\tHSV ICP34.5 to PKR-eIF2alpha signaling pathway\n", + "N00573\tHSV US11 to PKR-eIF2alpha signaling pathway\n", + "N00574\tHSV US11 to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00575\tHSV ICP27 to IFN signaling pathway\n", + "N00576\tHSV UL41/UL13 to IFN signaling pathway\n", + "N00577\tHSV UL41 to IFN signaling pathway\n", + "N00578\tHSV UL41 to IFN signaling pathway\n", + "N00579\tHSV ICP6 to extrinsic apoptotic pathway\n", + "N00580\tHSV ICP0 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00581\tHSV ICP47 to antigen processing and presentation by MHC class I molecules\n", + "N00582\tIGF-IGF1R-PI3K signaling pathway\n", + "N00583\tHSV VP11/12 to PI3K signaling pathway\n", + "N00584\tHSV US3 to MTOR signaling pathway\n", + "N00585\tHSV US3 to intrinsic apoptotic pathway\n", + "N00586\tNuclear export of mRNA\n", + "N00587\tHSV ICP27 to Nuclear export of mRNA\n", + "N00588\tHSV VP16 to Oct-1-mediated transcription\n", + "N00589\tHSV gC to alternative pathway of complement cascade\n", + "N00590\tAntigen processing and presentation by MHC class II molecules\n", + "N00591\tHSV gB to antigen processing and presentation by MHC class II molecules\n", + "N00592\tHSV ICP0 to p53-mediated transcription\n", + "N00593\tUrea cycle\n", + "N00599\tObligate allosteric activation of CPS1 by NAG\n", + "N00600\tNAGS deficiency in urea cycle\n", + "N00601\tHeme biosynthesis\n", + "N00610\tDermatan 
sulfate degradation\n", + "N00615\tHeparan sulfate degradation\n", + "N00623\tKeratan sulfate degradation\n", + "N00627\tMannose type O-glycan biosynthesis, POMT to POMK\n", + "N00640\tHydrolysis of lactosylceramide\n", + "N00642\tSaposin stimulation of GBA and GALC\n", + "N00643\tLoss of saposin stimulation\n", + "N00644\tHydrolysis of galabiosylceramide\n", + "N00647\tHydrolysis of galactosylceramide sulfate\n", + "N00649\tHydrolysis of sphingomyelin\n", + "N00653\tN-Glycan precursor biosynthesis, ALG7 to ALG11\n", + "N00667\tN-Glycan precursor biosynthesis, Glc-6P to Man-P-Dol\n", + "N00673\tN-Glycan precursor biosynthesis, Glc-6P to UDP-Glu\n", + "N00675\tN-Glycan precursor biosynthesis, farnesy-PP to P-Dol\n", + "N00679\tGlucosylceramide synthesis in GBA deficiency\n", + "N00680\tN-Glycan precursor biosynthesis, ALG3 to ALG9\n", + "N00681\tN-Glycan precursor biosynthesis, ALG6 to OST\n", + "N00682\tN-Glycan precursor biosynthesis, P-Dol to Glc-P-Dol\n", + "N00683\tCD80/CD86-CD28-PI3K signaling pathway\n", + "N00684\tMV F/H to CD28-PI3K signaling pathway\n", + "N00685\tMV V to RIG-I-IRF7/3 signaling pathway\n", + "N00686\tMV N to RIG-I-IRF7/3 signaling pathway\n", + "N00687\tMV V/C to RIG-I-IRF7/3 signaling pathway\n", + "N00688\tRIG-I-NFKB signaling pathway\n", + "N00689\tMV V/P/C to RIG-I-NFKB signaling pathway\n", + "N00690\tTLR7/9-IRF7 signaling pathway\n", + "N00691\tMV V to TLR7/9-IRF7 signaling pathway\n", + "N00692\tMV P to TLR2/4-NFKB signaling pathway\n", + "N00693\tMV V/P to IFN signaling pathway\n", + "N00694\tMV V/P/C to IFN signaling pathway\n", + "N00695\tMV V to p73-mediated transcription\n", + "N00696\tMV C to PKR-eIF2alpha signaling pathway\n", + "N00697\tHV P to p53-mediated transcription\n", + "N00698\tMannose type O-glycan biosynthesis, Rib-ol-5P to CDP-Rib-ol\n", + "N00699\tMannose type O-glycan biosynthesis, FKTN to LARGE\n", + "N00700\tTyrosine biosynthesis\n", + "N00702\tTetrahydrobiopterin biosynthesis, GTP to BH4\n", + 
"N00705\tTetrahydrobiopterin biosynthesis, BH4OH to BH4\n", + "N00708\tTyrosine degradation\n", + "N00713\tGlycogen biosynthesis\n", + "N00718\tGlycogen degradation\n", + "N00720\tGlycogen degradation (amylase)\n", + "N00724\tIAV NS1 to oligoadenylate synthetase (OAS)/RNase L pathway\n", + "N00725\tIAV NS1 to PKR-eIF2alpha signaling pathway\n", + "N00726\tIAV NP to PKR-eIF2alpha signaling pathway\n", + "N00727\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00728\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00729\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00730\tIAV NS1 to RIG-I-like receptor signaling pathway\n", + "N00731\tGlycolysis\n", + "N00732\tIAV PB1-F2/PB2 to RIG-I-like receptor signaling pathway\n", + "N00734\tIAV PB1-F2/PB2 to RIG-I-like receptor signaling pathway\n", + "N00736\tIAV NS1 to PI3K signaling pathway\n", + "N00738\tIAV NS1 to IFN signaling pathway\n", + "N00741\tIAV M2 to cell cycle G1/S\n", + "N00742\tNLRP3 inflammasome signaling pathway\n", + "N00743\tIAV NS1 to NLRP3 inflammasome signaling pathway\n", + "N00744\tIAV HA to ERK signaling pathway\n", + "N00745\tIAV PB1-F2 to intrinsic apoptotic pathway\n", + "N00746\tIAV NS1 to nuclear export of mRNA\n", + "N00748\tGPI-anchor biosynthesis\n", + "N00759\tSteroid hormone biosynthesis, cholesterol to pregnenolone/progesterone\n", + "N00765\tbeta-Oxidation, acyl-CoA synthesis\n", + "N00776\tbeta-Oxidation, peroxisome, VLCFA\n", + "N00779\tbeta-Oxidation, peroxisome, bile acid\n", + "N00782\tTSH-TG signaling pathway\n", + "N00786\tTransport of iodide\n", + "N00789\tMutation-inactivated TPO to iodide organification/coupling reactions\n", + "N00791\tDeiodination of MIT and DIT\n", + "N00793\tTSH-DUOX2-TG signaling pathway\n", + "N00795\tDUOX2-generated H2O2 production\n", + "N00798\tThyroid hormone signaling pathway\n", + "N00803\tIodide organification/coupling reactions\n", + "N00804\tbeta-Oxidation\n", + "N00805\tBile acid biosynthesis\n", + "N00812\tTransport 
of carnitine\n", + "N00814\tTransport of L-palmitoylcarnitine\n", + "N00816\tTransport of glucose 6-phosphate\n", + "N00818\tTransport of glucose\n", + "N00820\tN-Glycan biosynthesis\n", + "N00824\tTransport of GDP-fucose\n", + "N00826\tTransport of UDP-galactose\n", + "N00828\tTransport of CMP-N-acetylneuraminate\n", + "N00830\tTransport of Man5GlcNAc2-PP-dolichol\n", + "N00832\tBranched-chain amino acids degradation 1\n", + "N00842\tPropanoyl-CoA metabolism\n", + "N00847\tGalactose degradation\n", + "N00851\tLeucine degradation\n", + "N00852\tValine degradation\n", + "N00856\tIsoleucine degradation\n", + "N00859\tYersinia YopP/J to TLR2/4-NFKB signaling pathway\n", + "N00862\tYersinia YopP/J to TLR2/4-MAPK signaling pathway\n", + "N00863\tYersinia YopM to NLRP3 Inflammasome signaling pathway\n", + "N00864\tYersinia YopK to NLRP3 Inflammasome signaling pathway\n", + "N00865\tYersinia YopK to NLRC4 Inflammasome signaling pathway\n", + "N00866\tYersinia YopM to Pyrin Inflammasome signaling pathway\n", + "N00867\tNLRC4 inflammasome signaling pathway\n", + "N00868\tPyrin inflammasome signaling pathway\n", + "N00869\tKISS1-KISS1R-PLCB-PKC signaling pathway\n", + "N00873\tGnRH-GnRHR-PLCB-PKC signaling pathway\n", + "N00879\tPROK-PRKR-Gi-ERK signaling pathway\n", + "N00882\tTAC3-TACR3-PLC-PKC signaling pathway\n", + "N00885\tLHCGR-GNAS-PKA signaling pathway\n", + "N00888\tHypoxanthine oxidation\n", + "N00890\tMolybdenum cofactor biosynthesis\n", + "N00899\t5-Oxoproline metabolism\n", + "N00904\tGlutathione reduction\n", + "N00905\tNADP+ reduction\n", + "N00907\tGH-Jak-STAT signaling pathway\n", + "N00910\tGHRHR-PKA-GH signaling pathway\n", + "N00915\tAVP-V2R-PKA signaling pathway\n", + "N00918\tTRH-TRHR-PLCB-PKC signaling pathway\n", + "N00920\tPRL-JAK-STAT signaling pathway\n", + "N00922\tFSHR-GNAS-PKA signaling pathway\n", + "N00924\tGlucocorticoid receptor signaling pathway\n", + "N00926\tEscherichia Tir to TLR2/4-MAPK signaling pathway\n", + 
"N00927\tEscherichia/Shigella NleE/OspZ to TNF-NFKB signaling pathway\n", + "N00928\tEscherichia NleB to TNF-NFKB signaling pathway\n", + "N00929\tEscherichia NleC to TNF-NFKB signaling pathway\n", + "N00930\tEscherichia NleD to TNF-JNK signaling pathway\n", + "N00931\tEscherichia NleD to TNF-p38 signaling pathway\n", + "N00932\tEscherichia NleH1 to TNF-NFKB signaling pathway\n", + "N00933\tEscherichia NleA to NLRP3 inflammasome signaling pathway\n", + "N00934\tNon-canonical inflammasome signaling pathway\n", + "N00935\tEscherichia NleF to non-canonical inflammasome signaling pathway\n", + "N00936\tEscherichia NleB1 to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00937\tEscherichia NleF to extrinsic apoptotic pathway\n", + "N00938\tEscherichia NleH to intrinsic apoptotic pathway\n", + "N00939\tEscherichia EspF to intrinsic apoptotic pathway\n", + "N00940\tNOD-NFKB signaling pathway\n", + "N00941\tShigella IpaH9.8 to NOD-NFKB signaling pathway\n", + "N00942\tShigella OspG to TNF-NFKB signaling pathway\n", + "N00943\tShigella IpaH4.5 to TNF-NFKB signaling pathway\n", + "N00944\tShigella OspI to TNF-NFKB signaling pathway\n", + "N00945\tShigella IpaH1.4/2.5 to TNF-NFKB signaling pathway\n", + "N00946\tShigella IpaJ to cGAS-STING signaling pathway\n", + "N00947\tShigella Ipa4.5 to cGAS-STING signaling pathway\n", + "N00948\tShigella IpaH7.8 to NLRP3 Inflammasome signaling pathway\n", + "N00949\tShigella IpaB to NLRC4 Inflammasome signaling pathway\n", + "N00950\tShigella FimA to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N00951\tITGA/B-RHOG-RAC signaling pathway\n", + "N00952\tShigella IpgB1 to ITGA/B-RHOG-RAC signaling pathway\n", + "N00953\tmGluR1-TRPC3 signaling pathway\n", + "N00954\tMutation-activated GRM1 to mGluR1-TRPC3 signaling pathway\n", + "N00955\tMutation-inactivated PRKCG to mGluR1-TRPC3 signaling pathway\n", + "N00956\tMutation-activated PRKCG to mGluR1-TRPC3 signaling pathway\n", + 
"N00957\tMutation-caused abberant ATXN2/3 to mGluR5-Ca2+ -apoptotic pathway\n", + "N00958\tMutation-activated ITPR1 to mGluR1-TRPC3 signaling pathway\n", + "N00959\tITPR1-reduced expression to mGluR1-TRPC3 signaling pathway\n", + "N00960\tMutation-caused aberrant SPTBN2 to mGluR1-TRPC3 signaling pathway\n", + "N00961\tMutation-activated TRPC3 to mGluR1-TRPC3 signaling pathway\n", + "N00962\tMutation-inactivated ATXN3 to autophagy-vesicle nucleation\n", + "N00963\tRELN-VLDLR-PI3K signaling pathway\n", + "N00964\tDAB1-overexpression to RELN-VLDLR-PI3K signaling pathway\n", + "N00965\tRORA-mediated transcription\n", + "N00966\tMutation-caused aberrant ATXN1 to RORA-mediated transcription\n", + "N00967\tVGCC-Ca2+ -apoptotic pathway\n", + "N00968\tMutation-activated CACNA1A to VGCC-Ca2+ -apoptotic pathway\n", + "N00969\tMutation-inactivated CACNA1A to VGCC-Ca2- -apoptotic pathway\n", + "N00970\tTransport of calcium\n", + "N00971\tMutation-caused aberrant PDYN to transport of calcium\n", + "N00972\tTransport of potassium\n", + "N00973\tMutation-inactivated KCNC3 to transport of potassium\n", + "N00974\tTransport of potassium\n", + "N00975\tMutation-inactivated KCND3 to transport of potassium\n", + "N00976\tRetrograde axonal transport\n", + "N00977\tMutation-caused aberrant Htt to retrograde axonal transport\n", + "N00978\tAnterograde axonal transport\n", + "N00979\tMutation-caused aberrant Htt to anterograde axonal transport\n", + "N00980\tMutation-caused aberrant Htt to REST-mediated transcriptional repression\n", + "N00981\tMutation-caused aberrant Htt to CREB-mediated transcription\n", + "N00982\tMutation-caused aberrant Htt to p53-mediated transcription\n", + "N00983\tMutation-caused aberrant Htt to extrinsic apoptotic pathway\n", + "N00984\tmGluR5-Ca2+ -apoptotic pathway\n", + "N00985\tMutation-caused aberrant Htt to mGluR5-Ca2+ -apoptotic pathway\n", + "N00986\tMutation-caused aberrant Htt to VGCC-Ca2+ -apoptotic pathway\n", + "N00987\tMutation-caused aberrant Htt 
to transport of calcium\n", + "N00988\tElectron transfer in Complex II\n", + "N00989\tMutation-caused aberrant Htt to electron transfer in Complex II\n", + "N00990\tElectron transfer in Complex III\n", + "N00991\tMutation-caused aberrant Htt to electron transfer in Complex III\n", + "N00992\tMutation-caused aberrant Htt to TNF-JNK signaling pathway\n", + "N00993\tMutation-caused aberrant Htt to autophagy-vesicle nucleation\n", + "N00994\tAGE-RAGE signaling pathway\n", + "N00995\tElectron transfer in Complex I\n", + "N00996\tMutation-caused aberrant Abeta to AGE-RAGE signaling pathway\n", + "N00997\tMutation-caused aberrant Abeta to electron transfer in Complex I\n", + "N00998\tElectron transfer in Complex IV\n", + "N00999\tMutation-caused aberrant Abeta to electron transfer in Complex IV\n", + "N01000\tmAChR-Ca2+ -apoptotic pathway\n", + "N01001\tMutation-caused aberrant Abeta to mAchR-Ca2+ -apoptotic pathway\n", + "N01002\tMutation-caused aberrant Abeta to mGluR5-Ca2+ -apoptotic pathway\n", + "N01003\tMutation-caused aberrant Abeta to transport of calcium\n", + "N01004\tMutation-caused aberrant Abeta to VGCC-Ca2+ -apoptotic pathway\n", + "N01005\tMutation-caused aberrant Abeta to crosstalk between extrinsic and intrinsic apoptotic pathways\n", + "N01006\tMutation-caused aberrant Abeta to VGCC-Ca2+ -apoptotic pathway\n", + "N01007\tMutation-caused aberrant PSEN to mGluR5-Ca2+ -apoptotic pathway\n", + "N01008\tMutation-caused aberrant PSEN1 to mGluR5-Ca2+ -apoptotic pathway\n", + "N01009\tPERK-ATF4 signaling pathway\n", + "N01010\tMutation-caused aberrant PSEN1 to PERK-ATF4 signaling pathway\n", + "N01011\tIRE1a-XBP1 signaling pathway\n", + "N01012\tMutation-caused aberrant PSEN1 to IRE1a-XBP1 signaling pathway\n", + "N01013\tIRE1a-JNK signaling pathway\n", + "N01014\tMutation-caused aberrant Abeta to IRE1a-JNK signaling pathway\n", + "N01015\tATF6-mediated transcription\n", + "N01016\tMutation-caused aberrant PSEN1 to ATF6-mediated transcription\n", + 
"N01017\tMutation-caused aberrant PSEN1 to anterograde axonal transport\n", + "N01018\tMutation-caused aberrant Abeta to anterograde axonal transport\n", + "N01019\tParkin-mediated ubiquitination\n", + "N01020\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01021\tParkin-mediated ubiquitination\n", + "N01022\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01023\tParkin-mediated ubiquitination\n", + "N01024\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01025\tParkin-mediated ubiquitination\n", + "N01026\tMutation-inactivated PRKN to Parkin-mediated ubiquitination\n", + "N01027\tUCHL1-mediated hydrolysis\n", + "N01028\tMutation-inactivated UCHL1 to UCHL1-mediated hydrolysis\n", + "N01029\t26S proteasome-mediated protein degradation\n", + "N01030\tMutation-caused aberrant SNCA to 26S proteasome-mediated protein degradation\n", + "N01031\tMutation-caused aberrant SNCA to VGCC-Ca2+ -apoptotic pathway\n", + "N01032\tMutation-inactivated PRKN to mGluR1 signaling pathway\n", + "N01033\tMutation-caused aberrant SNCA to ATF6-mediated transcription\n", + "N01034\tMutation-caused aberrant SNCA to IRE1a-XBP1 signaling pathway\n", + "N01035\tMutation-caused aberrant SNCA to PERK-ATF4 signaling pathway\n", + "N01037\tMutation-caused aberrant SNCA to L-DOPA generation\n", + "N01039\tMutation-inactivated PRKN to DOPAL generation\n", + "N01040\tTransport of dopamine to synaptic vesicle\n", + "N01041\tMutation-caused aberrant SNCA to transport of dopamine\n", + "N01042\tMutation-caused aberrant SNCA to electron transfer in Complex I\n", + "N01043\tMutation-inactivated PINK1 to electron transfer in Complex I\n", + "N01044\tMPP+ to electron transfer in Complex I\n", + "N01045\tRotenone to electron transfer in Complex I\n", + "N01046\tManeb to electron transfer in Complex III\n", + "N01047\tMutation-activated LRRK2 to intrinsic apoptotic pathway\n", + "N01048\tMutation-inactivated PINK1 to intrinsic apoptotic pathway\n", + 
"N01049\tMutation-inactivated PRKN to intrinsic apoptotic pathway\n", + "N01050\tMutation-inactivated PINK1 to intrinsic apoptotic pathway\n", + "N01051\tMutation-inactivated DJ1 to intrinsic apoptotic pathway\n", + "N01052\tPINK1-Parkin-mediated MFN2 degradation\n", + "N01053\tMutation-inactivated PINK1 to PINK1-Parkin-mediated MFN2 degradation\n", + "N01054\tMutation-inactivated PRKN to PINK1-Parkin-mediated MFN2 degradation\n", + "N01055\tMutation-caused aberrant SNCA to anterograde axonal transport\n", + "N01056\tFAS-JNK signaling pathway\n", + "N01057\tMutation-inactivated DJ1 to FAS-JNK signaling patwhay\n", + "N01058\tMutation-inactivated DJ1 to to p53-mediated transcription\n", + "N01059\tMutation-inactivated DJ1 to KEAP1-NRF2 signaling pathway\n", + "N01060\tMutation-caused aberrant Abeta to 26S proteasome-mediated protein degradation\n", + "N01061\tMutation-caused aberrant Htt to 26S proteasome-mediated protein degradation\n", + "N01062\tMutation-activated MET to RAS-ERK signaling pathway\n", + "N01063\tMutation-activated MET to PI3K signaling pathway\n", + "N01064\tMutation-activated RET to RAS-ERK signaling pathway\n", + "N01065\tMutation-activated RET to PI3K signaling pathway\n", + "N01066\tARNO-ARF-ACTB_G signaling pathway\n", + "N01067\tShigella IpgD to ARNO-ARF-ACTB_G signaling pathway\n", + "N01068\tITGA/B-FAK-RAC signaling pathway\n", + "N01069\tShigella IpgB1 to ITGA/B-FAK-RAC signaling pathway\n", + "N01070\tITGA/B-FAK-CDC42 signaling pathway\n", + "N01071\tShigella IpgB1 to ITGA/B-FAK-CDC42 signaling pathway\n", + "N01072\tITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01073\tShigella IpgB2 to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01074\tShigella IpaA to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01075\tShigella IcsB to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01076\tShigella IcsB to ITGA/B-FAK-CDC42 signaling pathway\n", + "N01077\tShigella IcsB to ITGA/B-FAK-RAC signaling pathway\n", + "N01078\tEGF-EGFR-Actin signaling pathway\n", 
+ "N01079\tShigella IpaC to Actin signaling pathway\n", + "N01080\tITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01081\tShigella IpaB/C/D to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01082\tShigella IpaA to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01083\tShigella OspE to ITGA/B-TALIN/VINCULIN signaling pathway\n", + "N01084\tEscherichia EspG to ARNO-ARF-ACTB/G signaling pathway\n", + "N01085\tEscherichia EspG to ARNO-ARF-ACTB/G signaling pathway\n", + "N01086\tEscherichia EspT to RAC signaling pathway\n", + "N01087\tEscherichia EspW to RAC signaling pathway\n", + "N01088\tEscherichia EspH to LPA-GNA12/13-RhoA signaling pathway\n", + "N01089\tEscherichia EspM to LPA-GNA12/13-Rho signaling pathway\n", + "N01090\tIGG-FCGR-RAC signaling pathway\n", + "N01091\tEscherichia EspJ to IGG-FCGR-RAC signaling pathway\n", + "N01092\tEscherichia Eae/Tir to Actin signaling pathway\n", + "N01093\tEscherichia EspJ/Tir to Actin signaling pathway\n", + "N01094\tEscherichia Eae/Tir/TccP to Actin signaling pathway\n", + "N01095\tEscherichia Map to LPA-GNA12/13-RhoA signaling pathway\n", + "N01096\tEscherichia Map to CDC42 signaling pathway\n", + "N01097\tLPA-GNA12/13-RhoA signaling pathway\n", + "N01098\tYersinia YopT to ITGA/B-RhoGEF-RhoA signaling pathway\n", + "N01099\tYersinia YopE to RhoA signaling pathway\n", + "N01100\tYersinia YopE to ITGA/B-RHOG-RAC signaling pathway\n", + "N01101\tYersinia YopT to ITGA/B-RHOG-RAC signaling pathway\n", + "N01102\tYersinia YopE to ITGA/B-RHOG-RAC signaling pathway\n", + "N01103\tYersinia YpkA to IGG-FCGR-RAC signaling pathway\n", + "N01104\tLPA-GNAQ/11-RhoA signaling pathway\n", + "N01105\tYersinia YpkA to LPA-GNAQ-RhoA signaling pathway\n", + "N01106\tTCR-PLCG-ITPR signaling pathway\n", + "N01107\tYersinia YopH to TCR-NFAT signaling pathway\n", + "N01108\tYersinia YopH to TCR-NFAT signaling pathway\n", + "N01109\tYersinia YopH to ITGA/B-FAK-RAC signaling pathway\n", + "N01110\tYersinia YopH to ITGA/B-FAK-RAC signaling 
pathway\n", + "N01111\tYersinia YopH to ITGA/B-FAK-RAC signaling pathway\n", + "N01112\tSalmonella SopE/E2 to NOD-NFKB signaling pathway\n", + "N01113\tSalmonella SseK1 to TNF-NFKB signaling pathway\n", + "N01114\tSalmonella SseK3 to TNF-NFKB signaling pathway\n", + "N01116\tSalmonella SseL to TNF-NFKB signaling pathway\n", + "N01117\tSalmonella GogB to TNF-NFKB signaling pathway\n", + "N01118\tSalmonella SpvD to TNF-NFKB signaling pathway\n", + "N01119\tRAC/CDC42-PAK-ERK signaling pathway\n", + "N01120\tSalmonella SptP to RAC/CDC42-PAK-ERK signaling pathway\n", + "N01121\tSalmonella SpvC to ERK signaling pathway\n", + "N01122\tSalmonella PipA/GogA/GtgA to TNF-NFKB signaling pathway\n", + "N01123\tSalmonella AvrA to TNF-NFKB signaling pathway\n", + "N01124\tSalmonella AvrA to beta-catenin signaling pathway\n", + "N01125\tSalmonella AvrA to TNF-JNK signaling pathway\n", + "N01126\tSalmonella SipB to Inflammasome signaling pathway\n", + "N01127\tSalmonella SopE to Inflammasome signaling pathway\n", + "N01128\tSalmonella SopE to RAC signaling pathway\n", + "N01129\tSalmonella SopB to ARNO-ARF-ACTB/G signaling pathway\n", + "N01130\tSalmonella SopB to RhoA signaling pathway\n", + "N01131\tSalmonella SopE/E2 to RhoA signaling pathway\n", + "N01132\tSalmonella SopE/E2 to RhoG signaling pathway\n", + "N01133\tSalmonella SopB to RhoG signaling pathway\n", + "N01134\tSalmonella SopB to CDC42 signaling pathway\n", + "N01135\tMutation-caused aberrant SOD1 to intrinsic apoptotic pathway\n", + "N01136\tMutation-caused aberrant TDP43 to electron transfer in Complex I\n", + "N01137\tPINK-Parkin-mediated autophagosome formation\n", + "N01138\tMutation-inactivated OPTN to PINK-Parkin-mediated autophagosome formation\n", + "N01139\tMutation-inactivated p62 to PINK-Parkin-mediated autophagosome formation\n", + "N01140\tTBK1-mediated autophagosome formation\n", + "N01141\tMutation-inactivated TBK1 to TBK1-mediated autophagosome formation\n", + "N01142\tC9orf72-mediated autophagy 
initiation\n", + "N01143\tMutation-inactivated C9orf72 to C9orf72-mediated autophagy initiation\n", + "N01144\tMutation-caused aberrant SOD1 to 26S proteasome-mediated protein degradation\n", + "N01145\tMutation-inactivated VCP to 26S proteasome-mediated protein degradation\n", + "N01146\tMutation-inactivated UBQLN2 to 26S proteasome-mediated protein degradation\n", + "N01147\tMutation-caused aberrant SOD1 to ATF6-mediated transcription\n", + "N01148\tMutation-caused aberrant SOD1 to IRE1a-XBP1 signaling pathway\n", + "N01149\tMutation-caused aberrant SOD1 to PERK-ATF4 signaling pathway\n", + "N01150\tMutation-inactivated VAPB to ATF6-mediated transcription\n", + "N01151\tMutation-inactivated SIGMAR1 to Ca2+ -apoptotic pathway\n", + "N01152\tNuclear export of mRNA\n", + "N01153\tMutation-caused aberrant GLE1 to nuclear export of mRNA\n", + "N01154\tTDP-43-regulated splicing\n", + "N01155\tMutation-caused aberrant TDP43 to TDP-43-regulated splicing\n", + "N01156\tFUS-regulated splicing\n", + "N01157\tMutation-caused aberrant FUS to FUS-regulated splicing\n", + "N01158\tMutation-caused aberrant DCTN1 to retrograde axonal transport\n", + "N01159\tMutation-caused aberrant TUBA4A to retrograde axonal transport\n", + "N01160\tMutation-caused aberrant SOD1 to retrograde axonal transport\n", + "N01161\tActin polymerization\n", + "N01162\tMutation-caused aberrant PFN1 to actin polymerization\n", + "N01163\tNRG-ERBB4-PI3K signaling pathway\n", + "N01164\tMutation-inactivated ERBB4 to NRG-ERBB4-PI3K signaling pathway\n", + "N01165\tPDL/PD1-SHP-PI3K signaling pathway\n", + "N01197\tScrapie conformation PrPSc to 26S proteasome-mediated protein degradation\n", + "N01198\tScrapie conformation PrPSc to PERK-ATF4 signaling pathway\n", + "N01199\tScrapie conformation PrPSc to mGluR5-Ca2+ -apoptotic pathway\n", + "N01200\tScrapie conformation PrPSc to transport of calcium\n", + "N01201\tScrapie conformation PrPSc to VGCC-Ca2+ -apoptotic pathway\n", + "N01202\tOligomeric conformation 
PrPc to anterograde axonal transport\n", + "N01203\tScrapie conformation PrPSc to Notch singling pathway\n", + "N01204\tPRNP-PI3K-NOX2 signaling pathway\n", + "N01205\tScrapie conformation PrPSc to PRNP-PI3K-NOX2 signaling pathway\n", + "N01282\tRegulation of CAV1.1\n", + "N01283\tShigella OspF to TLR2/4-MAPK signaling pathway\n", + "N01284\tShigella IcsP to Autophagy-vesicle elongation\n", + "N01285\tMicrotubule-RHOA signaling pathway\n", + "N01286\tEscherichia EspG to Microtubule-RHOA signaling pathway\n", + "N01287\tTight junction-Actin signaling pathway\n", + "N01288\tEscherichia EspF to Tight junction-Actin signaling pathway\n", + "N01289\tCOPII vesicle formation\n", + "N01290\tEscherichia NleA to COPII vesicle formation\n", + "N01291\tTRAPPI-RAB1 signaling pathway\n", + "N01292\tEscherichia EspG to RAB1 signaling pathway\n", + "N01293\tCOPI vesicle formation\n", + "N01294\tEscherichia NleF to COPI vesicle formation\n", + "N01295\tRab7-regulated microtubule minus-end directed transport\n", + "N01296\tSalmonella SopD2 to Rab7-regulated microtubule minus-end directed transport\n", + "N01297\tArl8-regulated microtubule plus-end directed transport\n", + "N01298\tSalmonella SifA to microtubule plus-end directed transport\n", + "N01299\tSalmonella PipB2 to microtubule plus-end directed transport\n", + "N01300\tTethering of late endosomes and lysosomes\n", + "N01301\tSalmonella SifA to Tethering of late endosomes and lysosomes\n", + "N01302\tEarly endosomal fusion\n", + "N01303\tSalmonella SopB to Early endosomal fusion\n", + "N01304\tANXA2-S100A10-regulated actin cytoskeleton\n", + "N01305\tSalmonella SopB to ANXA2-S100A10-regulated actin cytoskeleton\n", + "N01306\tAngII-AT1R-NOX2 signaling pathway\n", + "N01307\tSARS-CoV-2 S to AngII-AT1R-NOX2 signaling pathway\n", + "N01308\tMDA5-IRF7/3 signaling pathway\n", + "N01309\tSARS-CoV-2 nsp3 to MDA5-IRF7/3 signaling pathway\n", + "N01310\tSARS-CoV-2 nsp13 to RIG-I-IRF7/3 signaling pathway\n", + "N01312\tSARS-CoV-2 S to 
lectin pathway of complement cascade\n", + "N01314\tSARS-CoV-2 S to classical pathway of complement cascade\n", + "N01315\tLectin pathway of coagulation cascade, prothrombin to thrombin\n", + "N01316\tSARS-CoV-2 S/N to lectin pathway of coagulation cascade\n", + "N01317\tTranslation initiation\n", + "N01318\tSARS-CoV-2 nsp1 to translation initiation\n", + "N01319\tSARS-CoV-2 nsp6 and ORF6 to RIG-I-IRF7/3 signaling pathway\n", + "N01320\tSARS-CoV-2 nsp3 to RIG-I-IRF7/3 signaling pathway\n", + "N01321\tSARS-CoV-2 nsp1/6/13, ORF3a/6/7b and M to IFN signaling pathway\n", + "N01322\tSARS-CoV-2 nsp6/13 and ORF7a/7b to IFN signaling pathway\n", + "N01336\tCHRNA7-E2F signaling pathway\n", + "N01337\tNNK/NNN to CHRNA7-E2F signaling pathway\n", + "N01338\tACH-CHRN-PI3K signaling pathway\n", + "N01339\tNNK/NNN to PI3K signaling pathway\n", + "N01340\tACH-CHRN-JAK-STAT signaling pathway\n", + "N01341\tNNK/NNN to Jak-STAT signaling pathway\n", + "N01342\tNicotine to Jak-STAT signaling pathway\n", + "N01343\tACH-CHRN-RAS-ERK signaling pathway\n", + "N01344\tNNK/NNN to RAS-ERK signaling pathway\n", + "N01345\tEP/NE-ADRB-cAMP signaling pathway\n", + "N01346\tNicotine/NNK to cAMP signaling pathway\n", + "N01347\tEP/NE-ADRB-PI3K signaling pathway\n", + "N01348\tNicotine/NNK to PI3K signaling pathway\n", + "N01349\tACH-CHRN-PI3K signaling pathway\n", + "N01350\tNNK/NNN to PI3K signaling pathway\n", + "N01351\tE2-ER-RAS-ERK signaling pathway\n", + "N01352\tBPA to RAS-ERK signaling pathway\n", + "N01353\tE2 to RAS-ERK signaling pathway\n", + "N01354\tBPA to RAS-ERK signaling pathway\n", + "N01355\tArsenic to PI3K signaling pathway\n", + "N01356\tMembrane-initiated progesterone signaling pathway\n", + "N01357\tP4/MPA to membrane-initiated progesterone signaling pathway\n", + "N01358\tP4-PR-PI3K signaling pathway\n", + "N01359\tP4/MPA to PR-PI3K signaling pathway\n", + "N01360\tP4-PR-RAS-ERK signaling pathway\n", + "N01361\tP4/MPA to PR-RAS-ERK signaling pathway\n", + 
"N01362\tNuclear-initiated progesterone signaling pathway\n", + "N01363\tP4/MPA to nuclear-initiated progesterone signaling pathway\n", + "N01364\tE2 to nuclear-initiated estrogen signaling pathway\n", + "N01365\tTCDD to Ahr signaling pathway\n", + "N01366\tBaP to Ahr signaling pathway\n", + "N01367\tPCB to Ahr signaling pathway\n", + "N01368\tHCB to Ahr signaling pathway\n", + "N01369\t4-ABP to DNA adducts\n", + "N01370\tPhIP to DNA adducts\n", + "N01371\tPhIP to DNA adducts\n", + "N01372\tIQ to DNA adducts\n", + "N01373\tMeIQx to DNA adducts\n", + "N01374\tBaP to DNA adducts\n", + "N01375\tDMBA to DNA adducts\n", + "N01376\tMelphalan to DNA adducts/cross-links\n", + "N01377\tThiotepa to DNA adducts/cross-links\n", + "N01378\tAFB1 to DNA adducts\n", + "N01379\tNNK to DNA adducts\n", + "N01380\tNNK to DNA adducts\n", + "N01381\tNNK to DNA adducts\n", + "N01382\tNNK to DNA adducts\n", + "N01383\tNDMA to DNA adducts\n", + "N01384\tEO to DNA adducts\n", + "N01385\tVC to DNA adducts\n", + "N01386\tDCE to DNA adducts\n", + "N01387\tSM to DNA adducts/cross-links\n", + "N01388\tSOD/Cat-mediated ROS neutralization\n", + "N01389\tLead to SOD/Cat-mediated ROS neutralization\n", + "N01390\tp,p'-DDT to SOD/Cat-mediated ROS neutralization\n", + "N01391\tLead to SOD/Cat-mediated ROS neutralization\n", + "N01392\tArsenic to electron transfer in complex II\n", + "N01393\tArsenic to electron transfer in complex II\n", + "N01394\tArsenic to electron transfer in complex IV\n", + "N01395\tCadmium to electron transfer in complex III\n", + "N01396\t4-Aminobiphenyl to CYP-mediated metabolism\n", + "N01397\t4-Aminobiphenyl to CYP-mediated metabolism\n", + "N01398\tPentachlorophenol to CYP-mediated metabolism\n", + "N01399\tBenzene to CYP-mediated metabolism\n", + "N01400\tBenzene to CYP-mediated metabolism\n", + "N01401\tBenzo[a]pyrenre to CYP-mediated metabolism\n", + "N01402\tManganese to electron transfer in Complex II\n", + "N01403\tZn to anterograde axonal transport\n", + 
"N01404\t17beta-estradiol to CYP-mediated metabolism\n", + "N01405\t17beta-estradiol to CYP-mediated metabolism\n", + "N01406\tEthanol to CYP-mediated metabolism\n", + "N01407\tMetals to JNK signaling pathway\n", + "N01408\tMetals to RAS-ERK signaling pathway\n", + "N01409\tMetals to PI3K signaling pathway\n", + "N01410\tMetals to NFKB signaling pathway\n", + "N01411\tMetals to NFKB signaling pathway\n", + "N01412\tMetals to HTF-1 signaling pathway\n", + "N01413\tMetals to KEAP1-NRF2 signalig pathway\n", + "N01414\tIron to anterograde axonal transport\n", + "N01415\tNEP-mediated Abeta degradation\n", + "N01416\tMercury to NEP-mediated Abeta degradation\n", + "N01417\tParaquat to FAS-JNK signaling pathway\n", + "N01418\tPurine salvage pathway, adenine to AMP\n", + "N01419\tAPRT deficiency in purine salvage pathway\n", + "N01420\tAPRT deficiency in adenine metabolism\n", + "N01421\tPurine salvage pathway, hypoxanthine/guanine to IMP/GMP\n", + "N01422\tHPRT1 deficiency in purine salvage pathway\n", + "N01423\tHPRT1 deficiency in hypoxanthine metabolism\n", + "N01424\tHPRT1 deficiency in guanine metabolism\n", + "N01425\tGlobal genome NER\n", + "N01426\tBMP9/10 signaling pathway\n", + "N01427\tWNT5A-ROR signaling pathway\n", + "N01428\tBMP signaling pathway, BMP antagonist\n", + "N01429\tCytosolic Ca2+ removal, PMCA\n", + "N01430\tTranscription-coupled NER\n", + "N01431\tCore NER reaction\n", + "N01432\tMismatch repair\n", + "N01433\tBase excision and strand cleavage by monofunctional glycosylase\n", + "N01434\tBase excision and strand cleavage by bifunctional glycosylase\n", + "N01435\tBase excision and strand cleavage by NEIL glycosylase\n", + "N01436\tLong patch BER\n", + "N01437\tShort patch BER\n", + "N01438\tMitochondrial BER\n", + "N01439\tDouble-strand break signaling\n", + "N01440\tWnt signaling modulation, LGR/RSPO\n", + "N01441\tWnt signaling modulation, SOST/LRP4\n", + "N01442\tWnt signaling modulation, Wnt inhibitor\n", + "N01443\tWnt signaling modulation, 
Wnt acylation\n", + "N01444\tNXN mutation to WNT5A-ROR signaling pathway\n", + "N01445\tNon-homologous end-joining\n", + "N01446\tDNA end resection and RPA loading\n", + "N01447\tDouble Holliday junction dissolution\n", + "N01448\tDouble Holliday junction resolution\n", + "N01449\tSynthesis-dependent strand annealing\n", + "N01450\tBreak induced replication\n", + "N01451\tATR signaling\n", + "N01452\tHomologous recombination\n", + "N01453\tBMP signaling pathway\n", + "N01454\tAMH signaling pathway\n", + "N01455\tBMP15 signaling pathway\n", + "N01456\tActivin signaling pathway\n", + "N01457\tMyostatin signaling pathway\n", + "N01458\tBMP-HAMP signaling pathway\n", + "N01459\tNodal signaling pathway\n", + "N01460\tPlasmin mediated activation of latent TGF-beta\n", + "N01461\tBMP-HAMP signaling pathway, auxiliary factor\n", + "N01462\tBMP9/10 signaling pathway, BMP9/10 coreceptor\n", + "N01464\tFanconi anemia pathway\n", + "N01465\tLesion bypass by TLS and DSB formation\n", + "N01466\tHomologous recombination in ICLR\n", + "N01467\tV(D)J recombination\n", + "N01468\tDNA replication licensing\n", + "N01469\tCdt1 downregulation\n", + "N01470\tPre-IC formation\n", + "N01471\tOrigin unwinding and elongation\n", + "N01472\tOkazaki fragment maturation\n", + "N01473\tDNA replication termination\n", + "N01474\tTRAIP-dependent replisome disassembly\n", + "N01475\tTelomerase RNA maturation\n", + "N01476\tAssembly and trafficking of telomerase\n", + "N01477\tTelomere elongation\n", + "N01478\tNotch proteolytic activation\n", + "N01479\tNotch ligand ubiquitylation\n", + "N01480\tNotch-HES7 signaling\n", + "N01481\tNotch-MESP2 signaling\n", + "N01482\tCohesin loading\n", + "N01483\tCohesin acetylation\n", + "N01484\tEstablishment of cohesion\n", + "N01485\tCohesin dissociation in prophase\n", + "N01486\tCohesin dissociation in anaphase\n", + "N01487\tClassical pathway of complement cascade, C4/C2 to C3 convertase formation\n", + "N01489\tClassical/Lectin pathway of complement 
cascade, C5 convertase formation\n", + "N01490\tCommon pathway of complement cascade, MAC formation\n", + "N01491\tLectin pathway of complement cascade, C4/C2 to C3 convertase formation\n", + "N01493\tAlternative pathway of complement cascade, C3 convertase formation\n", + "N01494\tAlternative pathway of complement cascade, C3/5 convertase formation\n", + "N01495\tClassical/Lectin pathway of complement cascade, C4b breakdown\n", + "N01496\tAlternative pathway of complement cascade, C3b breakdown\n", + "N01497\tCondensin loading\n", + "N01498\tInhibition of condensin II association\n", + "N01499\tModifying of condensin II subunits\n", + "N01500\tModifying of condensin I subunits\n", + "N01501\tInactivation of condensin I\n", + "N01502\tLectin pathway of coagulation cascade, fibrinogen to fibrin\n", + "N01503\tExtrinsic pathway of coagulation cascade, F7 activation\n", + "N01504\tRegulation of complement cascade, CFHR\n", + "N01505\tRegulation of complement cascade, MAC inhibition\n", + "N01506\tIntrinsic pathway of coagulation cascade, F12 activation\n", + "N01507\tIntrinsic pathway of coagulation cascade, F11 activation\n", + "N01508\tIntrinsic pathway of coagulation cascade, F9 activation\n", + "N01509\tIntrinsic pathway of coagulation cascade, F8 activation\n", + "N01510\tCommon pathway of coagulation cascade, F10 activation\n", + "N01511\tCommon pathway of coagulation cascade, F5 activation\n", + "N01512\tCommon pathway of coagulation cascade, prothrombin activation\n", + "N01513\tCommon pathway of coagulation cascade, fibrinogen to fibrin\n", + "N01514\tCommon pathway of coagulation cascade, F13 activation\n", + "N01515\tRegulation of coagulation cascade, protein C system\n", + "N01516\tKallikrein-kinin system, prekallikrein activation\n", + "N01517\tKallikrein-kinin system, HMWK to bradykinin\n", + "N01518\tFibrinolytic system\n", + "N01519\tRegulation of coagulation cascade, AT3\n", + "N01520\tRegulation of fibrinolytic system, C1INH\n", + "N01521\tRegulation 
of coagulation cascade, HCF2\n", + "N01522\tRegulation of fibrinolytic system, AAP\n", + "N01523\tRegulation of fibrinolytic system, AAT\n", + "N01524\tRegulation of fibrinolytic system, PAI\n", + "N01525\tOrganization of the inner kinetochore\n", + "N01526\tOrganization of the outer kinetochore\n", + "N01527\tKSHV Kaposin to classical/Lectin pathway of complement cascade, C4b breakdown\n", + "N01528\tKSHV Kaposin to alternative pathway of complement cascade, C3b breakdown\n", + "N01529\tRecruitment and formation of the MCC\n", + "N01530\tDopamine metabolism\n", + "N01531\tCENPE interaction with NDC80 complex\n", + "N01532\tKinetochore targeting of MAD1-MAD2\n", + "N01533\tDisassembly of MCC\n", + "N01534\tDynein recruitment to the kinetochore\n", + "N01535\tKinetochore microtubule attachment\n", + "N01536\tDephosphorylation of kinetochore\n", + "N01537\tHedgehog signaling pathway, HH ligand secretion\n", + "N01538\tHedgehog signaling pathway, PTCH coreceptor\n", + "N01539\tRAD51 -dsDNA destabilization\n", + "N01540\tEstrogen biosynthesis\n", + "N01541\tTestosterone biosynthesis\n", + "N01542\tPKA holoenzyme\n", + "N01543\tTLR7/8/9-IRF5 signaling pathway\n", + "N01544\tMicrotubule nucleation\n", + "N01545\tRegulation of TNF-NFKB signaling pathway, LUBAC-mediated linear ubiquitination\n", + "N01546\tRegulation of TNF-NFKB signaling pathway, OTULIN/TNFAIP3-mediated deubiquitination\n", + "N01547\tKinetochore fiber organization\n", + "N01548\tKinetochore-fiber stabilization\n", + "N01549\tBranching microtubule nucleation\n", + "N01550\tAdrenaline metabolism\n", + "N01551\tSerotonin metabolism\n", + "N01552\tEumelanin biosynthesis\n", + "N01553\tPromotion of microtubule growth\n", + "N01554\tIL2 family to Jak-STAT signaling pathway\n", + "N01555\tHormone-like-cytokine to Jak-STAT signaling pathway\n", + "N01556\tIL6 family to Jak-STAT signaling pathway\n", + "N01557\tIL12/23 to Jak-STAT signaling pathway\n", + "N01558\tType I interferon to Jak-STAT signaling 
pathway\n", + "N01559\tType II interferon to Jak-STAT signaling pathway\n", + "N01560\tRegulation of type I interferon to Jak-STAT signaling pathway, USP18\n", + "N01561\tMicrotubule depolymerization\n", + "N01562\tMicrotubule depolymerization at the minus ends\n", + "N01563\tInhibition of Kif2A\n", + "N01564\tPost-translational modifications of RIG-I and MDA5\n", + "N01565\tAdenosine-to-inosine RNA editing by ADAR\n", + "N01566\tTLR5-NFKB signaling pathway\n", + "N01567\tNLRP1 inflammasome signaling pathway\n", + "N01568\tRegulation of NLRP3 inflammasome signaling pathway, NLRP3 inhibition\n", + "N01569\tNALP12 inflammasome signaling pathway\n", + "N01570\tRegulation of Pyrin inflammasome signaling pathway, PSTPIP1\n", + "N01571\tDNA degradation by extracellular/endolysosomal DNAse\n", + "N01572\tRNASEH2-mediated RNA degradation in RNA-DNA hybrids\n", + "N01573\tSAMHD1-mediated dNTP degradation\n", + "N01574\tGlycosaminoglycan biosynthesis, linkage tetrasaccharide\n", + "N01575\tTSC1/2-mTORC1 signaling pathway\n", + "N01576\tSTRAD/STK11- TSC signaling pathway\n", + "N01577\tGene silencing by methylation of H3K27 and ubiquitination of H2AK119\n", + "N01578\tGATOR1-mTORC1 signaling pathway\n", + "N01579\tCD80/CD86-CTLA4-PP2A signaling pathway\n", + "N01580\tChondroitin sulfate biosynthesis\n", + "N01581\tDermatan sulfate biosynthesis\n", + "N01582\tHeparan sulfate biosynthesis\n", + "N01583\tRegulation of extrinsic apoptotic pathway, XIAP\n", + "N01584\tFLCN-mTORC1 signaling pathway\n", + "N01585\tDeubiquitination of H2AK119\n", + "N01586\tActivation of PRC2.2 by ubiquitination of H2AK119\n", + "N01587\tFe-TF transport\n", + "N01588\tFe3+ Ferritin transport\n", + "N01589\tGlutathione biosynthesis\n", + "N01590\tArachidonate/Adrenic acid metabolism\n", + "N01591\tFe2+ Ferroportin transport\n", + "N01592\tGF-RTK-RAS-ERK signaling pathway\n", + "N01593\tRegulation of GF-RTK-RAS-ERK signaling, PTP\n", + "N01594\tMLK-JNK signaling pathway\n", + "N01595\tRegulation of 
GF-RTK-RAS-ERK signaling pathway, adaptor proteins\n", + "N01596\tRegulation of GF-RTK-RAS-ERK signaling, RAS ubiquitination by CUL3 complex\n", + "N01597\tRegulation of GF-RTK-RAS-ERK signaling, SPRED and NF1\n", + "N01598\tRegulation of GF-RTK-RAS-ERK signaling, MRAS-SHOC2-PP1 holophosphatase\n", + "N01599\tRegulation of GF-RTK-RAS-ERK signaling, ubiquitination of RTK by CBL\n", + "N01600\tRegulation of GF-RTK-RAS-ERK signaling, RasGAP\n", + "N01601\tERK-RSK signaling\n", + "N01602\tERK-MYC signaling pathway\n", + "N01603\tPyruvate oxidation\n", + "N01604\tCitrate cycle, first carbon oxidation\n", + "N01605\tGluconeogenesis\n", + "N01606\tGlycolysis\n", + "N01607\tMethionine degradation\n", + "N01608\tSerine biosynthesis\n", + "N01609\tCitrate cycle, second carbon oxidation 1\n", + "N01610\tDihydrolipoamide dehydrogenase\n", + "N01611\tGlycine cleavage system\n", + "N01612\tCreatine pathway\n", + "N01613\tGlycine cleavage system, Gly to MTHF\n", + "N01614\tActivation of PRC2.2 by ubiquitination of H2AK119 in germline genes\n", + "N01615\tTransport of creatine\n", + "N01616\tDihydrolipoamide dehydrogenase\n", + "N01617\tCitrate cycle, second carbon oxidation 2\n", + "N01618\tProline biosynthesis, Orn to Pro\n", + "N01619\tBranched-chain amino acids degradation 2\n", + "N01620\tBlocking ubiquitination of H2AK119 by CK2\n", + "N01621\tTNF-RIPK1/3 signaling pathway\n", + "N01622\tProline degradation\n", + "N01623\tSpermine biosynthesis\n", + "N01624\tCholesterol biosynthesis\n", + "N01625\tCYLD regulation of RIPK1/3\n", + "N01626\tCholecalciferol biosynthesis\n", + "N01627\tAdenosine phosphorylation\n", + "N01628\tCysteine biosynthesis\n", + "N01629\tRemethylation, THF to 5-MTHF\n", + "N01630\tRemethylation, Hcy to Met\n", + "N01631\tTNFSF10-RIPK1/3 signaling pathway\n", + "N01632\tFASLG-RIPK1/3 signaling pathway\n", + "N01633\tTLR3-RIPK3 signaling pathway\n", + "N01634\tTLR4-RIPK3 signaling pathway\n", + "N01635\tMevalonate pathway\n", + "N01636\tLoading of the 
SMC5-SMC6 complex\n", + "N01637\tCa2+ entry, Voltage-gated Ca2+ channel\n", + "N01638\tSkeletal-type VGCC-RYR signaling\n", + "N01639\tCardiac-type VGCC-RYR signaling\n", + "N01640\tGPCR-PLCB-ITPR signaling pathway\n", + "N01641\tRTK-PLCG-ITPR signaling pathway\n", + "N01642\tCa2+ entry, Ligand-gated Ca2+ channel\n", + "N01643\tCa2+ entry, Store-operated Ca2+ channel\n", + "N01644\tLysosomal Ca2+ release\n", + "N01645\tCytosolic Ca2+ removal, SERCA\n", + "N01646\tRegulation of SERCA\n", + "N01647\tCa2+/CAM-CN signaling pathway\n", + "N01648\tCa2+/CAM-CAMK signaling pathway\n", + "N01649\tCa2+/CAM-VGCC/RYR signaling pathway\n", + "N01650\tSQSTM1 regulation of RIPK1/3\n", + "N01651\tBlood group H (O) antigen type 1 biosynthesis\n", + "N01652\tBlood group A antigen type 1 biosynthesis\n", + "N01653\tBlood group B antigen type 1 biosynthesis\n", + "N01654\tForssman blood group antigen biosynthesis\n", + "N01655\tCa2+-PLCD-ITPR signaling pathway\n", + "N01656\tGF-RTK-PI3K signaling pathway\n", + "N01657\tGPCR-PI3K signaling pathway\n", + "N01658\tGF-RTK-RAS-PI3K signaling pathway\n", + "N01659\tLewis b antigen biosynthesis\n", + "N01660\tLewis a antigen biosynthesis\n", + "N01661\tSialyl lewis a antigen biosynthesis\n", + "N01662\tIFN-RIPK1/3 signaling pathway\n", + "N01663\tCASP8 regulation of RIPK1/3\n", + "N01664\tBlood group A/B Lewis b antigen biosynthesis\n", + "N01666\tBlood group H (O) antigen type 2 biosynthesis\n", + "N01667\tBlood group A antigen type 2 biosynthesis\n", + "N01668\tBlood group B antigen type 2 biosynthesis\n", + "N01669\tBlood group A/B Lewis y antigen biosynthesis\n", + "N01670\tBlood group antigen type 3 biosynthesis\n", + "N01672\tLewis x antigen biosynthesis\n", + "N01673\tLewis y antigen biosynthesis\n", + "N01674\tSialyl lewis x antigen biosynthesis\n", + "N01675\tSID blood group Sd(a) antigen biosynthesis\n", + "N01676\tP1 antigen biosynthesis\n", + "N01677\tPX2 antigen biosynthesis\n", + "N01678\tIi blood group antigen biosynthesis\n", 
+ "N01679\tPk and P antigens biosynthesis\n", + "N01680\tNOR antigen biosynthesis\n", + "N01682\tBlood group A antigen type 4 (Globo-A) biosynthesis\n", + "N01683\tOh (Bombay), deficiency of ABH antigens\n", + "N01684\tLipoic acid biosynthesis\n", + "N01685\tLysine degradation 1\n", + "N01686\tLysine degradation 2\n", + "N01687\tLysine degradation 3\n", + "N01688\tADRB3-UCP1 signaling pathway\n", + "N01689\tFUT2 nonsecretor\n", + "N01690\tBlood group H antigen type 4 (Globo-H) biosynthesis\n", + "N01691\tmitochondrial complex - UCP1 in Thermogenesis\n", + "N01695\tBCR-BCAP/CD19-PI3K signaling pathway\n", + "N01696\tICOSLG/ICOS-PI3K signaling pathway\n", + "N01697\tP/PX2 negative, Pk positive\n", + "N01698\tP1/Pk/P/NOR all negative (P null)\n", + "N01699\tP1 negative\n", + "N01700\tLewis negative, Le (a-b-)\n", + "N01701\tTranscriptional activation by acetylation of H3K27\n", + "N01702\tSd(a) negative\n", + "N01703\tBlood group B antigen type 4 (Globo-B) biosynthesis\n", + "N01704\tI negative (adult i)\n", + "N01708\tINS-AKT signaling pathway\n", + "N01709\tHydrolysis of globoside\n", + "N01710\tHydrolysis of ganglioside\n", + "N01711\tHydrolysis of GA1\n", + "N01712\tHydrolysis of psychosine\n", + "N01713\tGM2A activation of HEXA and HEXB\n", + "N01714\tLoss of GM2A activation\n", + "N01715\tAutophagy-vesicle nucleation/elongation/maturation, PI3P synthesis by PI3KC3-C1\n", + "N01716\tAutophagy-vesicle nucleation/elongation/maturation, sequestosome-1-like receptor\n", + "N01717\tRegulation of autophagy-vesicle nucleation/elongation/maturation, ATXN3\n", + "N01718\tAutophagy-vesicle nucleation/elongation/maturation, PACER-RUBCN-PI3KC3-C2\n", + "N01719\tAutophagy-vesicle nucleation/elongation/maturation, E3 ubiquitin-ligase Malin\n", + "N01720\tAutophagosome and lysosome fusion, trans-SNARE\n", + "N01721\tAutophagosome and lysosome fusion, tethering factor\n", + "N01722\tAutophagosome and lysosome fusion, tethering factor, GRASP55\n", + "N01723\tNAD biosynthesis\n", 
+ "N01724\tNAD+ phosphorylation\n", + "N01725\tTetrahydrofolate biosynthesis\n", + "N01726\tFolate cycle\n", + "N01727\tHistidine degradation\n", + "N01729\tHistamine biosynthesis\n", + "N01741\tCa2+/TRPC3 signaling pathway\n", + "N01743\tRenin-angiotensin signaling pathway\n", + "N01746\tCCR/CXCR-GNB/G-PI3K signaling pathway\n", + "N01747\tFind-me signal (nucleotide)\n", + "N01748\tFind-me signal (LPC)\n", + "N01749\tFind-me signal (CX3CL1)\n", + "N01750\tFind-me signal (S1P)\n", + "N01751\tMacrophage EPO signaling\n", + "N01752\tTranslocation of phosphatidylserine to the inner leaflet\n", + "N01753\tExposure of phosphatidylserine to the outer leaflet\n", + "N01754\tActivation of XKR8\n", + "N01756\tPINK-Parkin-independent ubiquitin-mediated mitophagy\n", + "N01757\tPINK-Parkin-independent ubiquitin-mediated mitophagy, ubiquitin E3 ligase\n", + "N01758\tDesmosome - Vimentin filaments\n", + "N01759\tINK1-Parkin-mediated MFN2 degradation, VCP-OPA1\n", + "N01760\tEndosomal Rab cycles\n", + "N01761\tActivation of CRK-DOCK-Rac1 pathway\n", + "N01762\tMERTK-mediated recognition and engulfment\n", + "N01763\tMEGF10-mediated recognition and engulfment\n", + "N01764\tCalreticulin-LRP1 mediated recognition and engulfment\n", + "N01765\tCXCR4-GNAQ-PLCB/G signaling pathway\n", + "N01766\tCX3CR1-GNAI-AC signaling pathway\n", + "N01767\tCXCR4-GNAI-Src signaling pathway\n", + "N01768\tCXCR4-GNA12/13 signaling pathway\n", + "N01769\tCCR5-GNB/G-PLCB/G signaling pathway\n", + "N01770\tCCR2-GNB/G-PI3K signaling pathway\n", + "N01771\tCXCR4-GNB/G signaling pathway\n", + "N01772\tInduction of the PTGS2\n", + "N01773\tPTGS2-PGE2-TGFB1 pathway\n", + "N01774\tERK-DUSP4 negative feedback pathway\n", + "N01775\tInactivation of CaMKII by inducing SERCA2\n", + "N01776\tCaMK2-p38-MK2-ALOX5 pathway\n", + "N01777\tEfferocytosis-induced NAD production\n", + "N01778\tProduction of IL10 via the Sirtuin1 signaling cascade\n", + "N01779\tContinual efferocytosis enhanced by the AC-derived arginine 
and ornithine\n", + "N01780\tHydrolyzing AC-derived cholesterol esters in the lysosome\n", + "N01781\tActivation of LXRs by oxysterols\n", + "N01782\tGHRL-GHSR signaling\n", + "N01783\tNPPA-NPR1 signaling\n", + "N01784\tGlucose uptake and lactate release induced by efferocytosis\n", + "N01785\tDon't eat me signal (CD47)\n", + "N01786\tDon't eat me signal (CD24)\n", + "N01787\tNPPC-NPR2 signaling\n", + "N01788\tADIPOQ-ADIPOR signaling pathway\n", + "N01789\tBetaine metabolism\n", + "N01790\tTransport of dopamine into the neuron\n", + "N01791\tGlycine metabolism, Ser to Gly\n", + "N01792\tEDN-EDNR signaling pathway\n", + "N01793\tGAL-GALR signaling pathway\n", + "N01794\tHCRT-HCRTR signaling pathway\n", + "N01796\tTNFSF4-TNFRSF4 signaling pathway\n", + "N01797\tEDA-EDAR signaling pathway\n", + "N01798\tTNFSF11-TNFRSF11A signaling pathway\n", + "N01799\tCD70-CD27 signaling pathway\n", + "N01800\tLEP-LEPR signaling pathway\n", + "N01801\tTNFSF13-TNFRSF13B/C signaling pathway\n", + "N01802\tDihydrotestosterone biosynthesis\n", + "N01804\tIL3 family to Jak-STAT signaling pathway\n", + "N01806\tCobalamin (Vitamin B12) absorption\n", + "N01807\tTransfer of cobalamin to the portal blood\n", + "N01808\tIntracellular processing of cobalamin (reduction)\n", + "N01809\tMutation-caused epigenetic silencing of MMACHC\n", + "N01810\tRegulation of MMACHC expression\n", + "N01811\tMitochondrial adenocylation of cobalamin and loading onto MMUT\n", + "N01812\tCobalamin loading and activation of MTR\n", + "N01813\tEnhancement of NIPBL loading\n", + "N01814\tExtracellular matrix - Basal lamina\n", + "N01815\tVinculin-talin-integrin macromolecular complex\n", + "N01816\tCostamere\n", + "N01817\tMyosin thick filament\n", + "N01818\tActin thin filament, muscle contraction\n", + "N01819\tActin thin filament, length regulation\n", + "N01820\tSarcomere, Z-disc\n", + "N01821\tSarcomere, M-band\n", + "N01822\tLinker of nucleoskeleton and cytoskeleton (LINC) complex\n", + "N01823\tFGF23-NCC/NPT 
signaling pathway\n", + "N01824\tSGK1-NHERF1+NPT signaling pathway\n", + "N01831\tRegulation of VWF-GPIb-IX-V interaction, ADAMTS13\n", + "N01832\tNTN1-MAP1B axon guidance signaling\n", + "N01833\tDRAXIN-MAP1B axon guidance signaling\n", + "N01834\tSEMA3A-MAP1B axon guidance signaling\n", + "N01835\tSEMA3-CRMP2/MAPT axon guidance signaling\n", + "N01836\tMicrotubule plus end regulation network\n", + "N01837\tRegulation of neurite extension, NAV1-TRIO\n", + "N01838\tRegulation of synaptic plasticity, p140Cap\n", + "N01839\tSevering of microtubule, SPAST/KATN\n", + "N01840\tSevering of microtubule, KIF2A\n", + "N01841\tAnterograde axonal transport, Kinesin-2\n", + "N01842\tAnterograde axonal/dendrite transport, Kinesin-3\n", + "N01843\tAnterograde dendrite transport, Kinesin-4\n", + "N01844\tAnterograde dendrite transport, Kinesin-6\n", + "N01845\tAnterograde axonal/dendrite transport, Kinesin-12\n", + "N01846\tRetrograde axonal/dendrite transport, Dynein\n", + "N01847\tRegulation of dynein-mediated retrograde transport\n", + "N01848\tMembrane-associated periodic skeleton (MPS)\n", + "N01849\tAxonal actin ring structure\n", + "N01850\tMYO5B-mediated vesicle transport\n", + "N01851\tMYO5A-mediated vesicle transport\n", + "N01852\tMYO6-mediated vesicle transport\n", + "N01853\tNeurofilament structure\n", + "N01854\tNeurofilament regulation, ubiqutination by TRIM2\n", + "N01855\tNeurofilament regulation, ubiqutination by Gigaxonin\n", + "N01856\tCytomatrix at the active zone (CAZ) protein complex\n", + "N01857\tSEMA3A-DCX axon guidance signaling\n", + "N01858\tEFNB1-MAPT axon guidance signaling\n", + "N01859\tAnterograde axonal/dendrite transport, Kinesin-1\n", + "N01860\tGPI-anchor remodeling\n", + "N01867\tDemethylation of dimethylglycine\n", + "N01868\tDemethylation of sarcosine\n", + "N01869\tTHF conversion, THF to 5,10-MTHF\n", + "N01870\tHIF-2A signaling pathway\n", + "N01871\tHydroxylation of HIF\n", + "N01872\tProteasomal degradation of HIF by VHL complex\n", + 
"N01873\tVHL mutation to HIF-2 signaling pathway\n", + "N01874\tNRG-ERBB2/ERBB3 pathway (RAS-ERK signaling)\n", + "N01875\tNRG-ERBB2/ERBB3 pathway (P13K signaling)\n", + "N01876\tNRG1 fusion to NRG-ERBB2/ERBB3 pathway\n", + "N01877\tERBB4 mutation to GF-RTK-PI3K signaling pathway\n", + "N01878\tGlutamate-GRM-GNAQ/S signaling pathway\n", + "N01879\tGlutamate-GRM-GNAI/O signaling pathway\n", + "N01880\tGRM1/5-interacting scaffold proteins\n", + "N01881\tGRM1/5-interacting partners\n", + "N01882\tTransport of natrium, KA receptor\n", + "N01883\tTransport of natrium, AMPAR\n", + "N01884\tTransport of glutamate, EAAT\n", + "N01885\tTransport of glutamine, SNAT\n", + "N01886\tGlutamate transport in synapse\n", + "N01887\tTransport of chloride, GABAA receptor\n", + "N01888\tGABA-GABBR-GNAI/O signaling pathway\n", + "N01889\tGbeta/gamma-KCNJ signaling\n", + "N01890\tGephyrin-containing complex at inhibitory synapse\n", + "N01891\tGABAA receptor trafficking\n", + "N01892\tGABA metabolism and transport in glia\n", + "N01893\tGlutamine metabolism and transport in neuron\n", + "N01894\tAcetylcholine-CHRM-GNAQ/11 signaling pathway\n", + "N01895\tTransport of natrium/calcium, CHRN\n", + "N01896\tAcetylcholine metabolism and transport in neuron\n", + "N01897\tDopamine-DRD-GNAQ/S signaling pathway\n", + "N01898\tDopamine-DRD-GNAI/O signaling pathway\n", + "N01899\tGbeta/gamma-CACNA signaling\n", + "N01900\tSerotonin-HTR2-GNAQ/11 signaling pathway\n", + "N01901\tSerotonin-HTR1/5-GNAI/O signaling pathway\n", + "N01902\tTransport of serotonin, SLC6A4\n", + "N01903\tNorepinephrine-ADRA2-GNAI/O signaling pathway\n", + "N01904\tNorepinephrine-ADRB-GNAS signaling pathway\n", + "N01905\tAC-PKA-HCN signaling\n", + "N01906\tGlycine transport in neuron\n", + "N01907\tTransport of chloride, GLR\n", + "N01908\tADP/UDP-glucose-P2RY-GNAI/O signaling pathway\n", + "N01909\tTransport of calcium, P2RX\n", + "N01910\tAdenine nucleotide conversion\n", + "N01911\tTransport of ATP, SLC17A9\n", + 
"N01912\tHistamine metabolism and transport in neuron\n", + "N01913\tMelanocortin receptor signaling, MSH\n", + "N01914\tMelanocortin receptor signaling, AgRP\n", + "N01915\tTachykinin receptor signaling\n", + "N01916\tPreprohormone cleavage, POMC\n", + "N01917\tPreprohormone cleavage, PDYN\n", + "N01918\tDopamine metabolism in astrocyte\n", + "N01919\tDopamine/Adrenaline metabolism in presynaptic neuron\n", + "N01920\tTransport of norepinephrine into neuron\n", + "nt06031\tCitrate cycle and pyruvate metabolism\n", + "nt06017\tGlycogen metabolism\n", + "nt06023\tGalactose degradation\n", + "nt06020\tbeta-Oxidation in mitochondria\n", + "nt06021\tbeta-Oxidation in peroxisome\n", + "nt06034\tCholesterol biosynthesis\n", + "nt06019\tSteroid hormone biosynthesis\n", + "nt06022\tBile acid biosynthesis\n", + "nt06014\tSphingolipid degradation\n", + "nt06027\tPurine salvage pathway\n", + "nt06033\tGlycine, serine and arginine metabolism\n", + "nt06030\tMethionine metabolism\n", + "nt06024\tValine, leucine and isoleucine degradation\n", + "nt06036\tLysine degradation\n", + "nt06010\tUrea cycle\n", + "nt06037\tHistidine metabolism\n", + "nt06016\tPhenylalanine and tyrosine metabolism\n", + "nt06028\tDopamine and serotonin metabolism\n", + "nt06026\tGlutathione biosynthesis\n", + "nt06015\tN-Glycan biosynthesis\n", + "nt06013\tO-Glycan biosynthesis\n", + "nt06029\tGlycosaminoglycan biosynthesis\n", + "nt06012\tGlycosaminoglycan degradation\n", + "nt06018\tGPI-anchor biosynthesis\n", + "nt06035\tBlood group carbohydrate antigen biosynthesis\n", + "nt06032\tLipoic acid metabolism\n", + "nt06038\tFolate metabolism\n", + "nt06025\tMolybdenum cofactor biosynthesis\n", + "nt06011\tHeme biosynthesis\n", + "nt06538\tCobalamin transport and metabolism\n", + "nt06509\tDNA replication\n", + "nt06510\tTelomere length regulation\n", + "nt06504\tBase excision repair\n", + "nt06502\tNucleotide excision repair\n", + "nt06503\tMismatch repair\n", + "nt06506\tDouble-strand break repair\n", + 
"nt06508\tInterstrand crosslink repair\n", + "nt06526\tMAPK signaling\n", + "nt06530\tPI3K signaling\n", + "nt06505\tWNT signaling\n", + "nt06511\tNOTCH signaling\n", + "nt06501\tHH signaling\n", + "nt06507\tTGFB signaling\n", + "nt06518\tJAK-STAT signaling\n", + "nt06516\tTNF signaling\n", + "nt06528\tCalcium signaling\n", + "nt06522\tmTOR signaling\n", + "nt06542\tHIF signaling\n", + "nt06543\tNRG-ERBB signaling\n", + "nt06523\tEpigenetic regulation by Polycomb complexes\n", + "nt06512\tChromosome cohesion and segregation\n", + "nt06515\tRegulation of kinetochore-microtubule interactions\n", + "nt06534\tUnfolded protein response\n", + "nt06532\tAutophagy\n", + "nt06536\tMitophagy\n", + "nt06535\tEfferocytosis\n", + "nt06524\tApoptosis\n", + "nt06525\tFerroptosis\n", + "nt06527\tNecroptosis\n", + "nt06529\tThermogenesis\n", + "nt06539\tCytoskeleton in muscle cells\n", + "nt06541\tCytoskeleton in neurons\n", + "nt06544\tNeuroactive ligand signaling\n", + "nt06513\tComplement cascade\n", + "nt06514\tCoagulation cascade\n", + "nt06517\tTLR signaling\n", + "nt06521\tNLR signaling\n", + "nt06519\tRLR signaling\n", + "nt06520\tCGAS-STING signaling\n", + "nt06537\tTCR/BCR signaling\n", + "nt06533\tChemokine signaling\n", + "nt06310\tCRH-ACTH-cortisol signaling\n", + "nt06322\tTRH-TSH-TH signaling\n", + "nt06323\tKISS1-GnRH-LH/FSH-E2 signaling\n", + "nt06324\tGHRH-GH-IGF signaling\n", + "nt06318\tCaSR-PTH signaling\n", + "nt06316\tRenin-angiotensin-aldosterone signaling\n", + "nt06325\tHormone/cytokine signaling\n", + "nt06320\tAPOB-LDLR signaling\n", + "nt06260\tColorectal cancer\n", + "nt06261\tGastric cancer\n", + "nt06262\tPancreatic cancer\n", + "nt06263\tHepatocellular carcinoma\n", + "nt06264\tRenal cell carcinoma\n", + "nt06265\tBladder cancer\n", + "nt06266\tNon-small cell lung cancer\n", + "nt06267\tSmall cell lung cancer\n", + "nt06268\tMelanoma\n", + "nt06269\tBasal cell carcinoma\n", + "nt06270\tBreast cancer\n", + "nt06271\tEndometrial cancer\n", + 
"nt06272\tProstate cancer\n", + "nt06273\tGlioma\n", + "nt06274\tThyroid cancer\n", + "nt06275\tAcute myeloid leukemia\n", + "nt06276\tChronic myeloid leukemia\n", + "nt06210\tERK signaling (cancer)\n", + "nt06214\tPI3K signaling (cancer)\n", + "nt06213\tOther RAS signaling (cancer)\n", + "nt06211\tOther MAPK signaling (cancer)\n", + "nt06215\tWNT signaling (cancer)\n", + "nt06216\tNOTCH signaling (cancer)\n", + "nt06217\tHH signaling (cancer)\n", + "nt06218\tTGFB signaling (cancer)\n", + "nt06219\tJAK-STAT signaling (cancer)\n", + "nt06220\tCalcium signaling (cancer)\n", + "nt06234\tcAMP signaling (cancer)\n", + "nt06222\tIFN signaling (cancer)\n", + "nt06223\tTNF signaling (cancer)\n", + "nt06224\tCXCR signaling (cancer)\n", + "nt06225\tHIF-1 signaling (cancer)\n", + "nt06226\tKEAP1-NRF2 signaling (cancer)\n", + "nt06227\tNuclear receptor signaling (cancer)\n", + "nt06229\tMHC presentation (cancer)\n", + "nt06230\tCell cycle (cancer)\n", + "nt06231\tApoptosis (cancer)\n", + "nt06232\tTelomerase activity (cancer)\n", + "nt06240\tTranscription (cancer)\n", + "nt06250\tDNA adduct formation (cancer)\n", + "nt06251\tCYP-mediated ROS formation (cancer)\n", + "nt06252\tMitochondrial ROS formation (cancer)\n", + "nt06253\tAntioxidant system (cancer)\n", + "nt06460\tAlzheimer disease\n", + "nt06463\tParkinson disease\n", + "nt06464\tAmyotrophic lateral sclerosis\n", + "nt06461\tHuntington disease\n", + "nt06462\tSpinocerebellar ataxia\n", + "nt06465\tPrion disease\n", + "nt06466\tPathways of neurodegeneration\n", + "nt06360\tCushing syndrome\n", + "nt06160\tHuman T-cell leukemia virus 1 (HTLV-1)\n", + "nt06161\tHuman immunodeficiency virus 1 (HIV-1)\n", + "nt06162\tHepatitis B virus (HBV)\n", + "nt06163\tHepatitis C virus (HCV)\n", + "nt06171\tSARS coronavirus 2 (SARS-CoV-2)\n", + "nt06170\tInfluenza A virus (IAV)\n", + "nt06169\tMeasles virus (MV)\n", + "nt06168\tHerpes simplex virus 1 (HSV-1)\n", + "nt06167\tHuman cytomegalovirus (HCMV)\n", + "nt06164\tKaposi 
sarcoma-associated herpesvirus (KSHV)\n", + "nt06165\tEpstein-Barr virus (EBV)\n", + "nt06166\tHuman papillomavirus (HPV)\n", + "nt06180\tPathogenic Escherichia coli\n", + "nt06181\tSalmonella\n", + "nt06182\tShigella\n", + "nt06183\tYersinia\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest list network" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8bc3095e-6122-46cd-a7ff-7f77cbbaf28f", + "metadata": {}, + "outputs": [], + "source": [ + "#kegg_pull pull database network\n", + "\n", + "# Pulling all nodes in the network database. Will download it to current working directory. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "94ea7e25-deb4-4b13-8ca9-9e865f792ccd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "network KEGG Network Database\n", + "ne Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 1,637 entries\n", + "\n", + "linked db pathway\n", + " ko\n", + " hsa\n", + " compound\n", + " variant\n", + " disease\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info network" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "379395b3-8bb4-4282-9967-3b9305540771", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1415\n" + ] + } + ], + "source": [ + "kegg_pull rest link network pathway | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d53d358e-7f55-4a49-b277-d6781cabf389", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest link network pathway --output network_pathway.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "40c35d4e-4eee-4c23-97ba-e21410d74c4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1414 network_pathway.tsv\n" + ] + } + ], + "source": [ + "wc -l network_pathway.tsv" + ] + }, + { + "cell_type": "code", + 
"execution_count": 44, + "id": "ead20644-6632-4002-aae7-2e73962dafe8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "path:hsa05225\tne:N00005\n", + "path:hsa05211\tne:N00005\n", + "path:hsa05223\tne:N00007\n", + "path:hsa05216\tne:N00009\n", + "path:hsa05210\tne:N00012\n", + "path:hsa05212\tne:N00012\n", + "path:hsa05226\tne:N00012\n", + "path:hsa05216\tne:N00012\n", + "path:hsa05221\tne:N00012\n", + "path:hsa05213\tne:N00012\n" + ] + } + ], + "source": [ + "head network_pathway.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a697076f-118f-4151-8720-4b0bcda35a5d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "62077ac1-2eb4-421c-8133-8da3610b0c3b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1306\n" + ] + } + ], + "source": [ + "kegg_pull rest link network disease | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "4ad19eb5-09fd-4f88-8d22-ab04b1b0d12f", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest link network disease --output network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "460b3d26-0221-4347-abd4-860dd6b3e125", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1305 network_disease.tsv\n" + ] + } + ], + "source": [ + "wc -l network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a2841f84-d8fb-445d-8bef-47155a13cb3e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ds:H01489\tne:nt06018\n", + "ds:H01486\tne:nt06018\n", + "ds:H01488\tne:nt06018\n", + "ds:H01487\tne:nt06018\n", + "ds:H01127\tne:nt06018\n", + "ds:H01485\tne:nt06018\n", + "ds:H00216\tne:nt06019\n", + "ds:H02314\tne:nt06019\n", + "ds:H00259\tne:nt06019\n", + 
"ds:H01111\tne:nt06019\n" + ] + } + ], + "source": [ + "head network_disease.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8363fad9-f5b6-42b6-a3b6-dac6c86af932", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90785681-45ac-4f65-85a4-5fb5852acce2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "3a86642c-9caa-4ba4-a493-1811bb060cd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████| 1/1 [00:01<00:00, 1.37s/it]\n" + ] + } + ], + "source": [ + "kegg_pull pull entry-ids H01489" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81ff19a1-1016-4658-909a-7aaef6790c19", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "765b890a-bf17-43cc-8ac9-6e0b6b497e16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 76\n" + ] + } + ], + "source": [ + "kegg_pull rest link disease pathway | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "1744f449-033c-482d-9fd7-60f37a319fc5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "path:hsa05211\tds:H00021\n", + "path:hsa05110\tds:H00110\n", + "path:hsa05220\tds:H00004\n", + "path:hsa05210\tds:H00020\n", + "path:hsa05212\tds:H00019\n", + "path:hsa05217\tds:H00039\n", + "path:hsa05130\tds:H00277\n", + "path:hsa05130\tds:H00278\n", + "path:hsa05332\tds:H00084\n", + "path:hsa05132\tds:H00111\n", + "path:hsa05223\tds:H00014\n", + "path:hsa05135\tds:H00298\n", + "path:hsa05214\tds:H00042\n", + "path:hsa05221\tds:H00003\n", + "path:hsa05166\tds:H00009\n", + "path:hsa05226\tds:H00018\n", + "path:hsa05224\tds:H00031\n", + "path:hsa05216\tds:H00032\n", + 
"path:hsa05161\tds:H00412\n", + "path:hsa05144\tds:H00361\n" + ] + } + ], + "source": [ + "kegg_pull rest link disease pathway | head -20" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4931f2d0-e53e-4db3-b3ed-0b0d875d3384", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pathway KEGG Pathway Database\n", + "path Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 579 entries\n", + "\n", + "linked db module\n", + " ko\n", + " \n", + " genome\n", + " compound\n", + " glycan\n", + " reaction\n", + " rclass\n", + " enzyme\n", + " network\n", + " disease\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info pathway" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "a12375f3-3bcb-4f00-8662-01e6b941cfbb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "disease KEGG Disease Database\n", + "ds Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 2,900 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " ko\n", + " hsa\n", + " genome\n", + " network\n", + " variant\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info disease" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "c6dff0fa-7a12-4e0e-8af6-76f7d37adb71", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull rest list network --output kegg_network.txt" + ] + }, + { + "cell_type": "markdown", + "id": "bf0aa25e-dfae-4bef-aaaa-a45d3417c125", + "metadata": {}, + "source": [ + "## Getting the number of reference vs disease networks" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "488d4d96-4f3f-4725-930f-77efb4aac6a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████| 1637/1637 [11:53<00:00, 2.30it/s]\n" + ] + } + ], + "source": 
[ + "kegg_pull pull database network --output kegg_network" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "3c0adc9f-544a-4f52-9e67-232798caa6b0", + "metadata": {}, + "outputs": [], + "source": [ + "# Output file\n", + "output=\"kegg_network_types.tsv\"\n", + "> \"$output\" # Clear or create the file\n", + "\n", + "# Iterate over each .txt file in the kegg_network directory\n", + "for file in kegg_network/*.txt; do\n", + " # Get the filename without path and extension\n", + " base=$(basename \"$file\" .txt)\n", + "\n", + " # Extract the line containing TYPE\n", + " type_line=$(grep \"TYPE\" \"$file\")\n", + "\n", + " # Extract the TYPE line and remove the word \"TYPE\" and any whitespace\n", + " type_value=$(grep \"^TYPE\" \"$file\" | sed 's/TYPE[ \\t]*//')\n", + "\n", + " # Write to the output file\n", + " echo -e \"${base}\\t${type_value}\" >> \"$output\"\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "0ac320a9-9079-4fb6-81fd-975977a82db0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.17it/s]\n", + "hsa_var:1950v1\n", + "ENTRY 1950v1 Variant\n", + "NAME EGF overexpression\n", + "TYPE Gain of function\n", + "GENE EGF epidermal growth factor [KO:K04357]\n", + "ORGANISM hsa_var Human gene variants (Homo sapiens)\n", + "VARIATION overexpression\n", + "NETWORK nt06210 ERK signaling (cancer)\n", + " nt06214 PI3K signaling (cancer)\n", + " nt06260 Colorectal cancer\n", + " nt06526 MAPK signaling\n", + " nt06530 PI3K signaling\n", + "DISEASE H00020 Colorectal cancer\n", + "REFERENCE PMID:7912978\n", + " AUTHORS Hayashi Y, Widjono YW, Ohta K, Hanioka K, Obayashi C, Itoh K, Imai Y, Itoh H\n", + " TITLE Expression of EGF, EGF-receptor, p53, v-erb B and ras p21 in colorectal neoplasms by immunostaining paraffin-embedded tissues.\n", + " JOURNAL Pathol Int 44:124-30 (1994)\n", + " 
DOI:10.1111/j.1440-1827.1994.tb01696.x\n", + "REFERENCE PMID:15668269\n", + " AUTHORS Spano JP, Fagard R, Soria JC, Rixe O, Khayat D, Milano G\n", + " TITLE Epidermal growth factor receptor signaling in colorectal cancer: preclinical data and therapeutic perspectives.\n", + " JOURNAL Ann Oncol 16:189-94 (2005)\n", + " DOI:10.1093/annonc/mdi057\n", + "///\n", + "\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull pull entry-ids hsa_var:1950v1 --print" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "9dd433af-9634-4dd9-bdb1-e04d6f0610f1", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENTRY 1950 CDS T01001\n", + "SYMBOL EGF, HOMG4, URG\n", + "NAME (RefSeq) epidermal growth factor\n", + "ORTHOLOGY K04357 epidermal growth factor\n", + "ORGANISM hsa Homo sapiens (human)\n", + "PATHWAY hsa01521 EGFR tyrosine kinase inhibitor resistance\n", + " hsa04010 MAPK signaling pathway\n", + " hsa04012 ErbB signaling pathway\n", + " hsa04014 Ras signaling pathway\n", + " hsa04015 Rap1 signaling pathway\n", + " hsa04020 Calcium signaling pathway\n", + " hsa04066 HIF-1 signaling pathway\n", + " hsa04068 FoxO signaling pathway\n", + " hsa04072 Phospholipase D signaling pathway\n", + " hsa04151 PI3K-Akt signaling pathway\n", + " hsa04510 Focal adhesion\n", + " hsa04540 Gap junction\n", + " hsa04630 JAK-STAT signaling pathway\n", + " hsa04810 Regulation of actin cytoskeleton\n", + " hsa05160 Hepatitis C\n", + " hsa05165 Human papillomavirus infection\n", + " hsa05200 Pathways in cancer\n", + " hsa05207 Chemical carcinogenesis - receptor activation\n", + " hsa05208 Chemical carcinogenesis - reactive oxygen species\n", + " hsa05210 Colorectal cancer\n", + " hsa05212 Pancreatic cancer\n", + " hsa05213 Endometrial cancer\n", + " hsa05214 Glioma\n", + " hsa05215 Prostate cancer\n", + " hsa05218 Melanoma\n", + " hsa05219 Bladder cancer\n", + " hsa05223 Non-small cell lung cancer\n", + " hsa05224 
Breast cancer\n", + " hsa05226 Gastric cancer\n", + " hsa05231 Choline metabolism in cancer\n", + " hsa05235 PD-L1 expression and PD-1 checkpoint pathway in cancer\n", + "NETWORK nt06160 Human T-cell leukemia virus 1 (HTLV-1)\n", + " nt06162 Hepatitis B virus (HBV)\n", + " nt06163 Hepatitis C virus (HCV)\n", + " nt06164 Kaposi sarcoma-associated herpesvirus (KSHV)\n", + " nt06165 Epstein-Barr virus (EBV)\n", + " nt06166 Human papillomavirus (HPV)\n", + " nt06167 Human cytomegalovirus (HCMV)\n", + " nt06170 Influenza A virus (IAV)\n", + " nt06180 Pathogenic Escherichia coli\n", + " nt06182 Shigella\n", + " nt06210 ERK signaling (cancer)\n", + " nt06213 Other RAS signaling (cancer)\n", + " nt06214 PI3K signaling (cancer)\n", + " nt06219 JAK-STAT signaling (cancer)\n", + " nt06220 Calcium signaling (cancer)\n", + " nt06227 Nuclear receptor signaling (cancer)\n", + " nt06260 Colorectal cancer\n", + " nt06261 Gastric cancer\n", + " nt06262 Pancreatic cancer\n", + " nt06263 Hepatocellular carcinoma\n", + " nt06265 Bladder cancer\n", + " nt06266 Non-small cell lung cancer\n", + " nt06268 Melanoma\n", + " nt06270 Breast cancer\n", + " nt06271 Endometrial cancer\n", + " nt06273 Glioma\n", + " nt06274 Thyroid cancer\n", + " nt06276 Chronic myeloid leukemia\n", + " nt06526 MAPK signaling\n", + " nt06528 Calcium signaling\n", + " nt06530 PI3K signaling\n", + " ELEMENT N00001 EGF-EGFR-RAS-ERK signaling pathway\n", + " N00021 EGF-ERBB2-RAS-ERK signaling pathway\n", + " N00022 ERBB2-overexpression to RAS-ERK signaling pathway\n", + " N00023 EGF-EGFR-PLCG-ERK signaling pathway\n", + " N00026 EGF-EGFR-PLCG-CAMK signaling pathway\n", + " N00030 EGF-EGFR-RAS-PI3K signaling pathway\n", + " N00033 EGF-EGFR-PI3K signaling pathway\n", + " N00034 ERBB2-overexpression to PI3K signaling pathway\n", + " N00094 EGF-Jak-STAT signaling pathway\n", + " N00095 ERBB2-overexpression to EGF-Jak-STAT signaling pathway\n", + " N00096 EGF-EGFR-RAS-RASSF1 signaling pathway\n", + " N00103 
EGF-EGFR-RAS-RalGDS signaling pathway\n", + " N00147 EGF-EGFR-PLCG-calcineurin signaling pathway\n", + " N00252 Amplified ERBB2 to RAS-ERK signaling pathway\n", + " N00253 Amplified ERBB2 to PI3K signaling pathway\n", + " N00276 EGF-overexpression to RAS-ERK signaling pathway\n", + " N00281 EGF-overexpression to PI3K signaling pathway\n", + " N00390 EGF-EGFR-PI3K-NFKB signaling pathway\n", + " N00542 EGF-EGFR-RAS-JNK signaling pathway\n", + " N01078 EGF-EGFR-Actin signaling pathway\n", + " N01364 E2 to nuclear-initiated estrogen signaling pathway\n", + " N01592 GF-RTK-RAS-ERK signaling pathway\n", + " N01641 RTK-PLCG-ITPR signaling pathway\n", + " N01656 GF-RTK-PI3K signaling pathway\n", + " N01658 GF-RTK-RAS-PI3K signaling pathway\n", + "DISEASE H00020 Colorectal cancer\n", + " H01210 Hypomagnesemia\n", + "BRITE KEGG Orthology (KO) [BR:hsa00001]\n", + " 09130 Environmental Information Processing\n", + " 09132 Signal transduction\n", + " 04014 Ras signaling pathway\n", + " 1950 (EGF)\n", + " 04015 Rap1 signaling pathway\n", + " 1950 (EGF)\n", + " 04630 JAK-STAT signaling pathway\n", + " 1950 (EGF)\n", + " 04066 HIF-1 signaling pathway\n", + " 1950 (EGF)\n", + " 04068 FoxO signaling pathway\n", + " 1950 (EGF)\n", + " 04072 Phospholipase D signaling pathway\n", + " 1950 (EGF)\n", + " 04151 PI3K-Akt signaling pathway\n", + " 1950 (EGF)\n", + " 09160 Human Diseases\n", + " 09161 Cancer: overview\n", + " 05200 Pathways in cancer\n", + " 1950 (EGF)\n", + " 05207 Chemical carcinogenesis - receptor activation\n", + " 1950 (EGF)\n", + " 05208 Chemical carcinogenesis - reactive oxygen species\n", + " 1950 (EGF)\n", + " 05231 Choline metabolism in cancer\n", + " 1950 (EGF)\n", + " 05235 PD-L1 expression and PD-1 checkpoint pathway in cancer\n", + " 1950 (EGF)\n", + " 09162 Cancer: specific types\n", + " 05210 Colorectal cancer\n", + " 1950 (EGF)\n", + " 05212 Pancreatic cancer\n", + " 1950 (EGF)\n", + " 05226 Gastric cancer\n", + " 1950 (EGF)\n", + " 05214 Glioma\n", + " 1950 
(EGF)\n", + " 05218 Melanoma\n", + " 1950 (EGF)\n", + " 05219 Bladder cancer\n", + " 1950 (EGF)\n", + " 05215 Prostate cancer\n", + " 1950 (EGF)\n", + " 05213 Endometrial cancer\n", + " 1950 (EGF)\n", + " 05224 Breast cancer\n", + " 1950 (EGF)\n", + " 05223 Non-small cell lung cancer\n", + " 1950 (EGF)\n", + " 09172 Infectious disease: viral\n", + " 05160 Hepatitis C\n", + " 1950 (EGF)\n", + " 05165 Human papillomavirus infection\n", + " 1950 (EGF)\n", + " 09176 Drug resistance: antineoplastic\n", + " 01521 EGFR tyrosine kinase inhibitor resistance\n", + " 1950 (EGF)\n", + " 09180 Brite Hierarchies\n", + " 09183 Protein families: signaling and cellular processes\n", + " 04052 Cytokines and neuropeptides [BR:hsa04052]\n", + " 1950 (EGF)\n", + " Cytokines and neuropeptides [BR:hsa04052]\n", + " Cytokines\n", + " Growth factors (RTK binding)\n", + " 1950 (EGF)\n", + "POSITION 4:109912883..110013766\n", + "MOTIF Pfam: Ldl_recept_b FXa_inhibition cEGF EGF EGF_CA EGF_3 DUF5050 Vgb_lyase Plasmod_Pvs28\n", + "DBLINKS NCBI-GeneID: 1950\n", + " NCBI-ProteinID: NP_001954\n", + " OMIM: 131530\n", + " HGNC: 3229\n", + " Ensembl: ENSG00000138798\n", + " UniProt: P01133\n", + "STRUCTURE PDB\n", + "AASEQ 1207\n", + " MLLTLIILLPVVSKFSFVSLSAPQHWSCPEGTLAGNGNSTCVGPAPFLIFSHGNSIFRID\n", + " TEGTNYEQLVVDAGVSVIMDFHYNEKRIYWVDLERQLLQRVFLNGSRQERVCNIEKNVSG\n", + " MAINWINEEVIWSNQQEGIITVTDMKGNNSHILLSALKYPANVAVDPVERFIFWSSEVAG\n", + " SLYRADLDGVGVKALLETSEKITAVSLDVLDKRLFWIQYNREGSNSLICSCDYDGGSVHI\n", + " SKHPTQHNLFAMSLFGDRIFYSTWKMKTIWIANKHTGKDMVRINLHSSFVPLGELKVVHP\n", + " LAQPKAEDDTWEPEQKLCKLRKGNCSSTVCGQDLQSHLCMCAEGYALSRDRKYCEDVNEC\n", + " AFWNHGCTLGCKNTPGSYYCTCPVGFVLLPDGKRCHQLVSCPRNVSECSHDCVLTSEGPL\n", + " CFCPEGSVLERDGKTCSGCSSPDNGGCSQLCVPLSPVSWECDCFPGYDLQLDEKSCAASG\n", + " PQPFLLFANSQDIRHMHFDGTDYGTLLSQQMGMVYALDHDPVENKIYFAHTALKWIERAN\n", + " MDGSQRERLIEEGVDVPEGLAVDWIGRRFYWTDRGKSLIGRSDLNGKRSKIITKENISQP\n", + " RGIAVHPMAKRLFWTDTGINPRIESSSLQGLGRLVIASSDLIWPSGITIDFLTDKLYWCD\n", + " 
AKQSVIEMANLDGSKRRRLTQNDVGHPFAVAVFEDYVWFSDWAMPSVMRVNKRTGKDRVR\n", + " LQGSMLKPSSLVVVHPLAKPGADPCLYQNGGCEHICKKRLGTAWCSCREGFMKASDGKTC\n", + " LALDGHQLLAGGEVDLKNQVTPLDILSKTRVSEDNITESQHMLVAEIMVSDQDDCAPVGC\n", + " SMYARCISEGEDATCQCLKGFAGDGKLCSDIDECEMGVPVCPPASSKCINTEGGYVCRCS\n", + " EGYQGDGIHCLDIDECQLGEHSCGENASCTNTEGGYTCMCAGRLSEPGLICPDSTPPPHL\n", + " REDDHHYSVRNSDSECPLSHDGYCLHDGVCMYIEALDKYACNCVVGYIGERCQYRDLKWW\n", + " ELRHAGHGQQQKVIVVAVCVVVLVMLLLLSLWGAHYYRTQKLLSKNPKNPYEESSRDVRS\n", + " RRPADTEDGMSSCPQPWFVVIKEHQDLKNGGQPVAGEDGQAADGSMQPTSWRQEPQLCGM\n", + " GTEQGCWIPVSSDKGSCPQVMERSFHMPSYGTQTLEGGVEKPHSLLSANPLWQQRALDPP\n", + " HQMELTQ\n", + "NTSEQ 3624\n", + " atgctgctcactcttatcattctgttgccagtagtttcaaaatttagttttgttagtctc\n", + " tcagcaccgcagcactggagctgtcctgaaggtactctcgcaggaaatgggaattctact\n", + " tgtgtgggtcctgcacccttcttaattttctcccatggaaatagtatctttaggattgac\n", + " acagaaggaaccaattatgagcaattggtggtggatgctggtgtctcagtgatcatggat\n", + " tttcattataatgagaaaagaatctattgggtggatttagaaagacaacttttgcaaaga\n", + " gtttttctgaatgggtcaaggcaagagagagtatgtaatatagagaaaaatgtttctgga\n", + " atggcaataaattggataaatgaagaagttatttggtcaaatcaacaggaaggaatcatt\n", + " acagtaacagatatgaaaggaaataattcccacattcttttaagtgctttaaaatatcct\n", + " gcaaatgtagcagttgatccagtagaaaggtttatattttggtcttcagaggtggctgga\n", + " agcctttatagagcagatctcgatggtgtgggagtgaaggctctgttggagacatcagag\n", + " aaaataacagctgtgtcattggatgtgcttgataagcggctgttttggattcagtacaac\n", + " agagaaggaagcaattctcttatttgctcctgtgattatgatggaggttctgtccacatt\n", + " agtaaacatccaacacagcataatttgtttgcaatgtccctttttggtgaccgtatcttc\n", + " tattcaacatggaaaatgaagacaatttggatagccaacaaacacactggaaaggacatg\n", + " gttagaattaacctccattcatcatttgtaccacttggtgaactgaaagtagtgcatcca\n", + " cttgcacaacccaaggcagaagatgacacttgggagcctgagcagaaactttgcaaattg\n", + " aggaaaggaaactgcagcagcactgtgtgtgggcaagacctccagtcacacttgtgcatg\n", + " tgtgcagagggatacgccctaagtcgagaccggaagtactgtgaagatgttaatgaatgt\n", + " gctttttggaatcatggctgtactcttgggtgtaaaaacacccctggatcctattactgc\n", + " 
acgtgccctgtaggatttgttctgcttcctgatgggaaacgatgtcatcaacttgtttcc\n", + " tgtccacgcaatgtgtctgaatgcagccatgactgtgttctgacatcagaaggtccctta\n", + " tgtttctgtcctgaaggctcagtgcttgagagagatgggaaaacatgtagcggttgttcc\n", + " tcacccgataatggtggatgtagccagctctgcgttcctcttagcccagtatcctgggaa\n", + " tgtgattgctttcctgggtatgacctacaactggatgaaaaaagctgtgcagcttcagga\n", + " ccacaaccatttttgctgtttgccaattctcaagatattcgacacatgcattttgatgga\n", + " acagactatggaactctgctcagccagcagatgggaatggtttatgccctagatcatgac\n", + " cctgtggaaaataagatatactttgcccatacagccctgaagtggatagagagagctaat\n", + " atggatggttcccagcgagaaaggcttattgaggaaggagtagatgtgccagaaggtctt\n", + " gctgtggactggattggccgtagattctattggacagacagagggaaatctctgattgga\n", + " aggagtgatttaaatgggaaacgttccaaaataatcactaaggagaacatctctcaacca\n", + " cgaggaattgctgttcatccaatggccaagagattattctggactgatacagggattaat\n", + " ccacgaattgaaagttcttccctccaaggccttggccgtctggttatagccagctctgat\n", + " ctaatctggcccagtggaataacgattgacttcttaactgacaagttgtactggtgcgat\n", + " gccaagcagtctgtgattgaaatggccaatctggatggttcaaaacgccgaagacttacc\n", + " cagaatgatgtaggtcacccatttgctgtagcagtgtttgaggattatgtgtggttctca\n", + " gattgggctatgccatcagtaatgagagtaaacaagaggactggcaaagatagagtacgt\n", + " ctccaaggcagcatgctgaagccctcatcactggttgtggttcatccattggcaaaacca\n", + " ggagcagatccctgcttatatcaaaacggaggctgtgaacatatttgcaaaaagaggctt\n", + " ggaactgcttggtgttcgtgtcgtgaaggttttatgaaagcctcagatgggaaaacgtgt\n", + " ctggctctggatggtcatcagctgttggcaggtggtgaagttgatctaaagaaccaagta\n", + " acaccattggacatcttgtccaagactagagtgtcagaagataacattacagaatctcaa\n", + " cacatgctagtggctgaaatcatggtgtcagatcaagatgactgtgctcctgtgggatgc\n", + " agcatgtatgctcggtgtatttcagagggagaggatgccacatgtcagtgtttgaaagga\n", + " tttgctggggatggaaaactatgttctgatatagatgaatgtgagatgggtgtcccagtg\n", + " tgcccccctgcctcctccaagtgcatcaacaccgaaggtggttatgtctgccggtgctca\n", + " gaaggctaccaaggagatgggattcactgtcttgatattgatgagtgccaactgggggag\n", + " cacagctgtggagagaatgccagctgcacaaatacagagggaggctatacctgcatgtgt\n", + " gctggacgcctgtctgaaccaggactgatttgccctgactctactccaccccctcacctc\n", + 
" agggaagatgaccaccactattccgtaagaaatagtgactctgaatgtcccctgtcccac\n", + " gatgggtactgcctccatgatggtgtgtgcatgtatattgaagcattggacaagtatgca\n", + " tgcaactgtgttgttggctacatcggggagcgatgtcagtaccgagacctgaagtggtgg\n", + " gaactgcgccacgctggccacgggcagcagcagaaggtcatcgtggtggctgtctgcgtg\n", + " gtggtgcttgtcatgctgctcctcctgagcctgtggggggcccactactacaggactcag\n", + " aagctgctatcgaaaaacccaaagaatccttatgaggagtcgagcagagatgtgaggagt\n", + " cgcaggcctgctgacactgaggatgggatgtcctcttgccctcaaccttggtttgtggtt\n", + " ataaaagaacaccaagacctcaagaatgggggtcaaccagtggctggtgaggatggccag\n", + " gcagcagatgggtcaatgcaaccaacttcatggaggcaggagccccagttatgtggaatg\n", + " ggcacagagcaaggctgctggattccagtatccagtgataagggctcctgtccccaggta\n", + " atggagcgaagctttcatatgccctcctatgggacacagacccttgaagggggtgtcgag\n", + " aagccccattctctcctatcagctaacccattatggcaacaaagggccctggacccacca\n", + " caccaaatggagctgactcagtga\n", + "///\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest get hsa:1950" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "926ac49a-8cc3-472c-ae07-4ecd4b70f5aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T01001 Homo sapiens (human) KEGG Genes Database\n", + "hsa Release 114.0+/04-11, Apr 25\n", + " Kanehisa Laboratories\n", + " 24,685 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " module\n", + " ko\n", + " genome\n", + " enzyme\n", + " network\n", + " disease\n", + " drug\n", + " ncbi-geneid\n", + " ncbi-proteinid\n", + " uniprot\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info hsa" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "b58ec9a4-dce3-4900-8021-9bc9155a925e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "variant KEGG Variant Database\n", + "hsa_var Release 114.0+/04-12, Apr 25\n", + " Kanehisa Laboratories\n", + " 1,536 entries\n", + "\n", + "linked db network\n", + " disease\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + 
"source": [ + "kegg_pull rest info variant" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "cd56ab73-c7ab-4e9f-bf79-c1f2fe6c3b40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000v1\tAKT3 mutation\n", + "10026v1\tPIGK deficiency\n", + "10075v1\tHUWE1 mutation\n", + "100v1\tADA deficiency\n", + "10111v1\tRAD50 mutation\n", + "10133v1\tOPTN mutation\n", + "10133v2\tOPTN activating mutation\n", + "10157v1\tAASS deficiency\n", + "10195v1\tALG3 deficiency\n", + "1019v1\tCDK4 amplification\n", + "1019v2\tCDK4 mutation\n", + "10243v1\tGPHN deficiency\n", + "10274v1\tSTAG1 mutation\n", + "1027v1\tCDKN1B loss\n", + "1027v2\tCDKN1B reduced expression\n", + "1027v3\tCDKN1B mutation\n", + "10280v1\tSIGMAR1 mutation\n", + "10293v1\tTRAIP mutation\n", + "10297v1\tAPC2 mutation\n", + "1029v1\tCDKN2A deletion\n" + ] + } + ], + "source": [ + "kegg_pull rest list variant | head -20" + ] + }, + { + "cell_type": "markdown", + "id": "771ddf6d-dafc-4368-a5a3-0b6a2abdeeb3", + "metadata": {}, + "source": [ + "## Subsetting data to the Variant set of the networks" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "bc2e85c1-71d8-4c68-b0be-00ec72217cc7", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir network_variant" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "590d7a64-857a-44bc-9251-90cdbc1d4181", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "cp kegg_network/$p.txt network_variant/\n", + "\n", + "done < network_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "fce29a29-33ca-4a03-89e2-2ed4aafc929f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 298\n" + ] + } + ], + "source": [ + "ls network_variant/* | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "a339aed9-8ef3-421d-a454-5aea56b0124c", + "metadata": {}, + 
"outputs": [], + "source": [ + "#!/bin/bash\n", + "\n", + "output=\"gene_variants.tsv\"\n", + "> \"$output\" # Clear the output file\n", + "\n", + "for file in network_variant/*.txt; do\n", + " base=$(basename \"$file\" .txt)\n", + "\n", + " # Find and extract all matches of digits-v-digits\n", + " grep -oE \"[0-9]+v[0-9]+\" \"$file\" | while read -r match; do\n", + " echo -e \"${base}\\t${match}\" >> \"$output\"\n", + " done\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "279cc34b-6bd9-46d2-be11-785e9793ecfe", + "metadata": {}, + "outputs": [], + "source": [ + "sort gene_variants.tsv | uniq > temp.tsv && mv temp.tsv gene_variants.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "88ffb62a-be1a-4e05-854c-b061d596985c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 328 gene_variants.tsv\n" + ] + } + ], + "source": [ + "wc -l gene_variants.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "e1895ec2-6791-4daf-bcbf-3817e2e3a963", + "metadata": {}, + "outputs": [], + "source": [ + "cut -f 2 gene_variants.tsv > gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "21e29f30-fecb-4360-ae18-278757bdbd0e", + "metadata": {}, + "outputs": [], + "source": [ + "sort gene_variants.txt | uniq > temp.tsv && mv temp.tsv gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "015831af-db64-4386-b595-b2ceab4369d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 200 gene_variants.txt\n" + ] + } + ], + "source": [ + "wc -l gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "4ddf080a-1786-44ea-acdd-e7e474952ee3", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' 's/^/hsa_var:/' gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": 
"5bb68dd7-08e6-4c3d-99c0-6131df914af6", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir variant_info" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "f225e592-665e-4442-a385-73addd61b902", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████| 200/200 [00:51<00:00, 3.85it/s]\n" + ] + } + ], + "source": [ + "cat gene_variants.txt | kegg_pull pull entry-ids - --output=variant_info" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "679f5a1b-f4d1-4585-8a11-93bf81fcf795", + "metadata": {}, + "outputs": [], + "source": [ + "cat variant_info/* > all_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd86096d-cf04-4117-b4c2-9736327dcc86", + "metadata": {}, + "outputs": [], + "source": [ + "cp all_variants.txt all_variants_filtered.txt" + ] + }, + { + "cell_type": "markdown", + "id": "810dd902-7dad-4fe7-b028-7a865c9d35d6", + "metadata": {}, + "source": [ + "### Switching to python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bc079e4-bf59-404a-9f4b-c8b2b706a03a", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6851a309-c86f-477f-a355-feb3e959aa48", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def remove_references(text):\n", + " # This regex matches 'REFERENCE' lines and all subsequent indented lines (those starting with 2+ spaces)\n", + " cleaned_text = re.sub(r'REFERENCE\\s+PMID:\\d+\\n(?: {2}.*\\n)*', '', text)\n", + " return cleaned_text" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "98db6b07-480f-42e7-8b45-6f710a33b6ef", + "metadata": {}, + "outputs": [], + "source": [ + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_references(original_text)\n", + 
"\n", + "with open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfeab2a6-81b9-4d3a-b1bc-a6491294dd15", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dc011a9b-7c1b-40c8-86f8-1c1b28bae6b7", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_network(text):\n", + " lines = text.split('\\n')\n", + " cleaned_lines = []\n", + " skip_block = False\n", + "\n", + " for line in lines:\n", + " if line.startswith(\"NETWORK\"):\n", + " skip_block = True\n", + " continue\n", + " if skip_block:\n", + " if line.startswith(\" \") or line.startswith(\"\\t\"):\n", + " continue\n", + " else:\n", + " skip_block = False\n", + " if not skip_block:\n", + " cleaned_lines.append(line)\n", + "\n", + " return '\\n'.join(cleaned_lines)\n", + "\n", + "\n", + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_network(original_text)\n", + "\n", + "with open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "65f14020-a3b1-4c71-a529-421eb66c70f3", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_network(text):\n", + " lines = text.split('\\n')\n", + " cleaned_lines = []\n", + " skip_block = False\n", + "\n", + " for line in lines:\n", + " if line.startswith(\"DISEASE\"):\n", + " skip_block = True\n", + " continue\n", + " if skip_block:\n", + " if line.startswith(\" \") or line.startswith(\"\\t\"):\n", + " continue\n", + " else:\n", + " skip_block = False\n", + " if not skip_block:\n", + " cleaned_lines.append(line)\n", + "\n", + " return '\\n'.join(cleaned_lines)\n", + "\n", + "\n", + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_network(original_text)\n", + "\n", + "with 
open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c17ee2ae-a9ae-4ff3-b8ae-6a8bbc40e758", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_network(text):\n", + " lines = text.split('\\n')\n", + " cleaned_lines = []\n", + " skip_block = False\n", + "\n", + " for line in lines:\n", + " if line.startswith(\"DRUG_TARGET\"):\n", + " skip_block = True\n", + " continue\n", + " if skip_block:\n", + " if line.startswith(\" \") or line.startswith(\"\\t\"):\n", + " continue\n", + " else:\n", + " skip_block = False\n", + " if not skip_block:\n", + " cleaned_lines.append(line)\n", + "\n", + " return '\\n'.join(cleaned_lines)\n", + "\n", + "\n", + "with open('all_variants_filtered.txt', 'r') as f:\n", + " original_text = f.read()\n", + "\n", + "cleaned_text = remove_network(original_text)\n", + "\n", + "with open('all_variants_filtered.txt', 'w') as f:\n", + " f.write(cleaned_text)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "114de6af-3c21-40a6-ad39-76c6c85adf38", + "metadata": {}, + "source": [ + "Chatgpt to parse out this file and give me a tsv with 3 columns. Entry, Source and ID\n", + "\n", + "Source is which SNV database it is from. 
OmimVar, ClinVar, dbSNP, COSM, dbVar, or COSF" + ] + }, + { + "cell_type": "markdown", + "id": "8e397b34-94e0-4564-bb21-2a92d161b5af", + "metadata": {}, + "source": [ + "### Switch back to bash" + ] + }, + { + "cell_type": "markdown", + "id": "8fcdd7d0-27de-45f0-b6d3-96f9ee39f183", + "metadata": {}, + "source": [ + "# Downloading all Variant Information" + ] + }, + { + "cell_type": "markdown", + "id": "115df1ef-4e1c-4f31-babf-cd85960e6fea", + "metadata": {}, + "source": [ + "**Not using dbVar, as it has been discontinued and most of the links to dbVar are broken.** ClinVar is the alternative and holds all of the data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "61487285-a1af-4c1a-8b20-a52c8f26951b", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6ea7baea-ae92-4a93-a524-f44608dbe6d9", + "metadata": {}, + "outputs": [], + "source": [ + "rm all_variants.txt\n", + "rm all_variants_filtered.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b66473bd-cebc-4fa2-bdd8-310b67e82aaf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 60\n", + " 235\n", + " 201\n", + " 202\n", + " 28\n", + " 87\n" + ] + } + ], + "source": [ + "grep OmimVar parsed_variants.tsv | wc -l\n", + "grep ClinVar parsed_variants.tsv | wc -l\n", + "grep dbSNP parsed_variants.tsv | wc -l\n", + "grep COSM parsed_variants.tsv | wc -l\n", + "grep dbVar parsed_variants.tsv | wc -l\n", + "grep COSF parsed_variants.tsv | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "0eaac99f-2166-43c7-9b16-90616b71272d", + "metadata": {}, + "source": [ + "### OmimVar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a135480-1dab-491c-b92b-03e9cc579c71", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "esearch -db clinvar -query \"601556[mim]\" | efetch -format docsum" + ] + }, + { +
"cell_type": "markdown", + "id": "1b364d25-e8dc-4815-8a62-0dd00554d875", + "metadata": {}, + "source": [ + "From the output that you get, look for the variant ID in the output and then get that specific document summary" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cf01ffe2-79d1-4dd5-b130-2d1a07554b90", + "metadata": {}, + "outputs": [], + "source": [ + "grep OmimVar parsed_variants.tsv | cut -f3 > Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "f48e906b-70f7-420b-ba97-5105eda3c74d", + "metadata": {}, + "source": [ + "It is being really difficult to run this with a loop in bash, so just running it all manually like this" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3697f89a-6e17-435d-906d-927947259177", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db clinvar -query \"601978[mim]\" | efetch -format docsum > Omim/601978.xml\n", + "esearch -db clinvar -query \"602533[mim]\" | efetch -format docsum > Omim/602533.xml\n", + "esearch -db clinvar -query \"609007[mim]\" | efetch -format docsum > Omim/609007.xml\n", + "esearch -db clinvar -query \"111730[mim]\" | efetch -format docsum > Omim/111730.xml\n", + "esearch -db clinvar -query \"603448[mim]\" | efetch -format docsum > Omim/603448.xml\n", + "esearch -db clinvar -query \"608300[mim]\" | efetch -format docsum > Omim/608300.xml\n", + "esearch -db clinvar -query \"601143[mim]\" | efetch -format docsum > Omim/601143.xml\n", + "esearch -db clinvar -query \"614260[mim]\" | efetch -format docsum > Omim/614260.xml\n", + "esearch -db clinvar -query \"600543[mim]\" | efetch -format docsum > Omim/600543.xml\n", + "esearch -db clinvar -query \"605078[mim]\" | efetch -format docsum > Omim/605078.xml\n", + "esearch -db clinvar -query \"137070[mim]\" | efetch -format docsum > Omim/137070.xml\n", + "esearch -db clinvar -query \"211100[mim]\" | efetch -format docsum > Omim/211100.xml\n", + "esearch -db clinvar -query \"182100[mim]\" | efetch 
-format docsum > Omim/182100.xml\n", + "esearch -db clinvar -query \"111100[mim]\" | efetch -format docsum > Omim/111100.xml\n", + "esearch -db clinvar -query \"189980[mim]\" | efetch -format docsum > Omim/189980.xml\n", + "esearch -db clinvar -query \"606463[mim]\" | efetch -format docsum > Omim/606463.xml\n", + "esearch -db clinvar -query \"600429[mim]\" | efetch -format docsum > Omim/600429.xml\n", + "esearch -db clinvar -query \"603371[mim]\" | efetch -format docsum > Omim/603371.xml\n", + "esearch -db clinvar -query \"613109[mim]\" | efetch -format docsum > Omim/613109.xml\n", + "esearch -db clinvar -query \"604834[mim]\" | efetch -format docsum > Omim/604834.xml\n", + "esearch -db clinvar -query \"604473[mim]\" | efetch -format docsum > Omim/604473.xml\n", + "esearch -db clinvar -query \"300264[mim]\" | efetch -format docsum > Omim/300264.xml\n", + "esearch -db clinvar -query \"613004[mim]\" | efetch -format docsum > Omim/613004.xml\n", + "esearch -db clinvar -query \"308000[mim]\" | efetch -format docsum > Omim/308000.xml\n", + "esearch -db clinvar -query \"104760[mim]\" | efetch -format docsum > Omim/104760.xml\n", + "esearch -db clinvar -query \"102600[mim]\" | efetch -format docsum > Omim/102600.xml\n", + "esearch -db clinvar -query \"176264[mim]\" | efetch -format docsum > Omim/176264.xml\n", + "esearch -db clinvar -query \"605411[mim]\" | efetch -format docsum > Omim/605411.xml\n", + "esearch -db clinvar -query \"600734[mim]\" | efetch -format docsum > Omim/600734.xml\n", + "esearch -db clinvar -query \"607047[mim]\" | efetch -format docsum > Omim/607047.xml\n", + "esearch -db clinvar -query \"176763[mim]\" | efetch -format docsum > Omim/176763.xml\n", + "esearch -db clinvar -query \"602544[mim]\" | efetch -format docsum > Omim/602544.xml\n", + "esearch -db clinvar -query \"131340[mim]\" | efetch -format docsum > Omim/131340.xml\n", + "esearch -db clinvar -query \"176610[mim]\" | efetch -format docsum > Omim/176610.xml\n", + "esearch -db clinvar -query 
\"607922[mim]\" | efetch -format docsum > Omim/607922.xml\n", + "esearch -db clinvar -query \"176640[mim]\" | efetch -format docsum > Omim/176640.xml\n", + "esearch -db clinvar -query \"176801[mim]\" | efetch -format docsum > Omim/176801.xml\n", + "esearch -db clinvar -query \"104311[mim]\" | efetch -format docsum > Omim/104311.xml\n", + "esearch -db clinvar -query \"600759[mim]\" | efetch -format docsum > Omim/600759.xml\n", + "esearch -db clinvar -query \"601556[mim]\" | efetch -format docsum > Omim/601556.xml\n", + "esearch -db clinvar -query \"601517[mim]\" | efetch -format docsum > Omim/601517.xml\n", + "esearch -db clinvar -query \"612895[mim]\" | efetch -format docsum > Omim/612895.xml\n", + "esearch -db clinvar -query \"608309[mim]\" | efetch -format docsum > Omim/608309.xml\n", + "esearch -db clinvar -query \"163890[mim]\" | efetch -format docsum > Omim/163890.xml\n", + "esearch -db clinvar -query \"147450[mim]\" | efetch -format docsum > Omim/147450.xml\n", + "esearch -db clinvar -query \"604985[mim]\" | efetch -format docsum > Omim/604985.xml\n", + "esearch -db clinvar -query \"606765[mim]\" | efetch -format docsum > Omim/606765.xml\n", + "esearch -db clinvar -query \"602345[mim]\" | efetch -format docsum > Omim/602345.xml\n", + "esearch -db clinvar -query \"191110[mim]\" | efetch -format docsum > Omim/191110.xml\n", + "esearch -db clinvar -query \"191342[mim]\" | efetch -format docsum > Omim/191342.xml\n", + "esearch -db clinvar -query \"601023[mim]\" | efetch -format docsum > Omim/601023.xml\n", + "esearch -db clinvar -query \"608537[mim]\" | efetch -format docsum > Omim/608537.xml\n", + "esearch -db clinvar -query \"601011[mim]\" | efetch -format docsum > Omim/601011.xml\n", + "esearch -db clinvar -query \"114206[mim]\" | efetch -format docsum > Omim/114206.xml\n", + "esearch -db clinvar -query \"603094[mim]\" | efetch -format docsum > Omim/603094.xml\n", + "esearch -db clinvar -query \"601530[mim]\" | efetch -format docsum > Omim/601530.xml\n", + 
"esearch -db clinvar -query \"607904[mim]\" | efetch -format docsum > Omim/607904.xml\n", + "esearch -db clinvar -query \"605704[mim]\" | efetch -format docsum > Omim/605704.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "43c6599c-6ec6-48a5-8264-8e86c1869e63", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 58 Omim/OmimVar_id.txt\n" + ] + } + ], + "source": [ + "wc -l Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a56a9106-06c6-4d23-983f-227ca14f85a4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "601978 exists.\n", + "602533 exists.\n", + "609007 exists.\n", + "111730 exists.\n", + "603448 exists.\n", + "608300 exists.\n", + "601143 exists.\n", + "614260 exists.\n", + "600543 exists.\n", + "605078 exists.\n", + "137070 exists.\n", + "211100 exists.\n", + "182100 exists.\n", + "111100 exists.\n", + "189980 exists.\n", + "606463 exists.\n", + "600429 exists.\n", + "603371 exists.\n", + "613109 exists.\n", + "604834 exists.\n", + "604473 exists.\n", + "300264 exists.\n", + "613004 exists.\n", + "308000 exists.\n", + "104760 exists.\n", + "102600 exists.\n", + "176264 exists.\n", + "605411 exists.\n", + "600734 exists.\n", + "607047 exists.\n", + "176763 exists.\n", + "602544 exists.\n", + "131340 exists.\n", + "176610 exists.\n", + "607922 exists.\n", + "176640 exists.\n", + "176801 exists.\n", + "104311 exists.\n", + "600759 exists.\n", + "601556 exists.\n", + "601517 exists.\n", + "612895 exists.\n", + "608309 exists.\n", + "163890 exists.\n", + "147450 exists.\n", + "604985 exists.\n", + "606765 exists.\n", + "602345 exists.\n", + "191110 exists.\n", + "191342 exists.\n", + "601023 exists.\n", + "608537 exists.\n", + "601011 exists.\n", + "114206 exists.\n", + "603094 exists.\n", + "601530 exists.\n", + "607904 exists.\n", + "605704 exists.\n" + ] + } + ], + "source": [ 
+ "while read p; do\n", + "if test -f Omim/$p.xml; then\n", + " echo \"$p exists.\"\n", + "else\n", + " echo \"$p does not exist.\"\n", + "fi\n", + "done < Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "60f69460-27c4-4a92-995f-80a7540cb610", + "metadata": {}, + "source": [ + "Switch to python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eba16469-12ac-4d00-8888-d6d997ce29f4", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "420ee5bf-976b-4805-9895-93e202f20ba2", + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "e9e90379-be34-418e-ad34-1d78fca075f2", + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "def extract_linked_ids(xml_path, target_omim_prefix, outfile):\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + "\n", + " for variation_xrefs in root.iter('variation_xrefs'):\n", + " block = []\n", + " matched_omim_id = None\n", + "\n", + " for xref in variation_xrefs.findall('variation_xref'):\n", + " db_source = xref.findtext('db_source')\n", + " db_id = xref.findtext('db_id')\n", + "\n", + " if db_source and db_id:\n", + " if db_source == \"OMIM\" and db_id.startswith(target_omim_prefix):\n", + " matched_omim_id = db_id\n", + " block.append((db_source, db_id))\n", + "\n", + " if matched_omim_id:\n", + " outfile.write(f\"OMIM ID found: {matched_omim_id}\\n\")\n", + " for source, id_ in block:\n", + " if source != \"OMIM\":\n", + " outfile.write(f\"{source}:{id_}\\n\")\n", + " outfile.write(\"\\n\") # Blank line between blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "f2fb49e1-feb3-41c3-81a2-a1c5e5f0c9bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 
OMIM IDs: ['601978', '602533', '609007', '111730', '603448']\n" + ] + } + ], + "source": [ + "# Load OMIM IDs from file into a list\n", + "with open(\"Omim/OmimVar_id.txt\", \"r\") as f:\n", + " omim_ids = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Optional: print first few IDs\n", + "print(\"Loaded OMIM IDs:\", omim_ids[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "45c074e5-9026-49d0-8a03-18537db41451", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Fixed: 609007 → saved to Omim_fixed/609007.xml\n", + "✅ Fixed: 601143 → saved to Omim_fixed/601143.xml\n", + "✅ Fixed: 604985 → saved to Omim_fixed/604985.xml\n", + "✅ Fixed: 608537 → saved to Omim_fixed/608537.xml\n", + "✅ Fixed: 601011 → saved to Omim_fixed/601011.xml\n", + "✅ Fixed: 114206 → saved to Omim_fixed/114206.xml\n", + "✅ Fixed: 607904 → saved to Omim_fixed/607904.xml\n" + ] + } + ], + "source": [ + "import os\n", + "import re\n", + "\n", + "# There were issues with some XMLs being malformed. 
So editing the problematic ones to make one common root.\n", + "problematic_ids = [\n", + " \"609007\", \"601143\", \"604985\", \"608537\", \"601011\", \"114206\", \"607904\"\n", + "]\n", + "\n", + "input_folder = \"Omim\"\n", + "output_folder = \"Omim_fixed\"\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "for omim_id in problematic_ids:\n", + " input_file = os.path.join(input_folder, f\"{omim_id}.xml\")\n", + " output_file = os.path.join(output_folder, f\"{omim_id}.xml\")\n", + "\n", + " with open(input_file, \"r\") as f:\n", + " xml_content = f.read()\n", + "\n", + " # Remove leading/trailing whitespace\n", + " xml_content = xml_content.strip()\n", + "\n", + " # Remove any existing XML declaration or DOCTYPE lines\n", + " xml_content = re.sub(r'<\\?xml[^>]+\\?>', '', xml_content)\n", + " xml_content = re.sub(r']*>', '', xml_content)\n", + "\n", + " # Wrap content in and insert declarations at the top\n", + " fixed_xml = (\n", + " '\\n'\n", + " '\\n'\n", + " '\\n'\n", + " f'{xml_content.strip()}\\n'\n", + " ''\n", + " )\n", + "\n", + " # Write the fixed file\n", + " with open(output_file, \"w\") as f:\n", + " f.write(fixed_xml)\n", + "\n", + " print(f\"✅ Fixed: {omim_id} → saved to {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e34e0e1-96c3-492b-ae4b-1b725de062c2", + "metadata": {}, + "source": [ + "Iterating over all XMLs and parsing them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dae32152-5f63-4ba6-9968-b6d443189fca", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "21eaff4b-e351-466a-a13f-85b10da15803", + "metadata": {}, + "outputs": [], + "source": [ + "good_ids = [id for id in omim_ids if id not in problematic_ids]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d7075e77-a7ab-42c6-b4c5-91eedd698a05", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "51 
7\n" + ] + } + ], + "source": [ + "print(len(good_ids), len(problematic_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "4fdfe8dc-3826-407a-8472-7a63da9ca53c", + "metadata": {}, + "outputs": [], + "source": [ + "for id in good_ids:\n", + " with open(f'Omim/{id}_parsed.txt', \"w\") as f:\n", + " try:\n", + " extract_linked_ids(f'Omim/{id}.xml', id, f)\n", + " except:\n", + " print(id)\n", + " break\n", + " \n", + "for id in problematic_ids:\n", + " with open(f'Omim/{id}_parsed.txt', \"w\") as f:\n", + " try:\n", + " extract_linked_ids(f'Omim_fixed/{id}.xml', id, f)\n", + " except:\n", + " print(id)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48c5706f-14b9-4903-bf51-1df39bf700ea", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "if test -f Omim/\"$p\"_parsed.txt; then\n", + " echo \"$p exists.\"\n", + "else\n", + " echo \"$p does not exist.\"\n", + "fi\n", + "done < Omim/OmimVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9c028c9b-6d83-4718-9f69-cfb3ec9b85de", + "metadata": {}, + "outputs": [], + "source": [ + "cat Omim/*_parsed.txt > Omim_parsed.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d0634b17-0831-4db6-ae86-305c97735e8c", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' '/^ClinGen/d' Omim_parsed.txt\n", + "sed -i '' '/^UniProtKB/d' Omim_parsed.txt\n", + "sed -i '' '/^ClinVar/d' Omim_parsed.txt\n", + "sed -i '' '/^dbVar/d' Omim_parsed.txt\n", + "sed -i '' '/^Genetic/d' Omim_parsed.txt\n", + "sed -i '' '/^LOVD/d' Omim_parsed.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2101ed00-feb4-4004-981a-e3c61976d339", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Parsed file into Omim_parsed.tsv\n" + ] + } + ], + "source": [ + "#!/bin/bash\n", + "\n", + "input_file=\"Omim_parsed.txt\" # Your input file\n", + 
"output_file=\"Omim_parsed.tsv\" # Output TSV file\n", + "\n", + "# Write header\n", + "echo -e \"omim_id\\tdbsnp_id\" > \"$output_file\"\n", + "\n", + "# Initialize variables\n", + "omim_id=\"\"\n", + "dbsnp_id=\"\"\n", + "\n", + "# Read the file line-by-line\n", + "while IFS= read -r line || [ -n \"$line\" ]; do\n", + " # If it's an OMIM line\n", + " if [[ $line == OMIM\\ ID\\ found:* ]]; then\n", + " # If we had a previous OMIM without dbSNP, write it now\n", + " if [[ -n $omim_id ]]; then\n", + " echo -e \"${omim_id}\\t${dbsnp_id}\" >> \"$output_file\"\n", + " fi\n", + " omim_id=\"${line#OMIM ID found: }\"\n", + " dbsnp_id=\"\" # Reset dbSNP\n", + " elif [[ $line == dbSNP:* ]]; then\n", + " dbsnp_id=\"${line#dbSNP:}\"\n", + " fi\n", + "done < \"$input_file\"\n", + "\n", + "# Write the last record\n", + "if [[ -n $omim_id ]]; then\n", + " echo -e \"${omim_id}\\t${dbsnp_id}\" >> \"$output_file\"\n", + "fi\n", + "\n", + "echo \"✅ Parsed file into $output_file\"" + ] + }, + { + "cell_type": "markdown", + "id": "a35a9986-62df-4893-93de-21733eb68404", + "metadata": {}, + "source": [ + "Adding 624 dbSNP IDs to the dbSNP file for retrieval" + ] + }, + { + "cell_type": "markdown", + "id": "6acb6389-7b0e-4dc6-871c-952528646920", + "metadata": {}, + "source": [ + "### ClinVar" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "29531b7b-33f1-4e15-b0b9-8090c4bde11f", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "2f98ba8b-3120-416d-a13e-a8f7235992bd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " Build250414-1300.1\n", + " \n", + " 17584\n", + " single nucleotide variant\n", + " VCV000017584\n", + " VCV000017584.5\n", + " NM_001904.4(CTNNB1):c.101G>A (p.Gly34Glu)\n", + " \n", + " \n", + " 32623\n", + " \n", + " \n", + " ClinGen\n", + " CA127277\n", + " \n", + " \n", + " 
UniProtKB\n", + " P35222#VAR_017620\n", + " \n", + " \n", + " OMIM\n", + " 116806.0008\n", + " \n", + " \n", + " dbSNP\n", + " 28931589\n", + " \n", + " \n", + " NM_001904.4(CTNNB1):c.101G>A (p.Gly34Glu)\n", + " c.101G>A\n", + " \n", + " \n", + " current\n", + " GRCh38\n", + " 3\n", + " 3p22.1\n", + " 41224613\n", + " 41224613\n", + " 41224613\n", + " 41224613\n", + " GCF_000001405.38\n", + " \n", + " \n", + " previous\n", + " GRCh37\n", + " 3\n", + " 3p22.1\n", + " 41266104\n", + " 41266104\n", + " 41266104\n", + " 41266104\n", + " GCF_000001405.25\n", + " \n", + " \n", + " \n", + " \n", + " Exome Aggregation Consortium (ExAC)\n", + " 0.00001\n", + " \n", + " \n", + " single nucleotide variant\n", + " NC_000003.12:41224612:G:A\n", + " \n", + " \n", + " \n", + " \n", + " SCV000039437\n", + " SCV000599908\n", + " \n", + " \n", + " RCV000019149\n", + " RCV000443977\n", + " \n", + " \n", + " \n", + " Pathogenic; other\n", + " 2016/05/01 00:00\n", + " no assertion criteria provided\n", + " \n", + " \n", + " \n", + " \n", + " Orphanet\n", + " 616\n", + " \n", + " \n", + " MedGen\n", + " C0025149\n", + " \n", + " \n", + " MeSH\n", + " D008527\n", + " \n", + " \n", + " MONDO\n", + " MONDO:0007959\n", + " \n", + " \n", + " OMIM\n", + " 155255\n", + " \n", + " \n", + " Human Phenotype Ontology\n", + " HP:0002885\n", + " \n", + " \n", + " Medulloblastoma\n", + " \n", + " \n", + " \n", + " \n", + " Orphanet\n", + " 91414\n", + " \n", + " \n", + " MedGen\n", + " C0206711\n", + " \n", + " \n", + " MeSH\n", + " D018296\n", + " \n", + " \n", + " MONDO\n", + " MONDO:0007564\n", + " \n", + " \n", + " OMIM\n", + " 132600\n", + " \n", + " \n", + " Human Phenotype Ontology\n", + " HP:0030434\n", + " \n", + " \n", + " Pilomatrixoma\n", + " \n", + " \n", + " \n", + " \n", + " 1/01/01 00:00\n", + " \n", + " \n", + " 1/01/01 00:00\n", + " \n", + " CTNNB1\n", + " 03\n", + " 00000000000041224613\n", + " \n", + " \n", + " CTNNB1\n", + " 1499\n", + " +\n", + " submitted\n", + " \n", + " \n", 
+ " LOC126806658\n", + " 126806658\n", + " +\n", + " submitted\n", + " \n", + " \n", + " \n", + " missense variant\n", + " \n", + " G34E, G27E\n", + " \n", + "\n" + ] + } + ], + "source": [ + "esearch -db clinvar -query 17584 | efetch -format docsum" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "04a62ef6-5c7d-4644-bc8f-26148b470dbf", + "metadata": {}, + "outputs": [], + "source": [ + "grep ClinVar parsed_variants.tsv | cut -f3 > ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "29542800-1697-481f-a0b7-5236dee9752e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 232 ClinVar/ClinVar_id.txt\n" + ] + } + ], + "source": [ + "wc -l ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "e37a7a54-6049-4a86-808d-37a3f64721ac", + "metadata": {}, + "source": [ + "Saved all of the esearch queries to clinvar_esearch.sh . 232 of them" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6c56c9f6-a6f5-487f-b474-a0dcf4b0f763", + "metadata": {}, + "outputs": [], + "source": [ + "chmod +x ClinVar/clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "78286511-609e-4f02-9717-4d094e9feebb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 232 ClinVar/clinvar_esearch.sh\n" + ] + } + ], + "source": [ + "wc -l ClinVar/clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "321b9cb0-0cc4-4442-bcbc-01e0d7a2f50c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ClinVar/./clinvar_esearch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d6ae0dde-45c0-4853-b5dc-11ed6d195eec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "376308 is empty\n", + "376242 is empty\n", + "376235 is empty\n", + "376233 is empty\n", + "375895 
is empty\n", + "376282 is empty\n", + "376280 is empty\n", + "396706 is empty\n", + "375971 is empty\n", + "376068 is empty\n", + "376728 is empty\n", + "160870 is empty\n", + "376464 is empty\n", + "376461 is empty\n", + "375873 is empty\n", + "376220 is empty\n", + "375871 is empty\n", + "375872 is empty\n", + "376221 is empty\n", + "376069 is empty\n" + ] + } + ], + "source": [ + "while read p; do\n", + "[ -s ClinVar/$p.xml ] || echo \"$p is empty\"\n", + "done < ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "db67cf54-0487-4736-a323-5b0417b50295", + "metadata": {}, + "source": [ + "The 20 XMLs listed above are empty because the corresponding ClinVar records have been deleted, so they cannot be retrieved." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "61f507ed-7ccf-4a43-aee9-0f70aea28791", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db clinvar -query 177620 | efetch -format docsum > ClinVar/177620.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f3b18da5-4f8f-46be-8a51-f20fb15a40fc", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "[ -s ClinVar/$p.xml ] || rm ClinVar/$p.xml\n", + "done < ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "fd9671a2-9c21-46be-ab39-37a871ffbebd", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' '/^376308$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376242$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376235$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376233$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^375895$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376282$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376280$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^396706$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^375971$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376068$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376728$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^160870$/d' 
ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376464$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376461$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^375873$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376220$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^375871$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^375872$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376221$/d' ClinVar/ClinVar_id.txt\n", + "sed -i '' '/^376069$/d' ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "88d4c572-be3d-4b9c-a00e-d194f2b46351", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 212 ClinVar/ClinVar_id.txt\n" + ] + } + ], + "source": [ + "wc -l ClinVar/ClinVar_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "36907abc-374c-4803-bc4f-bac8890246b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 214\n" + ] + } + ], + "source": [ + "ls ClinVar | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "90173878-a854-4453-83a4-f32465f9425a", + "metadata": {}, + "source": [ + "214 is good and checks out. 
214 - 2 = 212 (the two extra entries in the ClinVar folder are ClinVar_id.txt and clinvar_esearch.sh), which matches the number of IDs we have." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1371ee83-ef5a-48fa-b43d-f880a744a5ae", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d4e6c38d-a397-423a-ac26-02b1240cab25", + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "import os\n", + "\n", + "# Paths\n", + "id_file = \"ClinVar/ClinVar_id.txt\"\n", + "input_folder = \"ClinVar\"\n", + "output_file = \"ClinVar_parsed_output.tsv\"\n", + "\n", + "# Read all IDs from the input file\n", + "with open(id_file, \"r\") as f:\n", + " clinvar_ids = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Prepare output file\n", + "with open(output_file, \"w\") as out:\n", + " # Write header\n", + " out.write(\"ClinVar_ID\\tseq_id\\tposition\\tref\\talt\\n\")\n", + "\n", + " for cid in clinvar_ids:\n", + " xml_path = os.path.join(input_folder, f\"{cid}.xml\")\n", + " if not os.path.exists(xml_path):\n", + " print(f\"⚠️ File not found: {xml_path}\")\n", + " continue\n", + "\n", + " try:\n", + " # Parse XML\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + "\n", + " # Find all canonical_spdi tags\n", + " for spdi in root.iter(\"canonical_spdi\"):\n", + " text = spdi.text\n", + " if text and \":\" in text:\n", + " parts = text.split(\":\")\n", + " if len(parts) == 4:\n", + " seq_id, pos, ref, alt = parts\n", + " out.write(f\"{cid}\\t{seq_id}\\t{pos}\\t{ref}\\t{alt}\\n\")\n", + "\n", + " except ET.ParseError as e:\n", + " print(f\"❌ Parse error in {cid}.xml: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ad902e02-e988-4b69-ac36-435bfd317c9f", + "metadata": {}, + "outputs": [], + "source": [ + "parsed = 
['16928','16929','183391','183393','183395','8823','420108','9409','376307','220711','376310','376305','376303','77637','376384','233484','127526','182409','376306','182423','17577','17576','17580','17587','17588','17579','17583','17582','376231','17584','376232','17589','17578','376228','177620','16609','45263','16613','16339','16359','16332','16333','16342','16348','16276','16273','16272','375972','16274','15933','15934','15935','15936','801','184937','802','12602','12613','35554','180848','160364','376033','219296','9834','9381','39571','39572','14801','13860','13863','13852','12582','12583','12580','12578','16677','16685','16686','16688','186141','13881','13882','13886','13883','376126','13888','13889','13890','162466','162468','162465','375876','13901','13900','373003','39648','73058','375874','177778','162469','162470','5286','225431','225433','225434','225432','31944','13655','13652','13653','13659','91945','12674','164995','13244','13245','13246','13247','13251','13250','13249','409162','418436','7829','427590','187657','7814','7837','7836','7838','7833','189486','428256','186396','404151','375958','7813','189403','185200','189484','7815','92828','189448','9511','9512','13087','428681','13919','13911','38629','37102','13951','8117','8118','13961','375941','12511','213936','217016','12374','12356','12366','12347','12365','43594','12364','12355','127819','376570','12372','2216','43604','93326','2223','417961','14464','6390','41166','41209','4893','4886','4892','161992','161993','161995']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ae292770-d8b9-4d11-a20a-b8d336d2aed3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "185" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(parsed)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "57ccde64-cc40-4c96-b334-455be7752d3f", + "metadata": {}, + "outputs": [], + "source": [ + "remaining 
= [id for id in clinvar_ids if id not in parsed]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "49d32e6a-58f7-4325-b779-592fdb9addc6", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir ClinVar_remaining" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "28893df1-1863-4442-8d3c-3c944cba9244", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Copied: 268075.xml\n", + "✅ Copied: 150740.xml\n", + "✅ Copied: 59680.xml\n", + "✅ Copied: 59682.xml\n", + "✅ Copied: 148363.xml\n", + "✅ Copied: 58696.xml\n", + "✅ Copied: 57282.xml\n", + "✅ Copied: 59782.xml\n", + "✅ Copied: 153718.xml\n", + "✅ Copied: 148679.xml\n", + "✅ Copied: 16270.xml\n", + "✅ Copied: 59715.xml\n", + "✅ Copied: 394884.xml\n", + "✅ Copied: 153231.xml\n", + "✅ Copied: 151754.xml\n", + "✅ Copied: 149554.xml\n", + "✅ Copied: 153441.xml\n", + "✅ Copied: 148269.xml\n", + "✅ Copied: 57074.xml\n", + "✅ Copied: 394609.xml\n", + "✅ Copied: 58030.xml\n", + "✅ Copied: 58029.xml\n", + "✅ Copied: 58028.xml\n", + "✅ Copied: 441904.xml\n", + "✅ Copied: 146814.xml\n", + "✅ Copied: 144406.xml\n", + "✅ Copied: 57042.xml\n" + ] + } + ], + "source": [ + "import os\n", + "import shutil \n", + "# Paths\n", + "source_dir = \"ClinVar\"\n", + "dest_dir = \"ClinVar_remaining\"\n", + "\n", + "# Ensure destination folder exists\n", + "os.makedirs(dest_dir, exist_ok=True)\n", + "\n", + "# Iterate and copy files\n", + "for clinvar_id in remaining:\n", + " src = os.path.join(source_dir, f\"{clinvar_id}.xml\")\n", + " dst = os.path.join(dest_dir, f\"{clinvar_id}.xml\")\n", + "\n", + " if os.path.exists(src):\n", + " shutil.copy(src, dst)\n", + " print(f\"✅ Copied: {clinvar_id}.xml\")\n", + " else:\n", + " print(f\"⚠️ Missing: {clinvar_id}.xml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8cd41b62-cb7b-4380-9d79-de1befc1637c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + " 27\n" + ] + } + ], + "source": [ + "!ls Clinvar_remaining | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "eb58056e-71ad-4602-847f-457366f2b963", + "metadata": {}, + "outputs": [], + "source": [ + "!cat Clinvar_remaining/* > Clinvar_remaining/all_remaining_variants.xml" + ] + }, + { + "cell_type": "markdown", + "id": "2b779d44-a620-42c8-8f0c-5ab96dcec165", + "metadata": {}, + "source": [ + "They are all copy number gain variations. Nothing that I can do for this project. So we will stick with our 185 parsed" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "29e34b3a-8a05-448a-a22f-6f9a0712f221", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 186 ClinVar_parsed_output.tsv\n" + ] + } + ], + "source": [ + "!wc -l ClinVar_parsed_output.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb3f65c6-6c01-4aab-ba6b-b46c7834ff1e", + "metadata": {}, + "outputs": [], + "source": [ + "rm -r Clinvar_remaining" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db6bf93c-404b-43fd-8699-5cd8de1ae03b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a0610722-b83d-4d72-b0cf-75135eaa7141", + "metadata": {}, + "source": [ + "### dbSNP" + ] + }, + { + "cell_type": "markdown", + "id": "7b650b97-61c1-4db3-8e0c-9c4df6d4aac6", + "metadata": {}, + "source": [ + "Have to get the variants from OmimVar" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "470b0539-8f7f-49f0-8f95-e9293e3872d3", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "751381ad-54e6-4063-a1f2-71c4ffb2e617", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " Build250306-1408.1\n", + " \n", + " 1131690863\n", + " 
1131690863\n", + " 0\n", + " uncertain-significance,pathogenic\n", + " \n", + " \n", + " RB1\n", + " 5925\n", + " \n", + " \n", + " LOC112268118\n", + " 112268118\n", + " \n", + " \n", + " NC_000013.11\n", + " 13\n", + " EVA,CSS-BFX,CLINVAR\n", + " NC_000013.11:48362846:C:A,NC_000013.11:48362846:C:G,NC_000013.11:48362846:C:T\n", + " coding_sequence_variant,stop_gained,500B_downstream_variant,synonymous_variant,missense_variant,downstream_transcript_variant\n", + " by-cluster\n", + " HGVS=NC_000013.11:g.48362847C>A,NC_000013.11:g.48362847C>G,NC_000013.11:g.48362847C>T,NC_000013.10:g.48936983C>A,NC_000013.10:g.48936983C>G,NC_000013.10:g.48936983C>T,NG_009009.1:g.64101C>A,NG_009009.1:g.64101C>G,NG_009009.1:g.64101C>T,NM_000321.3:c.751C>A,NM_000321.3:c.751C>G,NM_000321.3:c.751C>T,NM_000321.2:c.751C>A,NM_000321.2:c.751C>G,NM_000321.2:c.751C>T,NM_001407166.1:c.751C>A,NM_001407166.1:c.751C>G,NM_001407166.1:c.751C>T,NM_001407165.1:c.751C>A,NM_001407165.1:c.751C>G,NM_001407165.1:c.751C>T,NP_000312.2:p.Arg251Gly,NP_000312.2:p.Arg251Ter|SEQ=[C/A/G/T]|LEN=1|GENE=RB1:5925,LOC112268118:112268118\n", + " 9606\n", + " 150\n", + " 157\n", + " 2017/07/17 11:16\n", + " 2024/11/03 17:09\n", + " 2137537937,6403986513,8442109874,8936184886\n", + " N\n", + " snv\n", + " 13:48362847\n", + " 13:48936983\n", + " 1131690863\n", + " 1\n", + " 0048362847\n", + " 0\n", + " \n", + "\n" + ] + } + ], + "source": [ + "esearch -db snp -query rs1131690863 | efetch -format docsum" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5cc99d6c-79f3-49fe-8bfa-ddda5821870a", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir dbSNP" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0fe72bad-ed7c-4785-a475-2b39cc31974b", + "metadata": {}, + "outputs": [], + "source": [ + "grep dbSNP parsed_variants.tsv | cut -f3 > dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6f19e412-c93a-498e-8995-e8ec7b2a2398", + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 201 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "f9f1615b-81eb-48a7-adfd-b6f5a0161e79", + "metadata": {}, + "source": [ + "Added from Omim and removed repeats" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bb0fcc0-8f66-4fc2-9b7a-02b321194636", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 761 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e2e729f7-4bec-4f40-986c-fd0103186030", + "metadata": {}, + "outputs": [], + "source": [ + "# Saved the scripts to download all 761\n", + "chmod +x dbSNP/dbSNP_search.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2b271743-e43d-46ef-b689-2f928007cb4f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b(B\u001b[m\u001b[31m\u001b[1m\u001b[7m ERROR: \u001b(B\u001b[m\u001b[31m\u001b[1m Missing -db argument\u001b(B\u001b[m\n", + "\u001b(B\u001b[m\u001b[31m\u001b[1m\u001b[7m ERROR: \u001b(B\u001b[m\u001b[31m\u001b[1m Missing -db argument\u001b(B\u001b[m\n" + ] + } + ], + "source": [ + "dbSNP/./dbSNP_search.sh" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1bf2df42-f562-4a29-85b2-7caea7919d58", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "if ! 
test -f dbSNP/\"$p\".xml; then\n", + " echo \"$p does not exist.\"\n", + "fi\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1b8d5d2d-83b1-4e23-8595-00c9ad01148b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs121908237 is empty\n", + "rs137852480 is empty\n", + "rs13785281 is empty\n" + ] + } + ], + "source": [ + "while read p; do\n", + "[ -s dbSNP/\"$p\".xml ] || echo \"$p is empty\"\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "8a5de403-f6da-402b-a97a-32127db07014", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "esearch -db snp -query rs121908237 | efetch -format docsum > dbSNP/rs121908237.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e3dd144a-4aaa-4d72-84ee-2c43ce48f627", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db snp -query rs137852480 | efetch -format docsum > dbSNP/rs137852480.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "93b02ebc-fe12-47a5-aa35-1c51445fcdcf", + "metadata": {}, + "outputs": [], + "source": [ + "esearch -db snp -query rs13785281 | efetch -format docsum > dbSNP/rs13785281.xml" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "5bc843db-1b32-4ec7-82f6-d1dec6589415", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs13785281 is empty\n" + ] + } + ], + "source": [ + "while read p; do\n", + "[ -s dbSNP/\"$p\".xml ] || echo \"$p is empty\"\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "e55ec094-ee54-459d-92b0-2d767c1c428e", + "metadata": {}, + "source": [ + "rs13785281 is not found and is removed from the id file" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0558f5d9-0627-4187-a451-90111fa2b1d9", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + " 760 dbSNP/dbSNP_id.txt\n" + ] + } + ], + "source": [ + "wc -l dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "1f09c627-0e45-4632-9918-0ee00e34350b", + "metadata": {}, + "outputs": [], + "source": [ + "while read -r p; do\n", + " if ! grep -q SPDI \"dbSNP/$p.xml\"; then\n", + " echo \"$p\"\n", + " fi\n", + "done < dbSNP/dbSNP_id.txt" + ] + }, + { + "cell_type": "markdown", + "id": "1873b4d1-96bd-431a-98ad-b47c361bbefb", + "metadata": {}, + "source": [ + "No output means every file has SPDI. yay" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bfd640d-765a-4cb8-a459-27c6ff897572", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9a487229-2a65-4437-9c58-639774197373", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import xml.etree.ElementTree as ET\n", + "\n", + "# File paths\n", + "input_ids_file = \"dbSNP/dbSNP_id.txt\"\n", + "input_folder = \"dbSNP\"\n", + "output_file = \"dbSNP_output.tsv\"\n", + "\n", + "# Read SNP IDs\n", + "with open(input_ids_file, \"r\") as f:\n", + " dbsnp_ids = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Open TSV output file\n", + "with open(output_file, \"w\") as out:\n", + " out.write(\"dbsnp_id\\tsequence_id\\tposition\\tref\\talt\\n\")\n", + "\n", + " for dbsnp_id in dbsnp_ids:\n", + " xml_path = os.path.join(input_folder, f\"{dbsnp_id}.xml\")\n", + "\n", + " if not os.path.exists(xml_path):\n", + " print(f\"⚠️ Missing: {xml_path}\")\n", + " continue\n", + "\n", + " try:\n", + " tree = ET.parse(xml_path)\n", + " root = tree.getroot()\n", + "\n", + " for spdi in root.iter(\"SPDI\"):\n", + " if spdi.text:\n", + " spdi_items = spdi.text.strip().split(\",\")\n", + " for item in spdi_items:\n", + " parts = item.strip().split(\":\")\n", + " if len(parts) == 4:\n", + " seq_id, pos, ref, alt = 
parts\n", + " out.write(f\"{dbsnp_id}\\t{seq_id}\\t{pos}\\t{ref}\\t{alt}\\n\")\n", + " else:\n", + " print(f\"⚠️ Invalid SPDI format in {dbsnp_id}: {item}\")\n", + "\n", + " except ET.ParseError as e:\n", + " print(f\"❌ Parse error in {dbsnp_id}.xml: {e}\")\n", + " except Exception as e:\n", + " print(f\"❌ Unexpected error in {dbsnp_id}.xml: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2cbf7146-7d94-4905-adc0-f7d1e6443074", + "metadata": {}, + "source": [ + "Removed the duplicate lines and am left with 1408 mutations" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7727dcc2-ed83-4f8d-808e-d8e24643a53a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1409 dbSNP_output.tsv\n" + ] + } + ], + "source": [ + "!wc -l dbSNP_output.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9687bc9-97e0-41e9-82a1-5cfb986ae13b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c4bd83bb-deea-41c0-87dc-d0982b0cc00b", + "metadata": {}, + "source": [ + "### COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "65fb0b92-2e7b-4080-935a-e74b58bf0329", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "645e17ee-fe7c-4b48-bece-3c17de3dbd9c", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "d04ca4ce-29ab-446d-8df9-95984f3c403f", + "metadata": {}, + "outputs": [], + "source": [ + "grep COSM parsed_variants.tsv | cut -f 3 > COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "12380e56-6f2a-4f8d-9bcf-f97275e6e39b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 202 COSM/COSM_ids.txt\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_ids.txt" + ] + }, + { + 
"cell_type": "code", + "execution_count": 34, + "id": "3c63290a-2d6d-4cd7-949d-ae3fe77f0136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1677139\n", + "1989836\n", + "12523\n", + "13800\n", + "12475\n", + "12504\n", + "12506\n", + "13281\n", + "12512\n", + "12476\n" + ] + } + ], + "source": [ + "head COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "29147778-754b-4ab6-a4ef-49cd5f314503", + "metadata": {}, + "outputs": [], + "source": [ + "while read id; do\n", + "curl --silent \"https://rest.ensembl.org/variation/human/\"$id\"?content-type=application/json\" > COSM/\"$id\".txt\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "df9c5163-8841-4279-940a-86d74b69f6a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 203\n" + ] + } + ], + "source": [ + "ls COSM/* | wc -l" + ] + }, + { + "cell_type": "markdown", + "id": "bcc33132-cb25-49a7-8615-d2dc32278e4d", + "metadata": {}, + "source": [ + "download the COSM database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/completetargetedscreensmutanttsv" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "c95fce22-ab4d-44d7-9321-d10bf1dfb368", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSM/COSM_ids.txt > COSM/COSM_ids_tab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "65045648-4b11-4264-a11a-3b47d58e0bc6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "grep -F -f COSM/COSM_ids_tab.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh38.tsv > COSM_matched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "d2c438e0-9110-4f04-b155-71bf04e399f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
" 372391 COSM_matched.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM_matched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "b7a458ce-3980-4ba8-a74f-e16f4534a6cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 160 COSM_matched_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM_matched.tsv > COSM_matched_id.txt\n", + "sort -u COSM_matched_id.txt > COSM_matched_id_unique.txt\n", + "wc -l COSM_matched_id_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "22e104b8-587c-4b11-bb40-ef05a4fd1899", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13786\n", + "COSM13675\n", + "COSM13224\n", + "COSM13723\n", + "COSM13474\n", + "COSM12505\n", + "COSM785\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM878\n", + "COSM965\n", + "COSM4766182\n" + ] + } + ], + "source": [ + "while read -r p; do\n", + " if ! 
grep -q $p COSM_matched_id_unique.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "22d892ef-d82b-49e7-a870-f909c3b4bce6", + "metadata": {}, + "outputs": [], + "source": [ + "echo 'COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13786\n", + "COSM13675\n", + "COSM13224\n", + "COSM13723\n", + "COSM13474\n", + "COSM12505\n", + "COSM785\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM878\n", + "COSM965\n", + "COSM4766182' > COSM_unmatched_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "de79b3a9-2aae-46c1-b5e9-3d5e6c6ea7ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 19 COSM_unmatched_id.txt\n" + ] + } + ], + "source": [ + "wc -l COSM_unmatched_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "46f5194d-d9db-4896-8d9b-5145e77b95ac", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSM_unmatched_id.txt > COSM_unmatched_tab_id.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "c1980527-71c4-460f-8798-7b75da61dab4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "grep -F -f COSM_unmatched_tab_id.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh37.tsv > COSM_unmatched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e30fa788-2015-4eb7-bf66-5b37a50531f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 207 COSM_unmatched.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM_unmatched.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "e942fe15-a77f-49a1-be18-7f11f8b97cfd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 
5 COSM_unmatched_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM_unmatched.tsv > COSM_unmatched_id_parsed.txt\n", + "sort -u COSM_unmatched_id_parsed.txt > COSM_unmatched_id_unique.txt\n", + "wc -l COSM_unmatched_id_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "ee9c11d0-0ea2-4c3d-b25b-12cffff1d877", + "metadata": {}, + "source": [ + "**Removing the COSM unmatched IDs from the text file**" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "5986b990-3b24-40ac-a8c9-4f2eca0c5203", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "c80b60e7-34e2-46bb-b6a4-7ad4732f7737", + "metadata": {}, + "outputs": [], + "source": [ + "cat COSM/COSM_matched.tsv >> COSM/COSM_total_parsed.tsv\n", + "cat COSM/COSM_unmatched.tsv >> COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "6daae3a8-092c-4652-ac4c-d78ed3c0bdea", + "metadata": {}, + "outputs": [], + "source": [ + "cp COSM/COSM_ids.txt COSM/COSM_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "9f91b602-8beb-4556-9605-893d92117faa", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSM12475\n", + "COSM12506\n", + "COSM12512\n", + "COSM13766\n", + "COSM13675\n", + "COSM13224\n", + "COSM13474\n", + "COSM238553\n", + "COSM5564006\n", + "COSM5015793\n", + "COSM1673476\n", + "COSM6196669\n", + "COSM5159\n", + "COSM5313\n", + "COSM5154\n", + "COSM5105\n", + "COSM5204\n", + "COSM5141\n", + "COSM5283\n", + "COSM5079\n", + "COSM5046\n", + "COSM86063\n", + "COSM5142\n", + "COSM5322\n", + "COSM23625\n", + "COSM3736941\n", + "COSM5052\n", + "COSM1167954\n", + "COSM5143\n", + "COSM5119\n", + "COSM5148\n", + "COSM861\n", + "COSM878\n", + "COSM859\n", + "COSM860\n", + "COSM862\n", + "COSM864\n", + "COSM965\n", + "COSM1237919\n", + "COSM13152\n", 
+ "COSM33076\n", + "COSM17983\n", + "COSM25676\n", + "COSM17855\n", + "COSM142849\n", + "COSM4387483\n", + "COSM4766182\n" + ] + } + ], + "source": [ + "while read -r p; do\n", + " if ! grep -q $p COSM/COSM_total_parsed.tsv; then\n", + " echo $p\n", + " sed -i '' '/'$p'/d' COSM/COSM_ids_final.txt\n", + " fi\n", + "done < COSM/COSM_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "98b263d1-0b58-4389-b043-accaf7b300db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 132 COSM/COSM_ids_final.txt\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "d92b5492-3048-46e1-a32a-6a7c0137aa19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 166 COSM/COSM_total_parsed_id_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 8 COSM/COSM_total_parsed.tsv > COSM/COSM_total_parsed_id.txt\n", + "sort -u COSM/COSM_total_parsed_id.txt > COSM/COSM_total_parsed_id_unique.txt\n", + "wc -l COSM/COSM_total_parsed_id_unique.txt\n", + "\n", + "rm COSM/COSM_total_parsed_id.txt\n", + "rm COSM/COSM_total_parsed_id_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "6e6a0d3b-bc90-4164-8363-98f0eded180e", + "metadata": {}, + "source": [ + "### Parsing the Matched TSV File" + ] + }, + { + "cell_type": "markdown", + "id": "8febdfca-ae58-4874-a2ef-6446deb91273", + "metadata": {}, + "source": [ + "Got it into excel and deleting columns that don't matter" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "e6a403e1-75be-48b6-b4f0-4d454cba047e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1140 COSM/COSM_total_parsed.tsv\n" + ] + } + ], + "source": [ + "wc -l COSM/COSM_total_parsed.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "677cab77-1631-4035-9601-4c59d942a0c0", + "metadata": 
{}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8123bf3-f57c-4a1f-aa7a-25dea97371cb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6bca1dde-ae4d-461f-b340-32807c35f8b3", + "metadata": {}, + "source": [ + "### COSF" + ] + }, + { + "cell_type": "markdown", + "id": "b5ec170c-a7f4-4216-b6ef-f7836232481f", + "metadata": {}, + "source": [ + "download the COSF database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "0b12424c-c36c-4c52-b395-e93edec5d983", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "grep COSF parsed_variants.tsv | cut -f 3 > COSF/cosf_ids_temp.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "7b480d0f-3d6d-4ff5-98bb-1a9f4e570c4f", + "metadata": {}, + "outputs": [], + "source": [ + "sort -u COSF/cosf_ids_temp.txt > COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "09e3e382-150b-40da-93ab-71d6aff06fdf", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "c0b89452-303c-4ec3-8f77-88d9576d8173", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + "echo COSF$p >> COSF/cosf_ids.txt\n", + "\n", + "done < COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "7778fc82-53fe-4ebd-a234-47434cc6bb3f", + "metadata": {}, + "outputs": [], + "source": [ + "rm COSF/cosf_ids_temp.txt\n", + "rm COSF/cosf_ids_temp_uniq.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "e8241f09-30a9-43e8-a4c3-0d1a340aad2f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSF121\n", + "COSF1216\n", + "COSF1220\n", + "COSF1224\n", + "COSF1231\n", + "COSF125\n", + 
"COSF1271\n", + "COSF128\n", + "COSF1319\n", + "COSF1320\n" + ] + } + ], + "source": [ + "head COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "ab74bdbe-65f6-4596-a425-54868ed6859c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 65 COSF/cosf_ids.txt\n" + ] + } + ], + "source": [ + "wc -l COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0fc6fc-41d2-4a64-b214-91d3359f0db0", + "metadata": {}, + "outputs": [], + "source": [ + "cat Cosmic_Fusion_v101_GRCh38.tsv >> COSF/Cosmic_Fusion.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "01d86304-9e9a-4e83-a5cc-e8f221c8c36c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 18M\tCOSF/Cosmic_Fusion.tsv\n" + ] + } + ], + "source": [ + "du -h COSF/Cosmic_Fusion.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "e848ea6f-cec3-482d-8ff4-923dbbf6ce3b", + "metadata": {}, + "outputs": [], + "source": [ + "sed 's/$/\\t/' COSF/cosf_ids.txt > COSF/cosf_ids_tab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "d2d8a7b5-6c29-459c-8ff4-94abed7250e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Extracted COSF entries saved to: COSF/kegg_data_cosf.tsv\n" + ] + } + ], + "source": [ + "#!/bin/bash\n", + "\n", + "# Paths (edit these as needed)\n", + "COSF_ID_FILE=\"COSF/cosf_ids_tab.txt\"\n", + "COSMIC_TSV=\"COSF/Cosmic_Fusion.tsv\"\n", + "OUTPUT_TSV=\"COSF/kegg_data_cosf.tsv\"\n", + "\n", + "# Header based on README\n", + 
"HEADER=\"COSMIC_SAMPLE_ID\\tSAMPLE_NAME\\tCOSMIC_PHENOTYPE_ID\\tCOSMIC_FUSION_ID\\tFUSION_SYNTAX\\tFIVE_PRIME_CHROMOSOME\\tFIVE_PRIME_STRAND\\tFIVE_PRIME_TRANSCRIPT_ID\\tFIVE_PRIME_GENE_SYMBOL\\tFIVE_PRIME_LAST_OBSERVE_EXON\\tFIVE_PRIME_GENOME_START_FROM\\tFIVE_PRIME_GENOME_START_TO\\tFIVE_PRIME_GENOME_STOP_FROM\\tFIVE_PRIME_GENOME_STOP_TO\\tTHREE_PRIME_CHROMOSOME\\tTHREE_PRIME_STRAND\\tTHREE_PRIME_TRANSCRIPT_ID\\tTHREE_PRIME_GENE_SYMBOL\\tTHREE_PRIME_FIRST_OBSERVE_EXON\\tTHREE_PRIME_GENOME_START_FROM\\tTHREE_PRIME_GENOME_START_TO\\tTHREE_PRIME_GENOME_STOP_FROM\\tTHREE_PRIME_GENOME_STOP_TO\\tFUSION_TYPE\\tPUBMED_PMID\"\n", + "\n", + "# Write header to output\n", + "echo -e \"$HEADER\" > \"$OUTPUT_TSV\"\n", + "\n", + "grep -F -f $COSF_ID_FILE $COSMIC_TSV >> $OUTPUT_TSV\n", + "\n", + "echo \"✅ Extracted COSF entries saved to: $OUTPUT_TSV\"" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "a1175ebc-4bcb-499d-bd07-8b0b77df9954", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 29 COSF/kegg_data_cosf_parsed_uniq.txt\n" + ] + } + ], + "source": [ + "cut -f 4 COSF/kegg_data_cosf.tsv > COSF/kegg_data_cosf_parsed.txt\n", + "sort -u COSF/kegg_data_cosf_parsed.txt > COSF/kegg_data_cosf_parsed_uniq.txt\n", + "wc -l COSF/kegg_data_cosf_parsed_uniq.txt\n", + "\n", + "rm COSF/kegg_data_cosf_parsed.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "60e6f9d9-b4d9-4a90-9bfd-8523a26f2d85", + "metadata": {}, + "outputs": [], + "source": [ + "cp COSF/cosf_ids.txt COSF/cosf_ids_final.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "0673d326-99a6-4957-82e4-2151b4a5f2aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COSF1220\n", + "COSF1224\n", + "COSF125\n", + "COSF128\n", + "COSF1330\n", + "COSF1490\n", + "COSF154\n", + "COSF155\n", + "COSF166\n", + "COSF168\n", + "COSF1756\n", + "COSF1758\n", + "COSF1805\n", 
+ "COSF187\n", + "COSF189\n", + "COSF1949\n", + "COSF1960\n", + "COSF2067\n", + "COSF2124\n", + "COSF218\n", + "COSF220\n", + "COSF2246\n", + "COSF2248\n", + "COSF248\n", + "COSF300\n", + "COSF302\n", + "COSF355\n", + "COSF356\n", + "COSF394\n", + "COSF396\n", + "COSF463\n", + "COSF501\n", + "COSF504\n", + "COSF528\n", + "COSF806\n", + "COSF808\n" + ] + } + ], + "source": [ + "while read -r p; do\n", + " if ! grep -q $p COSF/kegg_data_cosf_parsed_uniq.txt; then\n", + " echo $p\n", + " sed -i '' '/'$p'/d' COSF/cosf_ids_final.txt\n", + " fi\n", + "done < COSF/cosf_ids.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "e1b77653-09e4-40e6-8ac3-2cf3be1442cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 29 COSF/cosf_ids_final.txt\n" + ] + } + ], + "source": [ + "wc -l COSF/cosf_ids_final.txt" + ] + }, + { + "cell_type": "markdown", + "id": "670cec3f-237f-4789-a725-7d4d5a366815", + "metadata": {}, + "source": [ + "I was looking at the data and they don't give any proper ways to get the exact nt sequence, so I am leaving this out." + ] + }, + { + "cell_type": "markdown", + "id": "e18961b9-735c-47c2-bd68-0b6184c05375", + "metadata": {}, + "source": [ + "# Matching Variant and Nt sequence to each Network/Pathway" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be8b5cfb-3309-4abf-a7ae-250efae122a0", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "429d5f3b-b9e2-4ac4-9992-e6755a578bf6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c8cee289-0c1c-4091-81d8-37121fcd8644", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceID
010133v1OmimVar10133
11019v1ClinVar268075
21019v1ClinVar150740
31019v1dbVarnsv917029
41019v2ClinVar16928
............
7839817v1COSM6196638
784999v2COSM4766182
785999v2COSM1379150
786999v2COSM4766211
787999v2COSM4766271
\n", + "

788 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID\n", + "0 10133v1 OmimVar 10133\n", + "1 1019v1 ClinVar 268075\n", + "2 1019v1 ClinVar 150740\n", + "3 1019v1 dbVar nsv917029\n", + "4 1019v2 ClinVar 16928\n", + ".. ... ... ...\n", + "783 9817v1 COSM 6196638\n", + "784 999v2 COSM 4766182\n", + "785 999v2 COSM 1379150\n", + "786 999v2 COSM 4766211\n", + "787 999v2 COSM 4766271\n", + "\n", + "[788 rows x 3 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parsed_variants = pd.read_csv(\"parsed_variants.tsv\", sep='\\t')\n", + "parsed_variants" + ] + }, + { + "cell_type": "markdown", + "id": "f0329361-f9df-471f-8ff0-a7265ada0ad2", + "metadata": {}, + "source": [ + "### ClinVar" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7003f600-8b18-4f50-b041-3dd71af940e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDseq_idpositionrefalt
016928NC_000012.1257751647GA
116929NC_000012.1257751646CT
2183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3183393NC_000012.1212718044CT
4183395NC_000012.1212718210CTCTCT
..................
1804886NC_000011.1067483197CT
1814892NC_000011.1067490803CA
182161992NC_000015.1050490442TC
183161993NC_000015.1050490443CG
184161995NC_000015.1050490449CG
\n", + "

185 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " ID seq_id position ref \\\n", + "0 16928 NC_000012.12 57751647 G \n", + "1 16929 NC_000012.12 57751646 C \n", + "2 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 183393 NC_000012.12 12718044 C \n", + "4 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... \n", + "180 4886 NC_000011.10 67483197 C \n", + "181 4892 NC_000011.10 67490803 C \n", + "182 161992 NC_000015.10 50490442 T \n", + "183 161993 NC_000015.10 50490443 C \n", + "184 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 5 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_data = pd.read_csv(\"ClinVar_parsed_output.tsv\",sep='\\t')\n", + "clinvar_data = clinvar_data.rename(columns={\"ClinVar_ID\": \"ID\"})\n", + "clinvar_data['ID'] = clinvar_data['ID'].astype('string')\n", + "clinvar_data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f0319623-fcb7-4d14-a4ec-cbb9f117af50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing ClinVar variant is 49\n" + ] + } + ], + "source": [ + "# Ensure ClinVar_ID is treated as string to avoid dtype mismatch\n", + "clinvar_ids = clinvar_data[\"ID\"].astype(str).unique()\n", + "\n", + "missing_num = 0\n", + "\n", + "# Iterate and print missing ClinVar IDs\n", + "for _, row in parsed_variants.iterrows():\n", + " if row[\"Source\"] == \"ClinVar\" and str(row[\"ID\"]) not in clinvar_ids:\n", + " missing_num+=1\n", + "print(f'Number of missing ClinVar variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "0d757a26-2a7d-41a8-9914-4bcbd8f166f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
1809049v1ClinVar4886NC_000011.1067483197CT
1819049v1ClinVar4892NC_000011.1067490803CA
1829101v1ClinVar161992NC_000015.1050490442TC
1839101v1ClinVar161993NC_000015.1050490443CG
1849101v1ClinVar161995NC_000015.1050490449CG
\n", + "

185 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 G \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 C \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 C \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... ... ... \n", + "180 9049v1 ClinVar 4886 NC_000011.10 67483197 C \n", + "181 9049v1 ClinVar 4892 NC_000011.10 67490803 C \n", + "182 9101v1 ClinVar 161992 NC_000015.10 50490442 T \n", + "183 9101v1 ClinVar 161993 NC_000015.10 50490443 C \n", + "184 9101v1 ClinVar 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 7 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_final = parsed_variants.merge(clinvar_data, on='ID')\n", + "clinvar_final" + ] + }, + { + "cell_type": "markdown", + "id": "0942ebc8-dfae-4f96-9762-2b83c01b5e29", + "metadata": {}, + "source": [ + "### dbSNP" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "33f3303c-51a4-46d2-9941-8331aee362c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDdbsnp_idseq_idpositionrefalt
0104311rs661NC_000014.973217224GA
1104311rs661NC_000014.973217224GT
2606463rs364897NC_000001.11155238214TA
3606463rs364897NC_000001.11155238214TC
4606463rs368060NC_000001.11155235216CG
.....................
1403rs672601307rs672601307NC_000015.1050490442TC
1404rs672601308rs672601308NC_000015.1050490443CG
1405rs672601308rs672601308NC_000015.1050490443CT
1406rs672601311rs672601311NC_000015.1050490449CG
1407rs672601311rs672601311NC_000015.1050490449CT
\n", + "

1408 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID dbsnp_id seq_id position ref alt\n", + "0 104311 rs661 NC_000014.9 73217224 G A\n", + "1 104311 rs661 NC_000014.9 73217224 G T\n", + "2 606463 rs364897 NC_000001.11 155238214 T A\n", + "3 606463 rs364897 NC_000001.11 155238214 T C\n", + "4 606463 rs368060 NC_000001.11 155235216 C G\n", + "... ... ... ... ... .. ..\n", + "1403 rs672601307 rs672601307 NC_000015.10 50490442 T C\n", + "1404 rs672601308 rs672601308 NC_000015.10 50490443 C G\n", + "1405 rs672601308 rs672601308 NC_000015.10 50490443 C T\n", + "1406 rs672601311 rs672601311 NC_000015.10 50490449 C G\n", + "1407 rs672601311 rs672601311 NC_000015.10 50490449 C T\n", + "\n", + "[1408 rows x 6 columns]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_data = pd.read_csv(\"dbSNP_output.tsv\",sep='\\t')\n", + "dbsnp_data = dbsnp_data.rename(columns={\"True Id\": \"ID\",\"sequence_id\":'seq_id'})\n", + "dbsnp_data['ID'] = dbsnp_data['ID'].astype('string')\n", + "dbsnp_data" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "0223dcf2-3eba-43a1-847a-709f17b069a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing dbSNP and OmimVar variant is 244\n" + ] + } + ], + "source": [ + "# Ensure ClinVar_ID is treated as string to avoid dtype mismatch\n", + "dbsnp_data_ids = dbsnp_data[\"ID\"].astype(str).unique()\n", + "\n", + "missing_num = 0\n", + "\n", + "# Iterate and print missing ClinVar IDs\n", + "for _, row in parsed_variants.iterrows():\n", + " if (row[\"Source\"] == \"dbSNP\" or row[\"Source\"] == \"OmimVar\") and str(row[\"ID\"]) not in clinvar_ids:\n", + " missing_num+=1\n", + "print(f'Number of missing dbSNP and OmimVar variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "b146e274-ef72-42d0-ba06-c859786bda89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDdbsnp_idseq_idpositionrefalt
01019v2dbSNPrs11547328rs11547328NC_000012.1257751647GA
11019v2dbSNPrs11547328rs11547328NC_000012.1257751647GC
21019v2dbSNPrs11547328rs11547328NC_000012.1257751647GT
31019v2dbSNPrs104894340rs104894340NC_000012.1257751646CA
41019v2dbSNPrs104894340rs104894340NC_000012.1257751646CG
...........................
14179101v1dbSNPrs672601311rs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311rs672601311NC_000015.1050490449CT
14199217v1OmimVar605704rs74315431NC_000020.1158418317CT
14209217v1OmimVar605704rs281875284NC_000020.1158418288CG
14219217v1OmimVar605704rs281875284NC_000020.1158418288CT
\n", + "

1422 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID dbsnp_id seq_id position ref \\\n", + "0 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "1 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "2 1019v2 dbSNP rs11547328 rs11547328 NC_000012.12 57751647 G \n", + "3 1019v2 dbSNP rs104894340 rs104894340 NC_000012.12 57751646 C \n", + "4 1019v2 dbSNP rs104894340 rs104894340 NC_000012.12 57751646 C \n", + "... ... ... ... ... ... ... .. \n", + "1417 9101v1 dbSNP rs672601311 rs672601311 NC_000015.10 50490449 C \n", + "1418 9101v1 dbSNP rs672601311 rs672601311 NC_000015.10 50490449 C \n", + "1419 9217v1 OmimVar 605704 rs74315431 NC_000020.11 58418317 C \n", + "1420 9217v1 OmimVar 605704 rs281875284 NC_000020.11 58418288 C \n", + "1421 9217v1 OmimVar 605704 rs281875284 NC_000020.11 58418288 C \n", + "\n", + " alt \n", + "0 A \n", + "1 C \n", + "2 T \n", + "3 A \n", + "4 G \n", + "... .. \n", + "1417 G \n", + "1418 T \n", + "1419 T \n", + "1420 G \n", + "1421 T \n", + "\n", + "[1422 rows x 8 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_final = parsed_variants.merge(dbsnp_data, on='ID')\n", + "dbsnp_final" + ] + }, + { + "cell_type": "markdown", + "id": "4f41d1e1-6e7e-49c7-81b8-456875e0f40a", + "metadata": {}, + "source": [ + "### COSM" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "99aa0a39-ce5d-4c4e-90a9-7b8f36672843", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GeneTranscriptIDCOSMIDNucChangeAAChangeChrStartEndStrandRefAlleleAltAlleleID
0CTNNB1ENST00000643031.1COSM5692c.134C>Ap.S45Y34122464641224646+CA5692
1CTNNB1ENST00000642248.1COSM5689c.134C>Gp.S45C34122464641224646+CG5689
2CDKN2AENST00000579755.1COSM13508c.375G>Ap.G125=92197102721971027-CT13508
3CTNNB1ENST00000396183.7COSM5681c.95A>Gp.D32G34122460741224607+AG5681
4CDKN2AENST00000530628.2COSM13807c.389G>Tp.G130V92197101321971013-CA13807
.......................................
1134CDKN2AENST00000579755.1COSM13723c.308G>Ap.G103E92197109321971093-CT13723
1135CDKN2AENST00000578845.2COSM13723c.112G>Ap.G38S92197109321971093-CT13723
1136CDKN2AENST00000579122.1COSM12505c.59C>Ap.A20E92197476821974768-GT12505
1137FLT3ENST00000380982.4COSM785c.2503G>Cp.D835H132859264228592642-CG785
1138CDKN2AENST00000579122.1COSM13723c.265G>Ap.G89S92197109321971093-CT13723
\n", + "

1139 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Gene TranscriptID COSMID NucChange AAChange Chr Start \\\n", + "0 CTNNB1 ENST00000643031.1 COSM5692 c.134C>A p.S45Y 3 41224646 \n", + "1 CTNNB1 ENST00000642248.1 COSM5689 c.134C>G p.S45C 3 41224646 \n", + "2 CDKN2A ENST00000579755.1 COSM13508 c.375G>A p.G125= 9 21971027 \n", + "3 CTNNB1 ENST00000396183.7 COSM5681 c.95A>G p.D32G 3 41224607 \n", + "4 CDKN2A ENST00000530628.2 COSM13807 c.389G>T p.G130V 9 21971013 \n", + "... ... ... ... ... ... .. ... \n", + "1134 CDKN2A ENST00000579755.1 COSM13723 c.308G>A p.G103E 9 21971093 \n", + "1135 CDKN2A ENST00000578845.2 COSM13723 c.112G>A p.G38S 9 21971093 \n", + "1136 CDKN2A ENST00000579122.1 COSM12505 c.59C>A p.A20E 9 21974768 \n", + "1137 FLT3 ENST00000380982.4 COSM785 c.2503G>C p.D835H 13 28592642 \n", + "1138 CDKN2A ENST00000579122.1 COSM13723 c.265G>A p.G89S 9 21971093 \n", + "\n", + " End Strand RefAllele AltAllele ID \n", + "0 41224646 + C A 5692 \n", + "1 41224646 + C G 5689 \n", + "2 21971027 - C T 13508 \n", + "3 41224607 + A G 5681 \n", + "4 21971013 - C A 13807 \n", + "... ... ... ... ... ... 
\n", + "1134 21971093 - C T 13723 \n", + "1135 21971093 - C T 13723 \n", + "1136 21974768 - G T 12505 \n", + "1137 28592642 - C G 785 \n", + "1138 21971093 - C T 13723 \n", + "\n", + "[1139 rows x 12 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_data = pd.read_csv(\"COSM/COSM_total_parsed.tsv\",sep='\\t')\n", + "cosm_data['ID'] = cosm_data['COSMID'].str[4:]\n", + "cosm_data" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "cc7b32a9-6742-4cfd-a075-cf61fc098cc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of missing COSM variant is 202\n" + ] + } + ], + "source": [ + "# Ensure ClinVar_ID is treated as string to avoid dtype mismatch\n", + "cosm_data_ids = cosm_data[\"ID\"].astype(str).unique()\n", + "\n", + "missing_num = 0\n", + "\n", + "# Iterate and print missing ClinVar IDs\n", + "for _, row in parsed_variants.iterrows():\n", + " if row[\"Source\"] == \"COSM\" and str(row[\"ID\"]) not in clinvar_ids:\n", + " missing_num+=1\n", + "print(f'Number of missing COSM variant is {missing_num}')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "322fb3ef-866a-4f5b-8a2d-3ee5d33e6276", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDGeneTranscriptIDCOSMIDNucChangeAAChangeChrStartEndStrandRefAlleleAltAllele
01019v2COSM1677139CDK4ENST00000312990.10COSM1677139c.70C>Tp.R24C125775164857751648-GA
11019v2COSM1677139CDK4ENST00000549606.5COSM1677139c.-158+527C>Tp.?125775164857751648-GA
21019v2COSM1677139CDK4ENST00000257904.10COSM1677139c.70C>Tp.R24C125775164857751648-GA
31019v2COSM1989836CDK4ENST00000312990.10COSM1989836c.71G>Ap.R24H125775164757751647-CT
41019v2COSM1989836CDK4ENST00000549606.5COSM1989836c.-158+528G>Ap.?125775164757751647-CT
.............................................
1134999v2COSM4766271CDH1ENST00000612417.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1135999v2COSM4766271CDH1ENST00000611625.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1136999v2COSM4766271CDH1ENST00000422392.6COSM4766271c.662A>Gp.D221G166880882368808823+AG
1137999v2COSM4766271CDH1ENST00000621016.4COSM4766271c.662A>Gp.D221G166880882368808823+AG
1138999v2COSM4766271CDH1ENST00000261769.9COSM4766271c.662A>Gp.D221G166880882368808823+AG
\n", + "

1139 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID Gene TranscriptID COSMID \\\n", + "0 1019v2 COSM 1677139 CDK4 ENST00000312990.10 COSM1677139 \n", + "1 1019v2 COSM 1677139 CDK4 ENST00000549606.5 COSM1677139 \n", + "2 1019v2 COSM 1677139 CDK4 ENST00000257904.10 COSM1677139 \n", + "3 1019v2 COSM 1989836 CDK4 ENST00000312990.10 COSM1989836 \n", + "4 1019v2 COSM 1989836 CDK4 ENST00000549606.5 COSM1989836 \n", + "... ... ... ... ... ... ... \n", + "1134 999v2 COSM 4766271 CDH1 ENST00000612417.4 COSM4766271 \n", + "1135 999v2 COSM 4766271 CDH1 ENST00000611625.4 COSM4766271 \n", + "1136 999v2 COSM 4766271 CDH1 ENST00000422392.6 COSM4766271 \n", + "1137 999v2 COSM 4766271 CDH1 ENST00000621016.4 COSM4766271 \n", + "1138 999v2 COSM 4766271 CDH1 ENST00000261769.9 COSM4766271 \n", + "\n", + " NucChange AAChange Chr Start End Strand RefAllele \\\n", + "0 c.70C>T p.R24C 12 57751648 57751648 - G \n", + "1 c.-158+527C>T p.? 12 57751648 57751648 - G \n", + "2 c.70C>T p.R24C 12 57751648 57751648 - G \n", + "3 c.71G>A p.R24H 12 57751647 57751647 - C \n", + "4 c.-158+528G>A p.? 12 57751647 57751647 - C \n", + "... ... ... .. ... ... ... ... \n", + "1134 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1135 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1136 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1137 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "1138 c.662A>G p.D221G 16 68808823 68808823 + A \n", + "\n", + " AltAllele \n", + "0 A \n", + "1 A \n", + "2 A \n", + "3 T \n", + "4 T \n", + "... ... 
\n", + "1134 G \n", + "1135 G \n", + "1136 G \n", + "1137 G \n", + "1138 G \n", + "\n", + "[1139 rows x 14 columns]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_final = parsed_variants.merge(cosm_data, on='ID')\n", + "cosm_final" + ] + }, + { + "cell_type": "markdown", + "id": "83db14d7-1688-4b9d-a47b-0afec2f57a10", + "metadata": {}, + "source": [ + "## Combining them together" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "468461b0-1b06-4993-9950-e5ba26b11aa0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
1809049v1ClinVar4886NC_000011.1067483197CT
1819049v1ClinVar4892NC_000011.1067490803CA
1829101v1ClinVar161992NC_000015.1050490442TC
1839101v1ClinVar161993NC_000015.1050490443CG
1849101v1ClinVar161995NC_000015.1050490449CG
\n", + "

185 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 G \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 C \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 CAGGCGGAGCACCCCAAGCC \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 C \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 CTCT \n", + ".. ... ... ... ... ... ... \n", + "180 9049v1 ClinVar 4886 NC_000011.10 67483197 C \n", + "181 9049v1 ClinVar 4892 NC_000011.10 67490803 C \n", + "182 9101v1 ClinVar 161992 NC_000015.10 50490442 T \n", + "183 9101v1 ClinVar 161993 NC_000015.10 50490443 C \n", + "184 9101v1 ClinVar 161995 NC_000015.10 50490449 C \n", + "\n", + " alt \n", + "0 A \n", + "1 T \n", + "2 CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 T \n", + "4 CT \n", + ".. ... \n", + "180 T \n", + "181 A \n", + "182 C \n", + "183 G \n", + "184 G \n", + "\n", + "[185 rows x 7 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_final" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6cda9996-9725-4993-ab9e-ca8b74ced30a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2dbSNPrs11547328NC_000012.1257751647GA
11019v2dbSNPrs11547328NC_000012.1257751647GC
21019v2dbSNPrs11547328NC_000012.1257751647GT
31019v2dbSNPrs104894340NC_000012.1257751646CA
41019v2dbSNPrs104894340NC_000012.1257751646CG
........................
14179101v1dbSNPrs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311NC_000015.1050490449CT
14199217v1OmimVar605704NC_000020.1158418317CT
14209217v1OmimVar605704NC_000020.1158418288CG
14219217v1OmimVar605704NC_000020.1158418288CT
\n", + "

1422 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position ref alt\n", + "0 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G A\n", + "1 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G C\n", + "2 1019v2 dbSNP rs11547328 NC_000012.12 57751647 G T\n", + "3 1019v2 dbSNP rs104894340 NC_000012.12 57751646 C A\n", + "4 1019v2 dbSNP rs104894340 NC_000012.12 57751646 C G\n", + "... ... ... ... ... ... .. ..\n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 50490449 C G\n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 50490449 C T\n", + "1419 9217v1 OmimVar 605704 NC_000020.11 58418317 C T\n", + "1420 9217v1 OmimVar 605704 NC_000020.11 58418288 C G\n", + "1421 9217v1 OmimVar 605704 NC_000020.11 58418288 C T\n", + "\n", + "[1422 rows x 7 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dbsnp_final = dbsnp_final.drop(columns=['dbsnp_id'])\n", + "dbsnp_final" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "fac7aacc-79c6-4cef-8d4e-7be27bca34e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDseq_idpositionrefalt
01019v2ClinVar16928NC_000012.1257751647GA
11019v2ClinVar16929NC_000012.1257751646CT
21027v3ClinVar183391NC_000012.1212717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.1212718044CT
41027v3ClinVar183395NC_000012.1212718210CTCTCT
........................
14179101v1dbSNPrs672601311NC_000015.1050490449CG
14189101v1dbSNPrs672601311NC_000015.1050490449CT
14199217v1OmimVar605704NC_000020.1158418317CT
14209217v1OmimVar605704NC_000020.1158418288CG
14219217v1OmimVar605704NC_000020.1158418288CT
\n", + "

1607 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID seq_id position \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 57751647 \n", + "1 1019v2 ClinVar 16929 NC_000012.12 57751646 \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12717896 \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12718044 \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12718210 \n", + "... ... ... ... ... ... \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 50490449 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 50490449 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 58418317 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 58418288 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 58418288 \n", + "\n", + " ref alt \n", + "0 G A \n", + "1 C T \n", + "2 CAGGCGGAGCACCCCAAGCC CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 C T \n", + "4 CTCT CT \n", + "... ... ... \n", + "1417 C G \n", + "1418 C T \n", + "1419 C T \n", + "1420 C G \n", + "1421 C T \n", + "\n", + "[1607 rows x 7 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_dbsnp = pd.concat([clinvar_final, dbsnp_final])\n", + "clinvar_dbsnp" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "6956d533-6f6e-4526-9350-f4db614e14da", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp = clinvar_dbsnp.rename(columns={\"seq_id\":\"TranscriptID\",\"position\":\"Start\",\"ref\":\"RefAllele\",\"alt\":\"AltAllele\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "343b56f9-f078-47e6-9c88-5ceff0a8b537", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp[\"End\"] = clinvar_dbsnp[\"Start\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "8717fee6-a548-487b-87e9-f1fef6d2429a", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp['Chr'] = clinvar_dbsnp['TranscriptID'].str[7:9].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": 
"238baf78-d5f4-4c2b-80b4-ce2b4df67cde", + "metadata": {}, + "outputs": [], + "source": [ + "clinvar_dbsnp = clinvar_dbsnp[['ENTRY', 'Source', 'ID', 'TranscriptID','Chr', 'Start', 'End','RefAllele','AltAllele']]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "a1482c60-3453-4f16-89bd-9fc73ba6b622", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDChrStartEndRefAlleleAltAllele
01019v2ClinVar16928NC_000012.12125775164757751647GA
11019v2ClinVar16929NC_000012.12125775164657751646CT
21027v3ClinVar183391NC_000012.12121271789612717896CAGGCGGAGCACCCCAAGCCCAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
31027v3ClinVar183393NC_000012.12121271804412718044CT
41027v3ClinVar183395NC_000012.12121271821012718210CTCTCT
..............................
14179101v1dbSNPrs672601311NC_000015.10155049044950490449CG
14189101v1dbSNPrs672601311NC_000015.10155049044950490449CT
14199217v1OmimVar605704NC_000020.11205841831758418317CT
14209217v1OmimVar605704NC_000020.11205841828858418288CG
14219217v1OmimVar605704NC_000020.11205841828858418288CT
\n", + "

1607 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID Chr Start End \\\n", + "0 1019v2 ClinVar 16928 NC_000012.12 12 57751647 57751647 \n", + "1 1019v2 ClinVar 16929 NC_000012.12 12 57751646 57751646 \n", + "2 1027v3 ClinVar 183391 NC_000012.12 12 12717896 12717896 \n", + "3 1027v3 ClinVar 183393 NC_000012.12 12 12718044 12718044 \n", + "4 1027v3 ClinVar 183395 NC_000012.12 12 12718210 12718210 \n", + "... ... ... ... ... ... ... ... \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 15 50490449 50490449 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 15 50490449 50490449 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 20 58418317 58418317 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 20 58418288 58418288 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 20 58418288 58418288 \n", + "\n", + " RefAllele AltAllele \n", + "0 G A \n", + "1 C T \n", + "2 CAGGCGGAGCACCCCAAGCC CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC \n", + "3 C T \n", + "4 CTCT CT \n", + "... ... ... \n", + "1417 C G \n", + "1418 C T \n", + "1419 C T \n", + "1420 C G \n", + "1421 C T \n", + "\n", + "[1607 rows x 9 columns]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clinvar_dbsnp" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "8207bc04-9787-4621-818e-7e9cc17770ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2COSM1677139ENST00000312990.10c.70C>T125775164857751648GA
11019v2COSM1677139ENST00000549606.5c.-158+527C>T125775164857751648GA
21019v2COSM1677139ENST00000257904.10c.70C>T125775164857751648GA
31019v2COSM1989836ENST00000312990.10c.71G>A125775164757751647CT
41019v2COSM1989836ENST00000549606.5c.-158+528G>A125775164757751647CT
.................................
1134999v2COSM4766271ENST00000612417.4c.662A>G166880882368808823AG
1135999v2COSM4766271ENST00000611625.4c.662A>G166880882368808823AG
1136999v2COSM4766271ENST00000422392.6c.662A>G166880882368808823AG
1137999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
1138999v2COSM4766271ENST00000261769.9c.662A>G166880882368808823AG
\n", + "

1139 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr Start \\\n", + "0 1019v2 COSM 1677139 ENST00000312990.10 c.70C>T 12 57751648 \n", + "1 1019v2 COSM 1677139 ENST00000549606.5 c.-158+527C>T 12 57751648 \n", + "2 1019v2 COSM 1677139 ENST00000257904.10 c.70C>T 12 57751648 \n", + "3 1019v2 COSM 1989836 ENST00000312990.10 c.71G>A 12 57751647 \n", + "4 1019v2 COSM 1989836 ENST00000549606.5 c.-158+528G>A 12 57751647 \n", + "... ... ... ... ... ... .. ... \n", + "1134 999v2 COSM 4766271 ENST00000612417.4 c.662A>G 16 68808823 \n", + "1135 999v2 COSM 4766271 ENST00000611625.4 c.662A>G 16 68808823 \n", + "1136 999v2 COSM 4766271 ENST00000422392.6 c.662A>G 16 68808823 \n", + "1137 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 68808823 \n", + "1138 999v2 COSM 4766271 ENST00000261769.9 c.662A>G 16 68808823 \n", + "\n", + " End RefAllele AltAllele \n", + "0 57751648 G A \n", + "1 57751648 G A \n", + "2 57751648 G A \n", + "3 57751647 C T \n", + "4 57751647 C T \n", + "... ... ... ... \n", + "1134 68808823 A G \n", + "1135 68808823 A G \n", + "1136 68808823 A G \n", + "1137 68808823 A G \n", + "1138 68808823 A G \n", + "\n", + "[1139 rows x 10 columns]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosm_final = cosm_final.drop(columns={\"Gene\",\"COSMID\",\"AAChange\",\"Strand\"})\n", + "cosm_final" + ] + }, + { + "cell_type": "markdown", + "id": "e9c0fd4d-14e1-4018-b848-678717d265f0", + "metadata": {}, + "source": [ + "**Final Concatenation**" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "ced3d93a-e5a8-4283-8925-8257540a5e99", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = pd.concat([cosm_final,clinvar_dbsnp])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "8e2d6e3e-38ca-4aef-bc32-7f7ea3f45126", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2COSM1677139ENST00000312990.10c.70C>T125775164857751648GA
11019v2COSM1677139ENST00000549606.5c.-158+527C>T125775164857751648GA
21019v2COSM1677139ENST00000257904.10c.70C>T125775164857751648GA
31019v2COSM1989836ENST00000312990.10c.71G>A125775164757751647CT
41019v2COSM1989836ENST00000549606.5c.-158+528G>A125775164757751647CT
.................................
14179101v1dbSNPrs672601311NC_000015.10NaN155049044950490449CG
14189101v1dbSNPrs672601311NC_000015.10NaN155049044950490449CT
14199217v1OmimVar605704NC_000020.11NaN205841831758418317CT
14209217v1OmimVar605704NC_000020.11NaN205841828858418288CG
14219217v1OmimVar605704NC_000020.11NaN205841828858418288CT
\n", + "

2746 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr \\\n", + "0 1019v2 COSM 1677139 ENST00000312990.10 c.70C>T 12 \n", + "1 1019v2 COSM 1677139 ENST00000549606.5 c.-158+527C>T 12 \n", + "2 1019v2 COSM 1677139 ENST00000257904.10 c.70C>T 12 \n", + "3 1019v2 COSM 1989836 ENST00000312990.10 c.71G>A 12 \n", + "4 1019v2 COSM 1989836 ENST00000549606.5 c.-158+528G>A 12 \n", + "... ... ... ... ... ... .. \n", + "1417 9101v1 dbSNP rs672601311 NC_000015.10 NaN 15 \n", + "1418 9101v1 dbSNP rs672601311 NC_000015.10 NaN 15 \n", + "1419 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "1420 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "1421 9217v1 OmimVar 605704 NC_000020.11 NaN 20 \n", + "\n", + " Start End RefAllele AltAllele \n", + "0 57751648 57751648 G A \n", + "1 57751648 57751648 G A \n", + "2 57751648 57751648 G A \n", + "3 57751647 57751647 C T \n", + "4 57751647 57751647 C T \n", + "... ... ... ... ... \n", + "1417 50490449 50490449 C G \n", + "1418 50490449 50490449 C T \n", + "1419 58418317 58418317 C T \n", + "1420 58418288 58418288 C G \n", + "1421 58418288 58418288 C T \n", + "\n", + "[2746 rows x 10 columns]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_data" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "3bec1451-dfd8-4597-b7a7-6b1ec5d70b13", + "metadata": {}, + "outputs": [], + "source": [ + "final_data.to_csv(\"all_variant_data.tsv\",sep='\\t',index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a4723ff8-9848-4f96-8461-2175e986a8f2", + "metadata": {}, + "source": [ + "In Excel removed duplicates based on the same Variant ID, Chromosome number, ref allele and alt allele\n", + "\n", + "After removing 1 lines from manual inspection, I am left with 761 variants and their associated variant ids" + ] + }, + { + "cell_type": "markdown", + "id": "1f3c322f-17b8-4ab9-8a8f-a7b60ea73ab0", + "metadata": {}, + "source": [ + "# Variant 
ID to Network" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "48fff44f-22ab-4c30-a24b-2bc029e72463", + "metadata": {}, + "outputs": [], + "source": [ + "gene_variant = pd.read_csv(\"gene_variants.tsv\", sep='\\t', names=['Network','ENTRY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "bba198c9-a63e-466b-871e-b0ee30f84e56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkENTRY
0N0000225v1
1N0000225v2
2N000033815v1
3N000042322v1
4N000042322v2
.........
323N017142760v1
324N018095052v1
325N018737428v3
326N018763084v1
327N018772066v1
\n", + "

328 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Network ENTRY\n", + "0 N00002 25v1\n", + "1 N00002 25v2\n", + "2 N00003 3815v1\n", + "3 N00004 2322v1\n", + "4 N00004 2322v2\n", + ".. ... ...\n", + "323 N01714 2760v1\n", + "324 N01809 5052v1\n", + "325 N01873 7428v3\n", + "326 N01876 3084v1\n", + "327 N01877 2066v1\n", + "\n", + "[328 rows x 2 columns]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_variant" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "8d2f02f0-bd56-4693-88c7-2f0124a12fa4", + "metadata": {}, + "outputs": [], + "source": [ + "all_variant_data = pd.read_csv(\"all_variant_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "707e21dc-85b0-48da-9d2e-b20c1351d035", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
01019v2ClinVar16929NC_000012.12NaN125775164657751646CT
11019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CA
21019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CG
31019v2ClinVar16928NC_000012.12NaN125775164757751647GA
41019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GC
.................................
7569817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CA
7579817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TC
758999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
759999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TG
760999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GA
\n", + "

761 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " ENTRY Source ID TranscriptID NucChange Chr Start \\\n", + "0 1019v2 ClinVar 16929 NC_000012.12 NaN 12 57751646 \n", + "1 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 57751646 \n", + "2 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 57751646 \n", + "3 1019v2 ClinVar 16928 NC_000012.12 NaN 12 57751647 \n", + "4 1019v2 dbSNP rs11547328 NC_000012.12 NaN 12 57751647 \n", + ".. ... ... ... ... ... ... ... \n", + "756 9817v1 COSM 6196635 ENST00000393623.6 c.706G>T 19 10492196 \n", + "757 9817v1 COSM 6196637 ENST00000393623.6 c.548A>G 19 10499486 \n", + "758 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 68808823 \n", + "759 999v2 COSM 4766211 ENST00000621016.4 c.755T>G 16 68810264 \n", + "760 999v2 COSM 1379150 ENST00000621016.4 c.769G>A 16 68810278 \n", + "\n", + " End RefAllele AltAllele \n", + "0 57751646 C T \n", + "1 57751646 C A \n", + "2 57751646 C G \n", + "3 57751647 G A \n", + "4 57751647 G C \n", + ".. ... ... ... \n", + "756 10492196 C A \n", + "757 10499486 T C \n", + "758 68808823 A G \n", + "759 68810264 T G \n", + "760 68810278 G A \n", + "\n", + "[761 rows x 10 columns]" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "fcc506c3-c957-4e8a-acbd-bdb0c9dc6318", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data_together_wo_nt = all_variant_data.merge(gene_variant, on=\"ENTRY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "e679f511-77da-40c1-9f5e-25162fd7f714", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data_together_wo_nt.to_csv(\"variant_data_together_wo_nt.tsv\", sep='\\t',index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1cf263d2-a41b-422c-b095-4a18184158c6", + "metadata": {}, + "source": [ + "# Parsing Unique Networks and getting Gene Pathway" + ] + }, + { + "cell_type": "code", + 
"execution_count": 1, + "id": "4586fd55-9de0-4d1c-b81f-92bdcce839ec", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d8f896ab-5859-438b-97f9-392c6f7c837b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 182 network_variant_data_unique.txt\n" + ] + } + ], + "source": [ + "cut -f 1 variant_data_together_wo_nt.tsv > network_variant_data.txt\n", + "sort -u network_variant_data.txt > network_variant_data_unique.txt\n", + "sed -i '' '/Network/d' network_variant_data_unique.txt\n", + "wc -l network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a1fa0c9-94a0-40f7-831a-557532512878", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q ENTRY network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3bd437dc-7caa-4b30-9587-84397015be0f", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q NAME network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "94f49e6f-eea7-4eb3-b495-23bc0593633d", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q DEFINITION network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "baf9f804-fc45-4560-8bbc-fe9e43cebb09", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! 
grep -q EXPANDED network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "931168b5-9a73-4dcb-ad78-fcc41a911503", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N00302\n", + "N00303\n", + "N00304\n", + "N00305\n", + "N00600\n", + "N00643\n", + "N00679\n", + "N00789\n", + "N01064\n", + "N01065\n", + "N01419\n", + "N01422\n", + "N01444\n", + "N01714\n" + ] + } + ], + "source": [ + "while read p; do\n", + " if ! grep -q PATHWAY network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dd3b8bb6-0b8d-43f9-8241-f093e6b7a063", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q CLASS network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e2d1d5ad-9662-4c7a-abe2-bf155a6e0257", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N01683\n", + "N01689\n", + "N01697\n", + "N01698\n", + "N01699\n", + "N01700\n", + "N01702\n", + "N01704\n", + "N01714\n" + ] + } + ], + "source": [ + "while read p; do\n", + " if ! grep -q DISEASE network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d8eeded0-fda2-4a90-9dcb-b4cb841d77b9", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! 
grep -q GENE network_variant/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < network_variant_data_unique.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bad6343d-6c08-4738-9902-ee17f3832b40", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' '/N01683/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01689/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01697/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01698/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01699/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01700/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01702/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01704/d' network_variant_data_unique.txt\n", + "sed -i '' '/N01714/d' network_variant_data_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "eab3e1bd-3725-4037-839d-ed06e02eff4c", + "metadata": {}, + "source": [ + "Networks without a disease tag and thus without a ground truth paragraph" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3a05997d-80bf-4f89-9630-7adf8b6b2866", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 173 network_variant_data_unique.txt\n" + ] + } + ], + "source": [ + "wc -l network_variant_data_unique.txt" + ] + }, + { + "cell_type": "markdown", + "id": "30578490-ad11-4bed-b683-80fa41f8c41e", + "metadata": {}, + "source": [ + "**Switch to python**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b58d998-6919-4961-a18e-89a5dfb96d1c", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6a0904c9-366a-48b5-9291-efc09039478f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "85bd0c1f-cc3a-4fed-94f0-e2dd7d5bb598", + "metadata": {}, + "outputs": 
[], + "source": [ + "# Define column structure\n", + "network_info = pd.DataFrame(columns=[\"Entry\", \"Name\", \"Definition\", \"Expanded\", \"Pathway\", \"Class\", \"Disease\", \"Gene\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "f568e7e9-d28c-44b5-8224-fefbd31735bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Read all variant IDs\n", + "with open('network_variant_data_unique.txt', 'r') as f:\n", + " network_var_id = [line.strip() for line in f if line.strip()]\n", + "\n", + "# Function to extract single-line values (handles leading whitespace too)\n", + "def get_single_line_value(lines, key):\n", + " for line in lines:\n", + " if line.lstrip().startswith(key):\n", + " return line.split(key, 1)[-1].strip()\n", + " return \"\"\n", + "\n", + "# Function to extract multiline values that follow a key line (indented lines)\n", + "def get_multiline_values(lines, key):\n", + " values = []\n", + " recording = False\n", + " for i, line in enumerate(lines):\n", + " if line.startswith(key):\n", + " # Capture first line's content after the key\n", + " initial_value = line[len(key):].strip()\n", + " if initial_value:\n", + " values.append(initial_value)\n", + " recording = True\n", + " continue\n", + " if recording:\n", + " if re.match(r'^\\s{2,}', line): # line starts with 2+ spaces\n", + " values.append(line.strip())\n", + " else:\n", + " break # stop when indentation breaks\n", + " return \"| \".join(values)\n", + "\n", + "# Process each network_variant file\n", + "for variant_id in network_var_id:\n", + " file_path = f'network_variant/{variant_id}.txt'\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " row = {\n", + " \"Entry\": variant_id,\n", + " \"Name\": get_single_line_value(lines, \"NAME\"),\n", + " \"Definition\": get_single_line_value(lines, \"DEFINITION\"),\n", + " \"Expanded\": get_single_line_value(lines, \"EXPANDED\"),\n", + " \"Pathway\": 
get_multiline_values(lines, \"PATHWAY\"),\n", + " \"Class\": get_multiline_values(lines, \"CLASS\"),\n", + " \"Disease\": get_multiline_values(lines, \"DISEASE\"),\n", + " \"Gene\": get_multiline_values(lines, \"GENE\")\n", + " }\n", + "\n", + " network_info = pd.concat([network_info, pd.DataFrame([row])], ignore_index=True)\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "ed1fef16-cf95-44ef-bfd7-552c631b725e", + "metadata": {}, + "outputs": [], + "source": [ + "network_info = network_info.set_index('Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "a2188680-0f9d-4e7f-89f1-9dc5aa95094f", + "metadata": {}, + "outputs": [], + "source": [ + "no_pathway = [\"N00302\",\"N00303\",\"N00304\",\"N00305\",\"N00600\",\"N00643\",\"N00679\",\"N00789\",\"N01064\",\"N01065\",\"N01419\",\"N01422\",\"N01444\"]\n", + "for id in no_pathway:\n", + " network_info.at[id, 'Pathway'] = pd.NA" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "194ccc3a-54c5-483b-90d7-b5bcda4bdfe4", + "metadata": {}, + "outputs": [], + "source": [ + "network_info = network_info.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "e3d1de46-c7c8-4615-b9ff-2d4d08e979f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Columns to process\n", + "cols_to_clean = [\"Pathway\", \"Class\", \"Disease\",\"Gene\"]\n", + "\n", + "def extract_data(cell):\n", + " if pd.isna(cell):\n", + " return cell # Leave NaN as is\n", + " gene_dict = {}\n", + " for part in cell.split(\"|\"):\n", + " tokens = part.strip().split()\n", + " if len(tokens) >= 2:\n", + " gene_dict[tokens[0]] = ' '.join(tokens[1:])\n", + " elif len(tokens) == 1:\n", + " gene_dict[tokens[0]] = \"\"\n", + " return gene_dict\n", + "\n", + "# Apply the transformation to each column\n", + "for col in cols_to_clean:\n", + " network_info[col] = 
network_info[col].apply(extract_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "72a6992e-def7-4ada-abc6-080c31cec3fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryNameDefinitionExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'Chronic myeloid leukemia'}{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia'}{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia'}{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma', 'H00021...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'Non-small cell lung cancer'}{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1<NA>{'nt06027': 'Purine salvage pathway'}{'H00194': 'Lesch-Nyhan syndrome'}{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)<NA>{'nt06505': 'WNT signaling'}{'H00485': 'Robinow syndrome'}{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'Methylmalonic aciduria and homocys...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'Renal cell carcinoma', 'H00559': '...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'Amyotrophic lateral sclerosis (ALS)'}{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Entry Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 \n", + "169 \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'Chronic myeloid leukemia'} \n", + "1 {'H00003': 'Acute myeloid leukemia'} \n", + "2 {'H00003': 'Acute myeloid leukemia'} \n", + "3 {'H00048': 'Hepatocellular carcinoma', 'H00021... \n", + "4 {'H00014': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 {'H00194': 'Lesch-Nyhan syndrome'} \n", + "169 {'H00485': 'Robinow syndrome'} \n", + "170 {'H02221': 'Methylmalonic aciduria and homocys... \n", + "171 {'H00021': 'Renal cell carcinoma', 'H00559': '... \n", + "172 {'H00058': 'Amyotrophic lateral sclerosis (ALS)'} \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... \n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... 
\n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "4844c8c8-7efc-4b21-ba7a-bd6eef0a7cf3", + "metadata": {}, + "outputs": [], + "source": [ + "network_info.to_csv(\"network_variant_final_info.tsv\",sep='\\t', header=True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "c432ed92-f45d-4893-8666-a71fa6256076", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['H00003', 'H00004', 'H00013', 'H00014', 'H00018', 'H00019', 'H00020', 'H00021', 'H00022', 'H00024', 'H00026', 'H00031', 'H00032', 'H00033', 'H00034', 'H00038', 'H00039', 'H00042', 'H00048', 'H00056', 'H00057', 'H00058', 'H00059', 'H00061', 'H00063', 'H00126', 'H00135', 'H00194', 'H00195', 'H00246', 'H00247', 'H00251', 'H00260', 'H00423', 'H00485', 'H00559', 'H01032', 'H01102', 'H01398', 'H01431', 'H01522', 'H01603', 'H02049', 'H02221']\n" + ] + } + ], + "source": [ + "all_disease_keys = []\n", + "\n", + "for disease in network_info['Disease']:\n", + " if isinstance(disease, dict):\n", + " all_disease_keys.extend(disease.keys())\n", + "\n", + "unique_disease_keys = sorted(set(all_disease_keys))\n", + "print(unique_disease_keys)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "9b0a7a42-fef6-4300-9272-3973be631880", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "disease_dict = {}\n", + "\n", + "for disease in 
unique_disease_keys:\n", + " try:\n", + " # Run the shell command and capture output\n", + " result = subprocess.run(\n", + " f\"kegg_pull rest get {disease} | grep DESCRIPTION\",\n", + " shell=True,\n", + " capture_output=True,\n", + " text=True\n", + " )\n", + " # Save the stdout (if grep found something)\n", + " if result.stdout:\n", + " disease_dict[disease] = result.stdout.strip()\n", + " else:\n", + " disease_dict[disease] = None # or \"DESCRIPTION not found\"\n", + " except Exception as e:\n", + " disease_dict[disease] = f\"Error: {str(e)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "7ce66864-aa4d-47f3-9843-b2a96d2e188b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'H00003': 'DESCRIPTION Acute myeloid leukemia (AML) is a disease that is characterized by uncontrolled proliferation of clonal neoplastic cells and accumulation in the bone marrow of blasts with an impaired differentiation program. AML accounts for approximately 80% of all adult leukemias and remains the most common cause of leukemia death. Two major types of genetic events have been described that are crucial for leukemic transformation. A proposed necessary first event is disordered cell growth and upregulation of cell survival genes. The most common of these activating events were observed in the RTK Flt3, in N-Ras and K-Ras, in Kit, and sporadically in other RTKs. Alterations in myeloid transcription factors governing hematopoietic differentiation provide second necessary event for leukemogenesis. Transcription factor fusion proteins such as PML-RARalpha (in Acute promyelocytic leukemia, a subtype of AML), AML-ETO or PLZF-RARalpha block myeloid cell differentiation by repressing target genes. In other cases, the transcription factors themselves are mutated.',\n", + " 'H00004': 'DESCRIPTION Chronic myeloid leukemia (CML) is a clonal myeloproliferative disorder of a pluripotent stem cell. 
The natural history of CML has a triphasic clinical course comprising of an initial chronic phase (CP), which is characterized by expansion of functionally normal myeloid cells, followed by an accelerated phase (AP) and finally a more aggressive blast phase (BP), with loss of terminal differentiation capacity. On the cellular level, CML is associated with a specific chromosome abnormality, the t(9; 22) reciprocal translocation that forms the Philadelphia (Ph) chromosome. The Ph chromosome is the result of a molecular rearrangement between the c-ABL proto-oncogene on chromosome 9 and the BCR (breakpoint cluster region) gene on chromosome 22. The BCR/ABL fusion gene encodes p210 BCR/ABL, an oncoprotein, which, unlike the normal p145 c-Abl, has constitutive tyrosine kinase activity and is predominantly localized in the cytoplasm. While fusion of c-ABL and BCR is believed to be the primary cause of the chronic phase of CML, progression to blast crisis requires other molecular changes. Common secondary abnormalities include mutations in TP53, RB, and p16/INK4A, or overexpression of genes such as EVI1. Additional chromosome translocations are also observed,such as t(3;21)(q26;q22), which generates AML1-EVI1.',\n", + " 'H00013': 'DESCRIPTION Lung cancer is a leading cause of cancer death among men and women in industrialized countries. Small cell lung carcinoma (SCLC) is a highly aggressive neoplasm, which accounts for approximately 25% of all lung cancer cases. Molecular mechanisms altered in SCLC include induced expression of oncogene, MYC, and loss of tumorsuppressor genes, such as p53, PTEN, RB, and FHIT. The overexpression of MYC proteins in SCLC is largely a result of gene amplification. Such overexpression leads to more rapid proliferation and loss of terminal differentiation. Mutation or deletion of p53 or PTEN can lead to more rapid proliferation and reduced apoptosis. 
The retinoblastoma gene RB1 encodes a nuclear phosphoprotein that helps to regulate cell-cycle progression. The fragile histidine triad gene FHIT encodes the enzyme diadenosine triphosphate hydrolase, which is thought to have an indirect role in proapoptosis and cell-cycle control.',\n", + " 'H00014': 'DESCRIPTION Lung cancer is a leading cause of cancer death among men and women in industrialized countries. Non-small-cell lung cancer (NSCLC) accounts for approximately 85% of lung cancer and represents a heterogeneous group of cancers, consisting mainly of squamous cell (SCC), adeno (AC) and large-cell carcinoma. Molecular mechanisms altered in NSCLC include activation of oncogenes, such as K-RAS, EGFR and EML4-ALK, and inactivation of tumorsuppressor genes, such as p53, p16INK4a, RAR-beta, and RASSF1. Point mutations within the K-RAS gene inactivate GTPase activity and the p21-RAS protein continuously transmits growth signals to the nucleus. Mutations or overexpression of EGFR leads to a proliferative advantage. EML4-ALK fusion leads to constitutive ALK activation, which causes cell proliferation, invasion, and inhibition of apoptosis. Inactivating mutation of p53 can lead to more rapid proliferation and reduced apoptosis. The protein encoded by the p16INK4a inhibits formation of CDK-cyclin-D complexes by competitive binding of CDK4 and CDK6. Loss of p16INK4a expression is a common feature of NSCLC. RAR-beta is a nuclear receptor that bears vitamin-A-dependent transcriptional activity. RASSF1A is able to form heterodimers with Nore-1, an RAS effector. Therefore loss of RASSF1A might shift the balance of RAS activity towards a growth-promoting effect.',\n", + " 'H00018': \"DESCRIPTION Gastric cancer (GC) is one of the world's most common cancers. According to Lauren's histological classification gastric cancer is divided into two distinct histological groups - the intestinal and diffuse types. Several genetic changes have been identified in intestinal-type GC. 
The intestinal metaplasia is characterized by mutations in p53 gene, reduced expression of retinoic acid receptor beta (RAR-beta) and hTERT expression. Gastric adenomas furthermore display mutations in the APC gene, reduced p27 expression and cyclin E amplification. In addition, amplification and overexpression of c-ErbB2, reduced TGF-beta receptor type I (TGFBRI) expression and complete loss of p27 expression are commonly observed in more advanced GC. The main molecular changes observed in diffuse-type GCs include loss of E-cadherin function by mutations in CDH1and amplification of MET and FGFR2F.\",\n", + " 'H00019': \"DESCRIPTION Infiltrating ductal adenocarcinoma is the most common malignancy of the pancreas. When most investigators use the term 'pancreatic cancer' they are referring to pancreatic ductal adenocarcinoma (PDA). Normal duct epithelium progresses to infiltrating cancer through a series of histologically defined precursors. The overexpression of HER-2/neu and activating point mutations in the K-ras gene occur early, inactivation of the p16 gene at an intermediate stage, and the inactivation of p53, SMAD4, and BRCA2 occur relatively late. Activated K-ras engages multiple effector pathways. Although EGF receptors are conventionally regarded as upstream activators of RAS proteins, they can also act as RAS signal transducers via RAS-induced autocrine activation of the EGFR family ligands. Moreover, PDA shows extensive genomic instability and aneuploidy. Telomere attrition and mutations in p53 and BRCA2 are likely to contribute to these phenotypes. Inactivation of the SMAD4 tumour suppressor gene leads to loss of the inhibitory influence of the transforming growth factor-beta signalling pathway.\",\n", + " 'H00020': 'DESCRIPTION Colorectal cancer (CRC) is the second largest cause of cancer-related deaths in Western countries. 
CRC arises from the colorectal epithelium as a result of the accumulation of genetic alterations in defined oncogenes and tumour suppressor genes (TSG). Two major mechanisms of genomic instability have been identified in sporadic CRC progression. The first, known as chromosomal instability (CIN), results from a series of genetic changes that involve the activation of oncogenes such as K-ras and inactivation of TSG such as p53, DCC/Smad4, and APC. The second, known as microsatellite instability (MSI), results from inactivation of the DNA mismatch repair genes MLH1 and/or MSH2 by hypermethylation of their promoter, and secondary mutation of genes with coding microsatellites, such as transforming growth factor receptor II (TGF-RII) and BAX. Hereditary syndromes have germline mutations in specific genes (mutation in the tumour suppressor gene APC on chromosome 5q in FAP, mutated DNA mismatch repair genes in HNPCC).',\n", + " 'H00021': 'DESCRIPTION Renal cell cancer (RCC) accounts for ~3% of human malignancies and its incidence appears to be rising. Although most cases of RCC seem to occur sporadically, an inherited predisposition to renal cancer accounts for 1-4% of cases. RCC is not a single disease, it has several morphological subtypes. Conventional RCC (clear cell RCC) accounts for ~80% of cases, followed by papillary RCC (10-15%), chromophobe RCC (5%), and collecting duct RCC (<1%). Genes potentially involved in sporadic neoplasms of each particular type are VHL, MET, BHD, and FH respectively. In the absence of VHL, hypoxia-inducible factor alpha (HIF-alpha) accumulates, leading to production of several growth factors, including vascular endothelial growth factor and platelet-derived growth factor. Activated MET mediates a number of biological effects including motility, invasion of extracellular matrix, cellular transformation, prevention of apoptosis and metastasis formation. 
Loss of functional FH leads to accumulation of fumarate in the cell, triggering inhibition of HPH and preventing targeted pVHL-mediated degradation of HIF-alpha. BHD mutations cause the Birt-Hogg-Dube syndrome and its associated chromophobe, hybrid oncocytic, and conventional (clear cell) RCC.',\n", + " 'H00022': 'DESCRIPTION The urothelium covers the luminal surface of almost the entire urinary tract, extending from the renal pelvis, through the ureter and bladder, to the proximal urethra. The majority of urothelial carcinoma are bladder carcinomas, and urothelial carcinomas of the renal pelvis and ureter account for only approximately 7% of the total. Urothelial tumours arise and evolve through divergent phenotypic pathways. Some tumours progress from urothelial hyperplasia to low-grade non-invasive superficial papillary tumours. More aggressive variants arise either from flat, high-grade carcinoma in situ (CIS) and progress to invasive tumours, or they arise de novo as invasive tumours. Low-grade papillary tumors frequently show a constitutive activation of the receptor tyrosine kinase-Ras pathway, exhibiting activating mutations in the HRAS and fibroblast growth factor receptor 3 (FGFR3) genes. In contrast, CIS and invasive tumors frequently show alterations in the TP53 and RB genes and pathways. Invasion and metastases are promoted by several factors that alter the tumour microenvironment, including the aberrant expression of E-cadherins (E-cad), matrix metalloproteinases (MMPs), angiogenic factors such as vascular endothelial growth factor (VEGF).',\n", + " 'H00024': 'DESCRIPTION Prostate cancer constitutes a major health problem in Western countries. It is the most frequently diagnosed cancer among men and the second leading cause of male cancer deaths. 
The identification of key molecular alterations in prostate-cancer cells implicates carcinogen defenses (GSTP1), growth-factor-signaling pathways (NKX3.1, PTEN, and p27), and androgens (AR) as critical determinants of the phenotype of prostate-cancer cells. Glutathione S-transferases (GSTP1) are detoxifying enzymes. Cells of prostatic intraepithelial neoplasia, devoid of GSTP1, undergo genomic damage mediated by carcinogens. NKX3.1, PTEN, and p27 regulate the growth and survival of prostate cells in the normal prostate. Inadequate levels of PTEN and NKX3.1 lead to a reduction in p27 levels and to increased proliferation and decreased apoptosis. Androgen receptor (AR) is a transcription factor that is normally activated by its androgen ligand. During androgen withdrawal therapy, the AR signal transduction pathway also could be activated by amplification of the AR gene, by AR gene mutations, or by altered activity of AR coactivators. Through these mechanisms, tumor cells lead to the emergence of androgen-independent prostate cancer.',\n", + " 'H00026': 'DESCRIPTION Endometrial cancer (EC) is the most common gynaecological malignancy and the fourth most common malignancy in women in the developed world after breast, colorectal and lung cancer. Two types of endometrial carcinoma are distinguished with respect to biology and clinical course. Type-I carcinoma is related to hyperestrogenism by association with endometrial hyperplasia, frequent expression of estrogen and progesterone receptors and younger age, whereas type-II carcinoma is unrelated to estrogen, associated with atrophic endometrium, frequent lack of estrogen and progesterone receptors and older age. 
The morphologic differences in these cancers are mirrored in their molecular genetic profile with type I showing defects in DNA-mismatch repair and mutations in PTEN, K-ras, and beta-catenin, and type II showing aneuploidy, p53 mutations, and her2/neu amplification.',\n", + " 'H00031': 'DESCRIPTION Breast cancer is the leading cause of cancer death among women worldwide. The vast majority of breast cancers are carcinomas that originate from cells lining the milk-forming ducts of the mammary gland. The molecular subtypes of breast cancer, which are based on the presence or absence of hormone receptors (estrogen and progesterone subtypes) and human epidermal growth factor receptor-2 (HER2), include: hormone receptor positive and HER2 negative (luminal A subtype), hormone receptor positive and HER2 positive (luminal B subtype), hormone receptor negative and HER2 positive (HER2 positive), and hormone receptor negative and HER2 negative (basal-like or triple-negative breast cancers (TNBCs)). Hormone receptor positive breast cancers are largely driven by the estrogen/ER pathway. In HER2 positive breast tumours, HER2 activates the PI3K/AKT and the RAS/RAF/MAPK pathways, and stimulate cell growth, survival and differentiation. In patients suffering from TNBC, the deregulation of various signalling pathways (Notch, Wnt/beta-catenin, and EGFR) have been confirmed.',\n", + " 'H00032': 'DESCRIPTION Thyroid cancer is the most common endocrine malignancy and accounts for the majority of endocrine cancer- related deaths each year. More than 95% of thyroid carcinomas are derived from follicular cells. Their behavior varies from the indolent growing, well-differentiated papillary and follicular carcinomas (PTC and FTC, respectively) to the extremely aggressive undifferentiated carcinoma (UC). Somatic rearrangements of RET and TRK are almost exclusively found in PTC and may be found in early stages. 
The most distinctive molecular features of FTC are the prominence of aneuploidy and the high prevalence of RAS mutations and PAX8-PPAR{gamma} rearrangements. p53 seems to play a crucial role in the dedifferentiation process of thyroid carcinoma.',\n", + " 'H00033': 'DESCRIPTION Adrenocortical carcinoma (ACC) is a rare endocrine malignancy defined by a heterogeneous clinical presentation, dismal prognosis, and lack of effective therapeutic regimens. The incidence of ACC ranges from 0.5 to 2 cases per million people per year, accounting for 0.02% of all reported cancers. Unfortunately, most patients present with metastatic disease which reduces the 5 year survival rate to less than 10%. Oncogenes and tumor-suppressor genes involved in adrenal carcinomas include mutations in the p53 tumor-suppressor gene and rearrangements of the chromosomal locus 11p15.5 associated with IGF II hyperexpression. Deletions of the ACTH receptor gene have recently been found in undifferentiated adenomas and in aggressive ACCs.',\n", + " 'H00034': 'DESCRIPTION Carcinoid tumors are relatively uncommon neoplasms that nonetheless comprise up to 85% of neuroendocrine gastrointestinal neoplasms. They most frequently occur in the midgut and develop from neuroendocrine cells that are normally and diffusely present in this location. Most carcinoids are sporadic but epidemiological studies report a familial risk. Moreover, carcinoids can occur within the multiple endocrine neoplasia (MEN) syndrome, a rare familiar tumor syndrome in which mutations in the MEN1 gene are manifested. Recently, it has been shown that a majority (78%) of sporadic carcinoids display loss of heterozygosity for markers around the MEN 1 region, thus suggesting involvement of this gene in the pathogenesis of both familial and sporadic carcinoids.',\n", + " 'H00038': 'DESCRIPTION Melanoma is a form of skin cancer that has a poor prognosis and which is on the rise in Western populations. 
Melanoma arises from the malignant transformation of pigment-producing cells, melanocytes. The only known environmental risk factor is exposure to ultraviolet (UV) light and in people with fair skin the risk is greatly increased. Melanoma pathogenesis is also driven by genetic factors. Oncogenic NRAS mutations activate both effector pathways Raf-MEK-ERK and PI3K-Akt. The Raf-MEK-ERK pathway may also be activated via mutations in the BRAF gene. The PI3K-Akt pathway may be activated through loss or mutation of the inhibitory tumor suppressor gene PTEN. These mutations arise early during melanoma pathogenesis and are preserved throughout tumor progression. Melanoma development has been shown to be strongly associated with inactivation of the p16INK4a/cyclin dependent kinases 4 and 6/retinoblastoma protein (p16INK4a/CDK4,6/pRb) and p14ARF/human double minute 2/p53 (p14ARF/HMD2/p53) tumor suppressor pathways. MITF and TP53 are implicated in further melanoma progression.',\n", + " 'H00039': 'DESCRIPTION Cancer of the skin is the most common cancer in Caucasians and basal cell carcinomas (BCC) account for 90% of all skin cancers. The vast majority of BCC cases are sporadic, though there is a rare familial syndrome basal cell nevus syndrome (BCNS, or Gorlin syndrome) that predisposes to development of BCC. In addition, there is strong epidemiological and genetic evidence that demonstrates UV exposure as a risk factor of prime importance. The development of basal cell carcinoma is associated with constitutive activation of sonic hedgehog signaling. The mutations in SMOH, PTCH1, and SHH in BCCs result in continuous activation of target genes. At a cellular level, sonic hedgehog signaling promotes cell proliferation. Mutations in TP53 are also found with high frequency (>50%) in sporadic BCC.',\n", + " 'H00042': 'DESCRIPTION Gliomas are the most common of the primary brain tumors and account for more than 40% of all central nervous system neoplasms. 
Gliomas include tumours that are composed predominantly of astrocytes (astrocytomas), oligodendrocytes (oligodendrogliomas), mixtures of various glial cells (for example,oligoastrocytomas) and ependymal cells (ependymomas). The most malignant form of infiltrating astrocytoma - glioblastoma multiforme (GBM) - is one of the most aggressive human cancers. GBM may develop de novo (primary glioblastoma) or by progression from low-grade or anaplastic astrocytoma (secondary glioblastoma). Primary glioblastomas develop in older patients and typically show genetic alterations (EGFR amplification, p16/INK4a deletion, and PTEN mutations) at frequencies of 24-34%. Secondary glioblastomas develop in younger patients and frequently show overexpression of PDGF and CDK4 as well as p53 mutations (65%) and loss of Rb playing major roles in such transformations. Loss of PTEN has been implicated in both pathways, although it is much more common in the pathogenesis of primary GBM.',\n", + " 'H00048': 'DESCRIPTION Hepatocellular carcinoma (HCC) is a major type of primary liver cancer and one of the rare human neoplasms etiologically linked to viral factors. It has been shown that, after HBV/HCV infection and alcohol or aflatoxin B1 exposure, genetic and epigenetic changes occur. The recurrent mutated genes were found to be highly enriched in multiple key driver signaling processes, including telomere maintenance, TP53, cell cycle regulation, the Wnt/beta-catenin pathway (CTNNB1 and AXIN1), the phosphatidylinositol-3 kinase (PI3K)/AKT/mammalian target of rapamycin (mTOR) pathway. Recent studies using whole-exome sequencing have revealed recurrent mutations in new driver genes involved in the chromatin remodelling (ARID1A and ARID2) and the oxidative stress (NFE2L2) pathways.',\n", + " 'H00056': 'DESCRIPTION Alzheimer disease (AD) is a chronic disorder that slowly destroys neurons and causes serious cognitive disability. 
AD is associated with senile plaques and neurofibrillary tangles (NFTs). Amyloid-beta (Abeta), a major component of senile plaques, has various pathological effects on cell and organelle function. To date genetic studies have revealed four genes that may be linked to autosomal dominant or familial early onset AD (FAD). These four genes include: amyloid precursor protein (APP), presenilin 1 (PS1), presenilin 2 (PS2), and apolipoprotein E (ApoE). All mutations associated with APP and PS proteins can lead to an increase in the production of Abeta peptides, specifically the more amyloidogenic form, Abeta42. It was proposed that Abeta forms Ca2+ permeable pores and binds to and modulates multiple synaptic proteins, including NMDAR, mGluR5, and VGCC, leading to the overfilling of neurons with calcium ions. Consequently, cellular Ca2+ disruptions will lead to neuronal apoptosis, autophagy deficits, mitochondrial abnormality, defective neurotransmission, impaired synaptic plasticity, and neurodegeneration in AD. FAD-linked PS1 mutation downregulates the unfolded protein response and leads to vulnerability to ER stress.',\n", + " 'H00057': 'DESCRIPTION Parkinson disease (PD) is a progressive neurodegenerative movement disorder that results primarily from the death of dopaminergic (DA) neurons in the substantia nigra pars compacta (SNc). Both environmental factors and mutations in familial PD-linked genes such as SNCA, Parkin, DJ-1, PINK1 and LRRK2 are associated with PD pathogenesis. These pathogenic mutations and environmental factors are known to cause disease due to oxidative stress, intracellular Ca2+ homeostasis impairment, mitochondrial dysfunctions and altered protein handling compromising key roles of DA neuronal function and survival. 
The demise of DA neurons located in the SNc leads to a drop in the dopaminergic input to the striatum, which is hypothesized to impede movement by inducing hypo and hyper activity in striatal spiny projection neurons (SPNs) of the direct (dSPNs) and indirect (iSPNs) pathways in the basal ganglia, respectively.',\n", + " 'H00058': 'DESCRIPTION Amyotrophic lateral sclerosis (ALS) is a neurodegenerative disorder characterized by a progressive degeneration of motor neurons in the brain and spinal cord. In 90% of patients, ALS is sporadic, with no clear genetic linkage. On the other hand, the remaining 10% of cases show familial inheritance, with mutations in SOD1, TDP43(TARDBP), FUS, or C9orf72 genes being the most frequent causes. In spite of such difference, familial ALS and sporadic ALS have similarities in their pathological features. Proposed disease mechanisms contributing to motor neuron degeneration in ALS are: impaired proteostasis, aberrant RNA processing, mitochondrial disfunction and oxidative stress, microglia activation, and axonal dysfunction.',\n", + " 'H00059': 'DESCRIPTION Huntington disease (HD) is an autosomal-dominant neurodegenerative disorder that primarily affects medium spiny striatal neurons (MSN). The symptoms are choreiform, involuntary movements, personality changes and dementia. HD is caused by a CAG repeat expansion in the IT15 gene, which results in a long stretch of polyglutamine (polyQ) close to the amino-terminus of the HD protein huntingtin (Htt). Mutant Htt (mHtt) has effects both in the cytoplasm and in the nucleus. Full-length Htt is cleaved by proteases in the cytoplasm, leading to the formation of cytoplasmic and neuritic aggregates. mHtt also alters vesicular transport and recycling, causes cytosolic and mitochondrial Ca2+ overload, triggers endoplasmic reticulum stress through proteasomal dysfunction, and impairs autophagy function, increasing neuronal death susceptibility. 
N-terminal fragments containing the polyQ stretch translocate to the nucleus where they impair transcription and induce neuronal death.',\n", + " 'H00061': 'DESCRIPTION Prion diseases, also termed transmissible spongiform encephalopathies (TSEs), are a group of fatal neurodegenerative diseases that affect humans and a number of other animal species. The etiology of these diseases is thought to be associated with the conversion of a normal protein, PrPC, into an infectious, pathogenic form, PrPSc. The conversion is induced by prion infections (for example, variant Creutzfeldt-Jakob disease (vCJD), iatrogenic CJD, Kuru), mutations (familial CJD, Gerstmann-Straussler-Scheinker syndrome, fatal familial insomnia (FFI)) or unknown factors (sporadic CJD (sCJD)), and is thought to occur after PrPC has reached the plasma membrane or is re-internalized for degradation. The PrPSc form shows greater protease resistance than PrPC and accumulates in affected individuals, often in the form of extracellular plaques. Pathways that may lead to neuronal death comprise oxidative stress, regulated activation of complement, ubiquitin-proteasome and endosomal-lysosomal systems, synaptic alterations and dendritic atrophy, corticosteroid response, and endoplasmic reticulum stress. In addition, the conformational transition could lead to the lost of a beneficial activity of the natively folded protein, PrPC.',\n", + " 'H00063': 'DESCRIPTION The autosomal dominant spinocerebellar ataxias (SCAs) are a group of progressive neurodegenerative diseases characterised by loss of balance and motor coordination due to the primary dysfunction of the cerebellum. 
Compelling evidence points to major aetiological roles for transcriptional dysregulation, protein aggregation and clearance, autophagy, the ubiquitin-proteasome system, alterations of calcium homeostasis, mitochondria defects, toxic RNA gain-of-function mechanisms and eventual cell death with apoptotic features of neurons during SCA disease progression.',\n", + " 'H00126': 'DESCRIPTION Gaucher disease is an autosomal recessive lysosomal storage disorder caused by deficient beta-glucocerebrosidase (glucosylceramidase) activity or saposin C which is an activator of beta-glucocerebrosidase in sphingolipid metabolism. The enzymatic defects lead to the accumulation of glucosylceramide (GC) in lysosomes of affected cells. Despite the fact that Gaucher Disease consists of a phenotype, with varying degrees of severity, it has been sub-divided in three subtypes according to the presence or absence of neurological involvement. The sub-types are Type 1, 2 and 3.',\n", + " 'H00135': 'DESCRIPTION Krabbe disease is an autosomal recessive disorder caused by deficient activity of galactosylceramidase.',\n", + " 'H00194': 'DESCRIPTION Deficiency of hypoxanthine-guanine phosphoribosyltransferase activity is an inborn error of purine metabolism characterized by hyperuricemia with hyperuricosuria and a continuum spectrum of neurological manifestations.',\n", + " 'H00195': 'DESCRIPTION Adenine phosphoribosyltransferase deficiency (APRTD) is an autosomal recessive disorder of purine metabolism and causes urolithiasis due to accumulation of the insoluble purine 2,8-dihydroxyadenine.',\n", + " 'H00246': 'DESCRIPTION Familial hyperparathyroidism (HRPT) is characterized by parathyroid adenoma and hyperplasia with hypersecretion of parathyroid hormone and hypercalcaemia. It is caused by mutation in the HRPT2 (CDC73 or Parafibromin) gene that also causes the hyperparathyroidism-jaw tumor syndrome. 
Sporadic cases are also known to occur with somatic mutations within the MEN1 gene.',\n", + " 'H00247': \"DESCRIPTION Multiple endocrine neoplasias (MEN) are autosomal dominant syndrome which is characterized by the occurrence of tumors involving two or more endocrine glands. Four major forms of MEN are recognized, namely MEN1, MEN2A, MEN2B and MEN4. MEN1, which is also referred as Wermer's syndrome, is characterized by parathyroid adenoma, gastrinoma, and pituitary adenoma. Gastrinomas are the most common type, leading to the Zollinger-Ellison Syndrome (see H01522). MEN2 is characterized by medullary thyroid cancer (MTC) and includes three subtypes: MEN2A (Sipple's syndrome), MEN2B (MEN3) and familial MTC. Patients with MEN2A develop MTC in association with phaeochromocytoma and parathyroid tumors. Patients with MEN2B develop MTC in association with marfanoid habitus, mucosal neuromas, medullated corneal fibers and intestinal autonomic ganglion dysfunction, leading to megacolon. MEN4, also referred to as MENX, appears to have signs and symptoms similar to those of type 1. However MEN4 patients have mutations in other genes. 
The mutations in their responsible genes are found in Each MEN syndrome.\",\n", + " 'H00251': 'DESCRIPTION Thyroid dyshormonogenesis is a genetically heterogeneous group of inherited disorders in the enzymatic cascade of thyroid hormone synthesis that result in congenital hypothyroidism due to genetic defects in the synthesis of thyroid hormones.',\n", + " 'H00260': \"DESCRIPTION Primary pigmented micronodular adrenocortical disease (PPNAD) is a form of ACTH-independent adrenal hyperplasia resulting in endogenous Cushing's syndrome.\",\n", + " 'H00423': 'DESCRIPTION The sphingolipidoses are a group of monogenic inherited diseases caused by defects in the system of lysosomal sphingolipid degradation, with subsequent accumulation of non-degradable storage material in one or more organs.',\n", + " 'H00485': 'DESCRIPTION Robinow syndrome (RS) is a rare genetically heterogeneous condition characterized by hypertelorism, nasal features (large nasal bridge, short upturned nose, and anteverted nares), midface hypoplasia, mesomelic limb shortening, brachydactyly, clinodactyly, micropenis, and short stature. Both autosomal recessive and autosomal dominant inheritance have been described. The phenotypic presentation in both types of RS overlaps; however, subtle variances in the severity of craniofacial, musculoskeletal, cardiovascular, and urogenital characteristics may be present. In general, autosomal recessive RS (RRS) patients have more severe dysmorphology than autosomal dominant RS (DRS), especially in the musculoskeletal system.',\n", + " 'H00559': 'DESCRIPTION von Hippel-Lindau syndrome is an autosomal dominant disorder associated with tumors in the central nervous system and other organs. The most frequent tumors are cerebellar and retinal haemangioblastomas, pancreatic neuroendocrine tumors, renal cell carcinoma, phaeochromocytoma in the adrenal gland, epididymal cystadenoma, and endolymphatic sac tumors. 
Germline inactivation of VHL tumor suppressor protein leads to the upregulation of HIF and promotes to carcinogenesis.',\n", + " 'H01032': 'DESCRIPTION N-acetylglutamate synthase (NAGS) deficiency is a rare inborn error of metabolism affecting ammonia detoxification in the urea cycle. The N-acetylglutamate is the absolutely required allosteric activator of the first urea cycle enzyme carbamoylphosphate synthetase 1 (CPS1). In defects of NAGS, the urea cycle function can be severely affected resulting in fatal hyperammonemia in neonatal patients or at any later stage in life. Clinical features of NAGS deficiency include poor feeding, vomiting, altered level of consciousness, seizures, and coma.',\n", + " 'H01102': 'DESCRIPTION Pituitary adenomas are an important and frequently occurring form of intracranial tumor. They are usually benign but can give rise to severe clinical syndromes due to hormonal excess, or to visual/cranial disturbances due to mass effect. The tumor can be clinically nonfunctioning or hormone secreting. Among the latter, prolactin (PRL) and growth hormone (GH)-secreting adenomas are the most common. The majority of pituitary adenomas arise sporadically, although a subset occurs as component tumors of well-characterized familial cancer syndromes, such as multiple endocrine neoplasia (MEN) [DS:H00247], and Carney complex (CNC) [DS:H01820].',\n", + " 'H01398': 'DESCRIPTION Hyperammonemia is a metabolic condition characterized by elevated levels of ammonia in the blood, and may result in irreversible brain damage if not treated early and thoroughly. Hyperammonemia can be classified into primary or secondary hyperammonemia depending on the underlying pathophysiology. Detoxification of ammonia is mainly accomplished by the urea cycle in periportal hepatocytes. 
If the urea cycle is directly affected by a defect of any of the involved enzymes or transporters, this results in primary hyperammonemia.',\n", + " 'H01431': \"DESCRIPTION Cushing syndrome (CS) is a rare disorder resulting from prolonged exposure to excess glucocorticoids via exogenous and endogenous sources. The typical clinical features of CS are related to hypercortisolism and include accumulation of central fat, moon facies, neuromuscular weakness, osteoporosis or bone fractures, metabolic complications, and mood changes. Traditionally, endogenous CS is classified as adrenocorticotropic hormone (ACTH)-dependent (about 80%) or ACTH- independent (about 20%). Among ACTH-dependent forms, pituitary corticotroph adenoma (Cushing's disease) is most common. Most pituitary tumors are sporadic, resulting from monoclonal expansion of a single mutated cell. Recently recurrent activating somatic driver mutations in the ubiquitin-specific protease 8 gene (USP8) were identified in almost half of corticotroph adenoma. Germline mutations in MEN1 (encoding menin), AIP (encoding aryl-hydrocarbon receptor-interacting protein), PRKAR1A (encoding cAMP-dependent protein kinase type I alpha regulatory subunit) and CDKN1B (encoding cyclin-dependent kinase inhibitor 1B; also known as p27 Kip1) have been identified in familial forms of pituitary adenomas. However, the frequency of familial pituitary adenomas is less than 5% in patients with pituitary adenomas. Among ACTH-independent CS, adrenal adenoma is most common. Rare adrenal causes of CS include primary bilateral macronodular adrenal hyperplasia (BMAH) or primary pigmented nodular adrenocortical disease (PPNAD).\",\n", + " 'H01522': 'DESCRIPTION Zollinger-Ellison syndrome (ZES) is a rare endocrinopathy caused by tumors of the pancreas and duodenum. These tumors, called gastrinomas, release gastrin to produce large amounts of acid that result in severe gastroesophageal peptic ulcer disease and diarrhea. 
Most ZES cases are sporadic, but about over 20 percent are caused by an inherited genetic disorder called multiple endocrine neoplasia type 1 (MEN1) [DS:H00247]. The clinical presentation is not specific for this disease and there is overlap of symptoms similar to those of a peptic ulcer. The most common symptoms include abdominal pain and diarrhea, sometimes accompanied by heartburn, nausea, and weight loss. Peptic ulceration complicated by bleeding is present in 25% of patients, and is more frequently in patients with sporadic ZES than in those with MEN1. In addition, the gastrinomas may be cancerous. The cancer can be spread to other parts of the body, most commonly to regional lymph nodes and the liver. The treatment of the ZES includes surgical removal and medical management of gastric acid hypersecretion for the prevention of malignant transformation and the genesis of complications.',\n", + " 'H01603': 'DESCRIPTION Primary aldosteronism is a clinical syndrome characterized by excess secretion of aldosterone from the adrenal gland. It is manifested by hypertension and hyporeninemia. In the past, hypokalemia was thought to be a mandatory finding in primary aldosteronism. However, later studies confirmed that most patients with primary aldosteronism are normokalemic. The prevalence of primary aldosteronism among nonselected hypertensive persons is between 5% and 13%, and it is now recognized to be the most common form of secondary hypertension. There are the seven subtypes of primary aldosteronism. Aldosterone-producing adenoma (APA) and bilateral idiopathic hyperaldosteronism (IHA) are the most common subtypes of primary aldosteronism. Unilateral adrenal hyperplasia, aldosterone-producing adrenocortical carcinoma, ectopic aldosterone-producing adenoma, and familial hyperaldosteronism (type I and typeII) are unusual subtypes. Somatic mutations in KCNJ5, ATP1A1, ATP2B3, and CACNA1D have been described in APAs. 
Usually, adenomas are managed surgically and bilateral hyperplasia, medically.',\n", + " 'H02049': \"DESCRIPTION Bilateral macronodular adrenal hyperplasia (BMAH) is an adrenal disorder characterized by bilateral benign adrenocortical nodules associated with variable levels of cortisol excess. BMAH is an adrenal cause of Cushing's syndrome (CS). An increased activity of the cAMP/PKA pathway is found in the various forms of BMAH. Actors of the cAMP/PKA signaling pathway or genes causing a hereditary familial tumor syndrome including adenomatous polyposis coli gene (APC), menin (MEN1) and fumarate hydratase (FH) can favor or be responsible for the development of BMAH. Recently, a new gene, ARMC5, was identified as a frequent cause of sporadic or familial BMAH.\",\n", + " 'H02221': 'DESCRIPTION Methylmalonic aciduria and homocystinuria (MAHC) is caused by defects of intracellular cobalamin (vitamin B12) metabolism. Derivatives of cobalamin are essential cofactors for enzymes required in intermediary metabolism, and its defects lead to the accumulation of methylmalonic acid and/or homocysteine in blood and urine. 
Affected persons present with multisystem clinical abnormalities, including developmental, hematologic, neurologic, and metabolic findings.'}" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "disease_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "97d091ec-097c-4028-bcaa-5c3e01ff0d01", + "metadata": {}, + "outputs": [], + "source": [ + "# Columns to process\n", + "cols_to_edit = [\"Disease\"]\n", + "\n", + "def put_disease_data(cell):\n", + " if pd.isna(cell):\n", + " return cell # Leave NaN as is\n", + " gene_dict = {}\n", + " for key in cell.keys():\n", + " gene_dict[key] = disease_dict[key]\n", + " return gene_dict\n", + "\n", + "# Apply the transformation to each column\n", + "for col in cols_to_edit:\n", + " network_info[col] = network_info[col].apply(put_disease_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "05257651-5f54-4d05-aa23-b04c1a3f85f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryNameDefinitionExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'DESCRIPTION Chronic myeloid leukem...{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'DESCRIPTION Acute myeloid leukemia...{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'DESCRIPTION Acute myeloid leukemia...{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'DESCRIPTION Hepatocellular carcino...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'DESCRIPTION Lung cancer is a leadi...{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1<NA>{'nt06027': 'Purine salvage pathway'}{'H00194': 'DESCRIPTION Deficiency of hypoxant...{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)<NA>{'nt06505': 'WNT signaling'}{'H00485': 'DESCRIPTION Robinow syndrome (RS) ...{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'DESCRIPTION Methylmalonic aciduria...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'DESCRIPTION Renal cell cancer (RCC...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'DESCRIPTION Amyotrophic lateral sc...{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Entry Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 \n", + "169 \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'DESCRIPTION Chronic myeloid leukem... \n", + "1 {'H00003': 'DESCRIPTION Acute myeloid leukemia... \n", + "2 {'H00003': 'DESCRIPTION Acute myeloid leukemia... \n", + "3 {'H00048': 'DESCRIPTION Hepatocellular carcino... \n", + "4 {'H00014': 'DESCRIPTION Lung cancer is a leadi... \n", + ".. ... \n", + "168 {'H00194': 'DESCRIPTION Deficiency of hypoxant... \n", + "169 {'H00485': 'DESCRIPTION Robinow syndrome (RS) ... \n", + "170 {'H02221': 'DESCRIPTION Methylmalonic aciduria... \n", + "171 {'H00021': 'DESCRIPTION Renal cell cancer (RCC... \n", + "172 {'H00058': 'DESCRIPTION Amyotrophic lateral sc... \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... 
\n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... \n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "0cad7b6f-d863-49f9-b0a2-644da8beb947", + "metadata": {}, + "outputs": [], + "source": [ + "network_info.to_csv(\"network_variant_final_info.tsv\",sep='\\t', header=True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "3a556f82-3468-44eb-be31-5e9bedf59c70", + "metadata": {}, + "outputs": [], + "source": [ + "!sed -i '' 's/DESCRIPTION //g' network_variant_final_info.tsv" + ] + }, + { + "cell_type": "markdown", + "id": "a34eb400-5a7d-41c2-b2be-5bb9a3febf57", + "metadata": {}, + "source": [ + "# Final Merge of Variant Data with Network Data" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "83d484dd-69d7-4e50-9454-9369223f1dd2", + "metadata": {}, + "outputs": [], + "source": [ + "variant_data = pd.read_csv(\"variant_data_together_wo_nt.tsv\", sep='\\t')\n", + "network_info = pd.read_csv(\"network_variant_final_info.tsv\",sep='\\t')\n", + "network_info = network_info.rename(columns={\"Entry\":\"Network\", \"Definition\":\"Network Definition\",\"Expanded\":\"Network Expanded\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "63f214f1-e32a-4275-a037-554fd89409aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAllele
0N000731019v2ClinVar16929NC_000012.12NaN125775164657751646CT
1N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CA
2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CG
3N000731019v2ClinVar16928NC_000012.12NaN125775164757751647GA
4N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GC
....................................
1506N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CA
1507N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TC
1508N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AG
1509N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TG
1510N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GA
\n", + "

1511 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Network ENTRY Source ID TranscriptID NucChange Chr \\\n", + "0 N00073 1019v2 ClinVar 16929 NC_000012.12 NaN 12 \n", + "1 N00073 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 \n", + "2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 NaN 12 \n", + "3 N00073 1019v2 ClinVar 16928 NC_000012.12 NaN 12 \n", + "4 N00073 1019v2 dbSNP rs11547328 NC_000012.12 NaN 12 \n", + "... ... ... ... ... ... ... ... \n", + "1506 N00244 9817v1 COSM 6196635 ENST00000393623.6 c.706G>T 19 \n", + "1507 N00244 9817v1 COSM 6196637 ENST00000393623.6 c.548A>G 19 \n", + "1508 N00258 999v2 COSM 4766271 ENST00000621016.4 c.662A>G 16 \n", + "1509 N00258 999v2 COSM 4766211 ENST00000621016.4 c.755T>G 16 \n", + "1510 N00258 999v2 COSM 1379150 ENST00000621016.4 c.769G>A 16 \n", + "\n", + " Start End RefAllele AltAllele \n", + "0 57751646 57751646 C T \n", + "1 57751646 57751646 C A \n", + "2 57751646 57751646 C G \n", + "3 57751647 57751647 G A \n", + "4 57751647 57751647 G C \n", + "... ... ... ... ... \n", + "1506 10492196 10492196 C A \n", + "1507 10499486 10499486 T C \n", + "1508 68808823 68808823 A G \n", + "1509 68810264 68810264 T G \n", + "1510 68810278 68810278 G A \n", + "\n", + "[1511 rows x 11 columns]" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "a681f1fb-b921-4ec3-b9cb-43df32fe9ef8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworkNameNetwork DefinitionNetwork ExpandedPathwayClassDiseaseGene
0N00002BCR-ABL fusion kinase to RAS-ERK signaling pat...BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...{'hsa05220': 'Chronic myeloid leukemia'}{'nt06276': 'Chronic myeloid leukemia', 'nt062...{'H00004': 'Chronic myeloid leukemia (CML) is ...{'25': 'ABL1; ABL proto-oncogene 1, non-recept...
1N00003Mutation-activated KIT to RAS-ERK signaling pa...KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia (AML) is a ...{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2N00004Duplication or mutation-activated FLT3 to RAS-...FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...{'hsa05221': 'Acute myeloid leukemia'}{'nt06275': 'Acute myeloid leukemia', 'nt06210...{'H00003': 'Acute myeloid leukemia (AML) is a ...{'2322': 'FLT3; fms related tyrosine kinase 3'...
3N00005Mutation-activated MET to RAS-ERK signaling pa...MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'4233': 'MET; MET proto-oncogene, receptor ty...
4N00007EML4-ALK fusion kinase to RAS-ERK signaling pa...EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1(238v1,238v2) -> (3265,3845,4893) -> (369,673,...{'hsa05223': 'Non-small cell lung cancer'}{'nt06266': 'Non-small cell lung cancer', 'nt0...{'H00014': 'Lung cancer is a leading cause of ...{'238': 'ALK; ALK receptor tyrosine kinase', '...
...........................
168N01422HPRT1 deficiency in purine salvage pathway(Hypoxanthine,Guanine) // HPRT1*(C00262,C00242) // 3251v1NaN{'nt06027': 'Purine salvage pathway'}{'H00194': 'Deficiency of hypoxanthine-guanine...{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169N01444NXN mutation to WNT5A-ROR signaling pathwayNXN* -| DVL64359v1 -| (1855,1856,1857)NaN{'nt06505': 'WNT signaling'}{'H00485': 'Robinow syndrome (RS) is a rare ge...{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;...
170N01809Mutation-caused epigenetic silencing of MMACHCPRDX1* =| MMACHC5052v1 =| 25974{'hsa04980': 'Cobalamin transport and metaboli...{'nt06538': 'Cobalamin transport and metabolism'}{'H02221': 'Methylmalonic aciduria and homocys...{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M...
171N01873VHL mutation to HIF-2 signaling pathway(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...{'hsa05211': 'Renal cell carcinoma'}{'nt06542': 'HIF signaling'}{'H00021': 'Renal cell cancer (RCC) accounts f...{'7428': 'VHL; von Hippel-Lindau tumor suppres...
172N01877ERBB4 mutation to GF-RTK-PI3K signaling pathwayNRG // ERBB4*(3084,9542,10718,145957) // 2066v1{'hsa04012': 'ErbB signaling pathway'}{'nt06543': 'NRG-ERBB signaling'}{'H00058': 'Amyotrophic lateral sclerosis (ALS...{'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ...
\n", + "

173 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Network Name \\\n", + "0 N00002 BCR-ABL fusion kinase to RAS-ERK signaling pat... \n", + "1 N00003 Mutation-activated KIT to RAS-ERK signaling pa... \n", + "2 N00004 Duplication or mutation-activated FLT3 to RAS-... \n", + "3 N00005 Mutation-activated MET to RAS-ERK signaling pa... \n", + "4 N00007 EML4-ALK fusion kinase to RAS-ERK signaling pa... \n", + ".. ... ... \n", + "168 N01422 HPRT1 deficiency in purine salvage pathway \n", + "169 N01444 NXN mutation to WNT5A-ROR signaling pathway \n", + "170 N01809 Mutation-caused epigenetic silencing of MMACHC \n", + "171 N01873 VHL mutation to HIF-2 signaling pathway \n", + "172 N01877 ERBB4 mutation to GF-RTK-PI3K signaling pathway \n", + "\n", + " Network Definition \\\n", + "0 BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->... \n", + "1 KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "2 FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK \n", + "3 MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER... \n", + "4 EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1 \n", + ".. ... \n", + "168 (Hypoxanthine,Guanine) // HPRT1* \n", + "169 NXN* -| DVL \n", + "170 PRDX1* =| MMACHC \n", + "171 (VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>... \n", + "172 NRG // ERBB4* \n", + "\n", + " Network Expanded \\\n", + "0 (25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38... \n", + "1 3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "2 (2322v2,2322v1) -> 2885 -> (6654,6655) -> (326... \n", + "3 4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48... \n", + "4 (238v1,238v2) -> (3265,3845,4893) -> (369,673,... \n", + ".. ... \n", + "168 (C00262,C00242) // 3251v1 \n", + "169 64359v1 -| (1855,1856,1857) \n", + "170 5052v1 =| 25974 \n", + "171 (7428v3+9978+6921+6923+8453) // 2034 == 405 =>... 
\n", + "172 (3084,9542,10718,145957) // 2066v1 \n", + "\n", + " Pathway \\\n", + "0 {'hsa05220': 'Chronic myeloid leukemia'} \n", + "1 {'hsa05221': 'Acute myeloid leukemia'} \n", + "2 {'hsa05221': 'Acute myeloid leukemia'} \n", + "3 {'hsa05225': 'Hepatocellular carcinoma', 'hsa0... \n", + "4 {'hsa05223': 'Non-small cell lung cancer'} \n", + ".. ... \n", + "168 NaN \n", + "169 NaN \n", + "170 {'hsa04980': 'Cobalamin transport and metaboli... \n", + "171 {'hsa05211': 'Renal cell carcinoma'} \n", + "172 {'hsa04012': 'ErbB signaling pathway'} \n", + "\n", + " Class \\\n", + "0 {'nt06276': 'Chronic myeloid leukemia', 'nt062... \n", + "1 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "2 {'nt06275': 'Acute myeloid leukemia', 'nt06210... \n", + "3 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "4 {'nt06266': 'Non-small cell lung cancer', 'nt0... \n", + ".. ... \n", + "168 {'nt06027': 'Purine salvage pathway'} \n", + "169 {'nt06505': 'WNT signaling'} \n", + "170 {'nt06538': 'Cobalamin transport and metabolism'} \n", + "171 {'nt06542': 'HIF signaling'} \n", + "172 {'nt06543': 'NRG-ERBB signaling'} \n", + "\n", + " Disease \\\n", + "0 {'H00004': 'Chronic myeloid leukemia (CML) is ... \n", + "1 {'H00003': 'Acute myeloid leukemia (AML) is a ... \n", + "2 {'H00003': 'Acute myeloid leukemia (AML) is a ... \n", + "3 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "4 {'H00014': 'Lung cancer is a leading cause of ... \n", + ".. ... \n", + "168 {'H00194': 'Deficiency of hypoxanthine-guanine... \n", + "169 {'H00485': 'Robinow syndrome (RS) is a rare ge... \n", + "170 {'H02221': 'Methylmalonic aciduria and homocys... \n", + "171 {'H00021': 'Renal cell cancer (RCC) accounts f... \n", + "172 {'H00058': 'Amyotrophic lateral sclerosis (ALS... \n", + "\n", + " Gene \n", + "0 {'25': 'ABL1; ABL proto-oncogene 1, non-recept... \n", + "1 {'3815': 'KIT; KIT proto-oncogene receptor tyr... \n", + "2 {'2322': 'FLT3; fms related tyrosine kinase 3'... 
\n", + "3 {'4233': 'MET; MET proto-oncogene, receptor ty... \n", + "4 {'238': 'ALK; ALK receptor tyrosine kinase', '... \n", + ".. ... \n", + "168 {'3251': 'HPRT1; hypoxanthine phosphoribosyltr... \n", + "169 {'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;... \n", + "170 {'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M... \n", + "171 {'7428': 'VHL; von Hippel-Lindau tumor suppres... \n", + "172 {'3084': 'NRG1; neuregulin 1', '9542': 'NRG2; ... \n", + "\n", + "[173 rows x 8 columns]" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_info" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "ff9b9542-754c-414a-82c9-4eb8409b19b5", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = variant_data.merge(network_info, on='Network')" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "5f1e15c8-49f3-4be8-9f4e-e3f73e30f01c", + "metadata": {}, + "outputs": [], + "source": [ + "final_data.to_csv(\"final_network_with_variant.tsv\",sep='\\t',header=True, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "99191ac9-875f-4e5c-89d6-6382b29a9564", + "metadata": {}, + "source": [ + "# Extracting Human Chromosomes" + ] + }, + { + "cell_type": "markdown", + "id": "fb75a991-a908-48f1-8615-3099ab06ac66", + "metadata": {}, + "source": [ + "Downloaded the human genome from here https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.26/" + ] + }, + { + "cell_type": "markdown", + "id": "048f0501-7a19-4a8f-8a21-5e975f26135b", + "metadata": {}, + "source": [ + "Got all the chromosomes and their ids that we have variants for\n", + "\n", + "NC_000001.11\n", + "NC_000002.12\n", + "NC_000003.12\n", + "NC_000004.12\n", + "NC_000005.10\n", + "NC_000006.12\n", + "NC_000007.14\n", + "NC_000009.12\n", + "NC_000010.11\n", + "NC_000011.10\n", + "NC_000012.12\n", + "NC_000013.11\n", + "NC_000014.9\n", + "NC_000015.10\n", + "NC_000016.10\n", + "NC_000017.11\n", + 
"NC_000018.10\n", + "NC_000019.10\n", + "NC_000020.11\n", + "NC_000021.9\n", + "NC_000023.11\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3d870ed0-b14d-42a8-8d55-b58dc49367f1", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc0c7ced-c649-4695-bc11-9a7bfb87e128", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]\u001b[0m 21 patterns loaded from file\n" + ] + } + ], + "source": [ + "seqkit grep -r -n -f chromosomes.txt /ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna -o chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ef62d80c-f572-4f4e-9443-e3653a178327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "file format type num_seqs sum_len min_len avg_len max_len\n", + "chromosomes.fasta FASTA DNA 21 2,835,085,313 46,709,983 135,004,062.5 248,956,422\n" + ] + } + ], + "source": [ + "seqkit stats chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2452900e-93d3-4707-b2a3-0b94224de2cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.7G\tchromosomes.fasta\n" + ] + } + ], + "source": [ + "du -h chromosomes.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "460d091d-6f98-4c9e-95a3-137601780652", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NC_000001.11 Homo sapiens chromosome 1, GRCh38 Primary Assembly\n", + "NC_000002.12 Homo sapiens chromosome 2, GRCh38 Primary Assembly\n", + "NC_000003.12 Homo sapiens chromosome 3, GRCh38 Primary Assembly\n", + "NC_000004.12 Homo sapiens chromosome 4, GRCh38 Primary Assembly\n", + "NC_000005.10 Homo sapiens chromosome 5, GRCh38 Primary Assembly\n", + "NC_000006.12 Homo sapiens chromosome 6, 
GRCh38 Primary Assembly\n", + "NC_000007.14 Homo sapiens chromosome 7, GRCh38 Primary Assembly\n", + "NC_000009.12 Homo sapiens chromosome 9, GRCh38 Primary Assembly\n", + "NC_000010.11 Homo sapiens chromosome 10, GRCh38 Primary Assembly\n", + "NC_000011.10 Homo sapiens chromosome 11, GRCh38 Primary Assembly\n", + "NC_000012.12 Homo sapiens chromosome 12, GRCh38 Primary Assembly\n", + "NC_000013.11 Homo sapiens chromosome 13, GRCh38 Primary Assembly\n", + "NC_000014.9 Homo sapiens chromosome 14, GRCh38 Primary Assembly\n", + "NC_000015.10 Homo sapiens chromosome 15, GRCh38 Primary Assembly\n", + "NC_000016.10 Homo sapiens chromosome 16, GRCh38 Primary Assembly\n", + "NC_000017.11 Homo sapiens chromosome 17, GRCh38 Primary Assembly\n", + "NC_000018.10 Homo sapiens chromosome 18, GRCh38 Primary Assembly\n", + "NC_000019.10 Homo sapiens chromosome 19, GRCh38 Primary Assembly\n", + "NC_000020.11 Homo sapiens chromosome 20, GRCh38 Primary Assembly\n", + "NC_000021.9 Homo sapiens chromosome 21, GRCh38 Primary Assembly\n", + "NC_000023.11 Homo sapiens chromosome X, GRCh38 Primary Assembly\n" + ] + } + ], + "source": [ + "seqkit fx2tab chromosomes.fasta | cut -f1" + ] + }, + { + "cell_type": "markdown", + "id": "c923727a-9eae-407f-a85f-fb9317ccd3ce", + "metadata": {}, + "source": [ + "# Creating the Nt Variant Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06b00f03-f71e-4575-80b2-9960be48dba8", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "04b7f027-f8d2-451f-9d1e-b784708079cf", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1ca8532d-3e55-494f-8712-a1ea56c2b96d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkENTRYSourceIDTranscriptIDNucChangeChrStartEndRefAlleleAltAlleleNameNetwork DefinitionNetwork ExpandedPathwayClassDiseaseGene
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646CTMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CAMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646CGMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647GAMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647GCMutation-activated CDK4 to cell cycle G1/S(CCND+CDK4*) -> RB1 // E2F((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...
............................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196CAMutation-inactivated KEAP1 to KEAP1-NRF2 signa...KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486TCMutation-inactivated KEAP1 to KEAP1-NRF2 signa...KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823AGMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264TGMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278GAMutation-inactivated CDH1 to beta-catenin sign...CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...
\n", + "

1449 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network ENTRY Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End RefAllele AltAllele \\\n", + "0 NaN 12 57751646 57751646 C T \n", + "1 NaN 12 57751646 57751646 C A \n", + "2 NaN 12 57751646 57751646 C G \n", + "3 NaN 12 57751647 57751647 G A \n", + "4 NaN 12 57751647 57751647 G C \n", + "... ... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 C A \n", + "1445 c.548A>G 19 10499486 10499486 T C \n", + "1446 c.662A>G 16 68808823 68808823 A G \n", + "1447 c.755T>G 16 68810264 68810264 T G \n", + "1448 c.769G>A 16 68810278 68810278 G A \n", + "\n", + " Name \\\n", + "0 Mutation-activated CDK4 to cell cycle G1/S \n", + "1 Mutation-activated CDK4 to cell cycle G1/S \n", + "2 Mutation-activated CDK4 to cell cycle G1/S \n", + "3 Mutation-activated CDK4 to cell cycle G1/S \n", + "4 Mutation-activated CDK4 to cell cycle G1/S \n", + "... ... \n", + "1444 Mutation-inactivated KEAP1 to KEAP1-NRF2 signa... \n", + "1445 Mutation-inactivated KEAP1 to KEAP1-NRF2 signa... \n", + "1446 Mutation-inactivated CDH1 to beta-catenin sign... \n", + "1447 Mutation-inactivated CDH1 to beta-catenin sign... \n", + "1448 Mutation-inactivated CDH1 to beta-catenin sign... 
\n", + "\n", + " Network Definition \\\n", + "0 (CCND+CDK4*) -> RB1 // E2F \n", + "1 (CCND+CDK4*) -> RB1 // E2F \n", + "2 (CCND+CDK4*) -> RB1 // E2F \n", + "3 (CCND+CDK4*) -> RB1 // E2F \n", + "4 (CCND+CDK4*) -> RB1 // E2F \n", + "... ... \n", + "1444 KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1) \n", + "1445 KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1) \n", + "1446 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "1447 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "1448 CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1) \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... 
\n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene \n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... \n", + "... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... 
\n", + "\n", + "[1449 rows x 19 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data = pd.read_csv(\"final_network_with_variant.tsv\", sep='\\t')\n", + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ae73bfae-91a9-40a9-bfdb-c14b1d3e14ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6e515f9d-b9a6-4a24-bde6-2496a823b9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "488f8ed2-2a5b-4831-a5b7-90f3e049614f", + "metadata": {}, + "outputs": [], + "source": [ + "fasta_file = \"chromosomes.fasta\"\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6c04e6aa-d700-427c-a4ce-cba8225e3024", + "metadata": {}, + "outputs": [], + "source": [ + "chromosome_dictionary = {\n", + " \"1\": \"NC_000001.11\",\n", + " \"2\": \"NC_000002.12\",\n", + " \"3\": \"NC_000003.12\",\n", + " \"4\": \"NC_000004.12\",\n", + " \"5\": \"NC_000005.10\",\n", + " \"6\": \"NC_000006.12\",\n", + " \"7\": \"NC_000007.14\",\n", + " \"9\": \"NC_000009.12\",\n", + " \"10\": \"NC_000010.11\",\n", + " \"11\": \"NC_000011.10\",\n", + " \"12\": \"NC_000012.12\",\n", + " \"13\": \"NC_000013.11\",\n", + " \"14\": \"NC_000014.9\",\n", + " \"15\": \"NC_000015.10\",\n", + " \"16\": \"NC_000016.10\",\n", + " \"17\": \"NC_000017.11\",\n", + " \"18\": \"NC_000018.10\",\n", + " \"19\": \"NC_000019.10\",\n", + " \"20\": \"NC_000020.11\",\n", + " 
\"21\": \"NC_000021.9\",\n", + " \"23\": \"NC_000023.11\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7a3550d7-04a4-44f3-a7d5-b61d30890ef0", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b593ef66-65e3-411a-ac95-a33c9d37667a", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"verification.txt\", \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Adjust for 0-based indexing in Python\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "02044565-4f9c-45f9-b59a-63590b571dd1", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir nt_seq" + ] + }, + { + "cell_type": "markdown", + "id": "361619c9-7b49-45dd-901a-625cf1642535", + "metadata": {}, + "source": [ + "### Performing the mutation and saving the reference and variant allele with a 1000 nt window" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "96392c0b-c3fd-49ee-a2c1-97cef4127617", + "metadata": {}, + "outputs": [], + "source": [ + "for i 
in range(len(variant_data)):\n", + " with open(f\"nt_seq/{variant_data.iloc[i]['Var_ID']}.txt\", \"w\") as f:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " window = 1000\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region; region_start is clamped at 0, so the variant sits at offset (start - region_start) inside ref_seq\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = chrom_seq[region_start:region_end]\n", + " \n", + " if (variant_allele != \"deletion\"):\n", + " # Substitution/insertion: splice the alternate allele in place of the reference allele\n", + " mutated_seq = ref_seq[:start - region_start] + variant_allele + ref_seq[start - region_start + len(reference_allele):]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Deletion: drop the reference allele from the window; nothing is inserted\n", + " mutated_seq = ref_seq[:start - region_start] + ref_seq[start - region_start + del_len:]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e06b86fd-2d31-486e-82ed-80dbb7f3b627", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name":
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason/data/KEGG_Data_2.ipynb b/BioReason/data/KEGG_Data_2.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6a09e6a13f84e999adbc66fbe917bd98e28b553d --- /dev/null +++ b/BioReason/data/KEGG_Data_2.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0103d07d", + "metadata": {}, + "source": [ + "# KEGG Data Processing Pipeline - Part 2: Variant Information Parsing and Sequence Generation\n", + "\n", + "## Overview\n", + "\n", + "This notebook is the second part of the KEGG data processing pipeline. It focuses on parsing variant information from KEGG data, generating nucleotide sequences with mutations, and creating disease mapping databases.\n", + "\n", + "## What This Notebook Does\n", + "\n", + "1. **Variant Information Parsing**: Extracts detailed information from KEGG variant files\n", + "2. **Sequence Generation**: Creates reference and variant nucleotide sequences with genomic context\n", + "3. **Disease Mapping**: Downloads and processes KEGG disease information\n", + "4. **Data Integration**: Merges variant data with genomic sequences and disease annotations\n", + "5. 
**Quality Control**: Validates reference sequences against the genome\n", + "\n", + "## Prerequisites\n", + "\n", + "**Required from Part 1 (KEGG_Data_1.ipynb):**\n", + "- `gene_variants.txt` - List of variant identifiers\n", + "- `variant_info/` directory - Individual variant information files\n", + "- `final_network_with_variant.tsv` - Network and variant mapping data\n", + "\n", + "**Additional Requirements:**\n", + "- Reference genome FASTA file (GRCh38)\n", + "- BioPython for sequence processing\n", + "- KEGG_pull for disease information retrieval\n", + "\n", + "## Required Packages\n", + "\n", + "```bash\n", + "pip install biopython pandas kegg-pull\n", + "```\n", + "\n", + "## Input Files Expected\n", + "\n", + "- `gene_variants.txt` - Variant identifiers from Part 1\n", + "- `variant_info/*.txt` - Individual variant information files\n", + "- `chromosomes.fasta` - Reference genome sequences\n", + "- `final_network_with_variant.tsv` - Network-variant mapping\n", + "\n", + "## Output Files Generated\n", + "\n", + "- `nt_seq/` - Directory containing reference and variant sequences\n", + "- `verification.txt` - Quality control results\n", + "- `diseases.txt` - List of disease identifiers\n", + "- `disease_info/` - Disease information files\n", + "- Updated `final_network_with_variant.tsv` with disease names\n", + "\n", + "## Important Notes\n", + "\n", + "- **Memory Usage**: Processing large genomic sequences requires significant RAM\n", + "- **Storage**: Generated sequence files can be several GB in size\n", + "- **Processing Time**: Full pipeline may take several hours depending on dataset size\n", + "- **Dependencies**: Requires successful completion of KEGG_Data_1.ipynb\n", + "\n", + "## Next Steps\n", + "\n", + "After completing this notebook, run `KEGG_Data_3.ipynb` for final dataset creation and sequence integration." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ccc3ca96", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set up paths and parameters for variant processing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28d2629e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these paths for your environment\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Navigate to kegg_data directory\n", + "data_dir = Path('kegg_data')\n", + "if not data_dir.exists():\n", + " print(\"❌ kegg_data directory not found. Please run KEGG_Data_1.ipynb first.\")\n", + " raise FileNotFoundError(\"kegg_data directory missing\")\n", + "\n", + "os.chdir(data_dir)\n", + "\n", + "# Configuration parameters\n", + "CONFIG = {\n", + " # Input files (should exist from Part 1)\n", + " 'gene_variants_file': 'gene_variants.txt',\n", + " 'variant_info_dir': 'variant_info',\n", + " 'network_data_file': 'final_network_with_variant.tsv',\n", + " \n", + " # Reference genome (update path as needed)\n", + " 'reference_fasta': 'chromosomes.fasta', # Update to your reference genome path\n", + " \n", + " # Output directories\n", + " 'nt_seq_dir': 'nt_seq',\n", + " 'disease_info_dir': 'disease_info',\n", + " \n", + " # Processing parameters\n", + " 'sequence_window': 2000, # Nucleotides around variant\n", + " 'verification_file': 'verification.txt',\n", + " 'diseases_file': 'diseases.txt'\n", + "}\n", + "\n", + "# Verify required input files\n", + "required_files = ['gene_variants.txt', 'final_network_with_variant.tsv']\n", + "missing_files = []\n", + "for file in required_files:\n", + " if not os.path.exists(file):\n", + " missing_files.append(file)\n", + "\n", + "if missing_files:\n", + " print(f\"❌ Missing required files: {missing_files}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate these files.\")\n", + "else:\n", + " print(\"✅ All required input files found\")\n", + "\n", + "# Create output directories\n", 
+ "for dir_name in [CONFIG['nt_seq_dir'], CONFIG['disease_info_dir']]:\n", + " Path(dir_name).mkdir(exist_ok=True)\n", + "\n", + "print(f\"Working directory: {os.getcwd()}\")\n", + "print(\"\\n📝 Update CONFIG['reference_fasta'] with path to your reference genome file\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05a4d10-03de-42ae-89c1-5ddbe77043a7", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set in configuration section above\n", + "print(f\"Current working directory: {os.getcwd()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "96662dbb-ee2c-4a74-8e45-ab58a3496976", + "metadata": {}, + "outputs": [], + "source": [ + "sed -i '' 's/:/_/g' gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "db4f4cf2-cd95-4df8-99b6-cc112857502f", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q NAME variant_info/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < gene_variants.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "11959296-d5cb-4fb4-9914-83596dd41c86", + "metadata": {}, + "outputs": [], + "source": [ + "while read p; do\n", + " if ! grep -q GENE variant_info/$p.txt; then\n", + " echo \"$p\"\n", + " fi\n", + "done < gene_variants.txt" + ] + }, + { + "cell_type": "markdown", + "id": "784d0394-1a14-471a-9def-f4877b4bbd4e", + "metadata": {}, + "source": [ + "# Pulling Info from the Variant File\n", + "\n", + "# Variant Information Parsing\n", + "\n", + "This section processes individual variant files to extract structured information including variant names, genes, and types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62b4167a-6d5a-4120-99fe-5678227db6cc", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with variant information parsing\n", + "print(f\"Processing variant files from: {CONFIG['variant_info_dir']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed32b62-e3a6-4cff-b4ab-a80f04725a1c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "# Read all file names from gene_variants.txt\n", + "gene_variants_file = CONFIG['gene_variants_file']\n", + "if not os.path.exists(gene_variants_file):\n", + " print(f\"❌ Gene variants file not found: {gene_variants_file}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate this file\")\n", + " raise FileNotFoundError(f\"Gene variants file not found: {gene_variants_file}\")\n", + "\n", + "with open(gene_variants_file, 'r') as f:\n", + " variant_files = [line.strip() for line in f if line.strip()]\n", + "\n", + "print(f\"Processing {len(variant_files)} variant files\")\n", + "\n", + "# Initialize an empty DataFrame to collect the results\n", + "variant_info = pd.DataFrame(columns=[\"Entry\", \"Variant_Name\", \"Variant_Gene\", \"Variant_Gene Info\", \"Variant_Type\"])\n", + "\n", + "# Function to extract the value after a keyword (single line, rest of the line)\n", + "def extract_value(line, key):\n", + " return line.split(key, 1)[-1].strip()\n", + "\n", + "# Process each variant file\n", + "variant_info_dir = Path(CONFIG['variant_info_dir'])\n", + "processed_count = 0\n", + "not_found_count = 0\n", + "\n", + "for file_name in variant_files:\n", + " file_path = variant_info_dir / f\"{file_name}.txt\"\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " name = \"\"\n", + " gene = \"\"\n", + " gene_info = \"\"\n", + " type_info 
= \"\"\n", + "\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line.startswith(\"NAME\"):\n", + " name = extract_value(line, \"NAME\")\n", + " elif line.startswith(\"GENE\"):\n", + " gene_data = extract_value(line, \"GENE\")\n", + " if gene_data:\n", + " parts = gene_data.split(maxsplit=1)\n", + " gene = parts[0]\n", + " gene_info = parts[1] if len(parts) > 1 else \"\"\n", + " elif line.startswith(\"TYPE\"):\n", + " type_info = extract_value(line, \"TYPE\")\n", + "\n", + " row = {\n", + " \"Entry\": file_name,\n", + " \"Variant_Name\": name,\n", + " \"Variant_Gene\": gene,\n", + " \"Variant_Gene Info\": gene_info,\n", + " \"Variant_Type\": type_info\n", + " }\n", + "\n", + " variant_info = pd.concat([variant_info, pd.DataFrame([row])], ignore_index=True)\n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Processed {processed_count}/{len(variant_files)} files...\")\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")\n", + " not_found_count += 1\n", + "\n", + "print(f\"✅ Processing complete: {processed_count} files processed, {not_found_count} files not found\")\n", + "print(f\"Extracted information for {len(variant_info)} variants\")\n", + "\n", + "# Optional: Save the final table\n", + "# variant_info.to_csv(\"parsed_variant_info.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "85e94a07-740d-44cd-a1c3-2330a30b99b1", + "metadata": {}, + "outputs": [], + "source": [ + "variant_info[\"Entry\"] = variant_info[\"Entry\"].str.replace(\"hsa_var_\", \"\", regex=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4fc8fa00-2a28-4bd9-9aed-5c4602969cca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EntryVariant_NameVariant_GeneVariant_Gene InfoVariant_Type
01019v2CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]
11027v3CDKN1B mutationCDKN1Bcyclin dependent kinase inhibitor 1B [KO:K06624]
210280v1SIGMAR1 mutationSIGMAR1sigma non-opioid intracellular receptor 1 [KO:...
31029v2CDKN2A mutationCDKN2Acyclin dependent kinase inhibitor 2A [KO:K06621]
411315v1PARK7 mutationPARK7Parkinsonism associated deglycase [KO:K05687]
..................
909049v1AIP mutationAIPAHR interacting HSP90 co-chaperone [KO:K17767]
919101v1USP8 mutationUSP8ubiquitin specific peptidase 8 [KO:K11839]
929217v1VAPB mutationVAPBVAMP associated protein B and C [KO:K10707]
939817v1KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]
94999v2CDH1 mutationCDH1cadherin 1 [KO:K05689]
\n", + "

95 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Entry Variant_Name Variant_Gene \\\n", + "0 1019v2 CDK4 mutation CDK4 \n", + "1 1027v3 CDKN1B mutation CDKN1B \n", + "2 10280v1 SIGMAR1 mutation SIGMAR1 \n", + "3 1029v2 CDKN2A mutation CDKN2A \n", + "4 11315v1 PARK7 mutation PARK7 \n", + ".. ... ... ... \n", + "90 9049v1 AIP mutation AIP \n", + "91 9101v1 USP8 mutation USP8 \n", + "92 9217v1 VAPB mutation VAPB \n", + "93 9817v1 KEAP1 mutation KEAP1 \n", + "94 999v2 CDH1 mutation CDH1 \n", + "\n", + " Variant_Gene Info Variant_Type \n", + "0 cyclin dependent kinase 4 [KO:K02089] \n", + "1 cyclin dependent kinase inhibitor 1B [KO:K06624] \n", + "2 sigma non-opioid intracellular receptor 1 [KO:... \n", + "3 cyclin dependent kinase inhibitor 2A [KO:K06621] \n", + "4 Parkinsonism associated deglycase [KO:K05687] \n", + ".. ... ... \n", + "90 AHR interacting HSP90 co-chaperone [KO:K17767] \n", + "91 ubiquitin specific peptidase 8 [KO:K11839] \n", + "92 VAMP associated protein B and C [KO:K10707] \n", + "93 kelch like ECH associated protein 1 [KO:K10456] \n", + "94 cadherin 1 [KO:K05689] \n", + "\n", + "[95 rows x 5 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_info" + ] + }, + { + "cell_type": "markdown", + "id": "485ddbd6", + "metadata": {}, + "source": [ + "# Creating the Nt Variant Database\n", + "\n", + "# Nucleotide Sequence Database Creation\n", + "\n", + "This section creates nucleotide sequences with genomic context around each variant, generating both reference and mutated sequences for downstream analysis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8dba21b", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with nucleotide sequence processing\n", + "print(\"Starting nucleotide variant database creation...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8cf9f795", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bc18349", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "# Load network and variant data\n", + "network_file = CONFIG['network_data_file']\n", + "if not os.path.exists(network_file):\n", + " print(f\"❌ Network data file not found: {network_file}\")\n", + " print(\"Please run KEGG_Data_1.ipynb first to generate this file\")\n", + " raise FileNotFoundError(f\"Network data not found: {network_file}\")\n", + "\n", + "variant_data = pd.read_csv(network_file, sep='\\t')\n", + "print(f\"✅ Loaded variant data: {len(variant_data)} entries\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "65dde804", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c042831c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e4699c", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import os\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = 
{'reference_fasta': 'path_to_your_fasta_file'}\n", + "\n", + "# Load reference genome sequences\n", + "fasta_file = CONFIG['reference_fasta']\n", + "if not os.path.exists(fasta_file):\n", + " print(f\"❌ Reference genome file not found: {fasta_file}\")\n", + " print(\"Please update CONFIG['reference_fasta'] with the correct path to your reference genome\")\n", + " print(\"Download from: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\")\n", + " raise FileNotFoundError(f\"Reference genome not found: {fasta_file}\")\n", + "\n", + "print(f\"Loading reference genome from: {fasta_file}\")\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))\n", + "print(f\"✅ Loaded {len(record_dict)} chromosome sequences\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c2efa951", + "metadata": {}, + "outputs": [], + "source": [ + "chromosome_dictionary = {\n", + " \"1\": \"NC_000001.11\",\n", + " \"2\": \"NC_000002.12\",\n", + " \"3\": \"NC_000003.12\",\n", + " \"4\": \"NC_000004.12\",\n", + " \"5\": \"NC_000005.10\",\n", + " \"6\": \"NC_000006.12\",\n", + " \"7\": \"NC_000007.14\",\n", + " \"9\": \"NC_000009.12\",\n", + " \"10\": \"NC_000010.11\",\n", + " \"11\": \"NC_000011.10\",\n", + " \"12\": \"NC_000012.12\",\n", + " \"13\": \"NC_000013.11\",\n", + " \"14\": \"NC_000014.9\",\n", + " \"15\": \"NC_000015.10\",\n", + " \"16\": \"NC_000016.10\",\n", + " \"17\": \"NC_000017.11\",\n", + " \"18\": \"NC_000018.10\",\n", + " \"19\": \"NC_000019.10\",\n", + " \"20\": \"NC_000020.11\",\n", + " \"21\": \"NC_000021.9\",\n", + " \"23\": \"NC_000023.11\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "a1323f95", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0ec0979", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify reference sequences against 
genome\n", + "verification_file = CONFIG['verification_file']\n", + "print(f\"Starting sequence verification - results will be saved to: {verification_file}\")\n", + "\n", + "with open(verification_file, \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Slice out the span expected to hold the reference allele; the 0-based index adjustment (Start - 1 for ENST rows) was already applied to start above\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\\n\")\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Verified {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Verification complete. 
Results saved to: {verification_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39174efe", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# Assuming CONFIG is defined somewhere above in the code\n", + "# CONFIG = {'nt_seq_dir': 'desired/path/to/nt_seq'}\n", + "\n", + "# Create nucleotide sequence directory\n", + "nt_seq_dir = Path(CONFIG['nt_seq_dir'])\n", + "nt_seq_dir.mkdir(exist_ok=True)\n", + "print(f\"Created nucleotide sequence directory: {nt_seq_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3065cf9d", + "metadata": {}, + "source": [ + "### Applying each mutation and saving the reference and variant sequences with a flanking window on each side (`CONFIG['sequence_window']`, e.g. 1000 nt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6121945f", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate nucleotide sequences with mutations\n", + "nt_seq_dir = CONFIG['nt_seq_dir']\n", + "window = CONFIG['sequence_window']\n", + "\n", + "print(f\"Generating nucleotide sequences with {window}bp windows...\")\n", + "print(f\"Output directory: {nt_seq_dir}\")\n", + "\n", + "for i in range(len(variant_data)):\n", + " output_file = f\"{nt_seq_dir}/{variant_data.iloc[i]['Var_ID']}.txt\"\n", + " \n", + " with open(output_file, \"w\") as f:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = 
chrom_seq[region_start:region_end]\n", + " \n", + " if (variant_allele == \"deletion\"):\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + ref_seq[window + len(reference_allele):]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + del_len:]\n", + " \n", + " f.write(f\">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\\n\")\n", + " f.write(f\"{ref_seq}\\n\")\n", + " f.write(f\">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\\n\")\n", + " f.write(f\"{mutated_seq}\\n\")\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Generated sequences for {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Sequence generation complete. {len(variant_data)} sequence files created in {nt_seq_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a83e9272-b34f-40f3-aedf-3aca0795944f", + "metadata": {}, + "source": [ + "# Adding in more Variant Data\n", + "\n", + "# Data Integration\n", + "\n", + "This section merges variant information with the main dataset to create a comprehensive database with all relevant annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9222e45a-7f9a-4762-8dd8-2cccc654ad3e", + "metadata": {}, + "outputs": [], + "source": [ + "final_data = variant_data.merge(variant_info, on='Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae6d44d0-d1f2-4d41-b59d-f8c5888b4914", + "metadata": {}, + "outputs": [], + "source": [ + "final_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ab406cd-e9be-4885-811a-f3e2526efe8a", + "metadata": {}, + "outputs": [], + "source": [ + "# Save merged variant data\n", + "output_file = CONFIG['network_data_file']\n", + "final_data.to_csv(output_file, sep='\\t', header=True, index=False)\n", + "print(f\"✅ Final variant data with merged information saved to: {output_file}\")\n", + "print(f\"Dataset contains {len(final_data)} variants with complete information\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ecb5318-ab15-4625-b556-50f8ff39cff3", + "metadata": {}, + "source": [ + "# Pulling Disease info\n", + "\n", + "# Disease Information Processing\n", + "\n", + "This section extracts disease identifiers from the variant data and downloads corresponding disease information from KEGG to create human-readable disease names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b266aa61-7a7f-49c7-a737-578b51b95f32", + "metadata": {}, + "outputs": [], + "source": [ + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a7aa0417-b1c2-40c9-ad67-f2077d1f1d3e", + "metadata": {}, + "outputs": [], + "source": [ + "diseases = []" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "a0865917-9074-43f4-98a1-74bdb456b2e5", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(final_data)):\n", + " diseases.extend(list(ast.literal_eval(final_data['Disease'][i]).keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "8b469aee-d8fb-439d-a8bc-e8cb113ddc8f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "disease = set(diseases)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e461b5d7-2200-4dbb-b640-ffd6bf2e3ac2", + "metadata": {}, + "outputs": [], + "source": [ + "# Save disease identifiers to file\n", + "diseases_file = CONFIG['diseases_file']\n", + "with open(diseases_file, 'w') as f:\n", + " for disease_id in disease:\n", + " f.write(f\"{disease_id}\\n\")\n", + " \n", + "print(f\"✅ Saved {len(disease)} unique disease identifiers to: {diseases_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d079c88f-9e8b-4f80-bf6c-5d9a49155b86", + "metadata": {}, + "outputs": [], + "source": [ + "# Working directory already set - proceeding with disease information retrieval\n", + "print(\"Starting disease information processing...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10d814f3-66ec-4580-866e-2cc2fda34109", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "disease KEGG Disease Database\n", + "ds Release 114.0+/04-28, Apr 25\n", + " Kanehisa Laboratories\n", + " 2,912 entries\n", + "\n", + "linked db pathway\n", + " brite\n", + " ko\n", + " hsa\n", 
+ " genome\n", + " network\n", + " variant\n", + " drug\n", + " pubmed\n", + "\n" + ] + } + ], + "source": [ + "kegg_pull rest info disease" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f095524-d58f-4869-9d1b-5459de85329d", + "metadata": {}, + "outputs": [], + "source": [ + "kegg_pull --full-help" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ed80556-4df8-4f0b-8c3e-2a6458c6dd6d", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = {'disease_info_dir': 'desired/path/to/disease_info'}\n", + "\n", + "# Create disease information directory\n", + "disease_dir = Path(CONFIG['disease_info_dir'])\n", + "disease_dir.mkdir(exist_ok=True)\n", + "print(f\"Created disease information directory: {disease_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96851b67-0689-4aa0-9208-a0cdabf95425", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 44/44 [00:06<00:00, 6.56it/s]\n" + ] + } + ], + "source": [ + "# Download disease information using kegg_pull\n", + "diseases_file = CONFIG['diseases_file']\n", + "disease_output_dir = CONFIG['disease_info_dir']\n", + "\n", + "if not os.path.exists(diseases_file):\n", + " print(f\"❌ Diseases file not found: {diseases_file}\")\n", + " print(\"Please run the previous cells to generate the diseases list\")\n", + "else:\n", + " print(f\"Downloading disease information for entries in: {diseases_file}\")\n", + " print(f\"Output directory: {disease_output_dir}\")\n", + " # Run the command to download disease information\n", + " !cat {diseases_file} | kegg_pull pull entry-ids - --output={disease_output_dir}\n", + " print(\"✅ Disease information download complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"4c01f97c-6376-4266-97e9-1d29ef207a51", + "metadata": {}, + "outputs": [], + "source": [ + "# Processing disease information files\n", + "print(\"Parsing disease information from KEGG files...\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ea01eac-ee3c-4a5e-9863-2fb061291b45", + "metadata": {}, + "outputs": [], + "source": [ + "# Parse disease information from downloaded files\n", + "diseases_file = CONFIG['diseases_file']\n", + "disease_info_dir = Path(CONFIG['disease_info_dir'])\n", + "\n", + "# Read all disease identifiers from diseases.txt\n", + "with open(diseases_file, 'r') as f:\n", + " disease_files = [line.strip() for line in f if line.strip()]\n", + "\n", + "print(f\"Processing {len(disease_files)} disease information files...\")\n", + "\n", + "# Initialize an empty dictionary\n", + "disease_info = {}\n", + "\n", + "# Function to extract the value after a keyword\n", + "def extract_value(line, key):\n", + " return line.split(key, 1)[-1].strip()\n", + "\n", + "# Process each disease file\n", + "processed_count = 0\n", + "not_found_count = 0\n", + "\n", + "for disease_id in disease_files:\n", + " file_path = disease_info_dir / f'{disease_id}.txt'\n", + "\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + " name = \"\"\n", + "\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line.startswith(\"NAME\"):\n", + " name = extract_value(line, \"NAME\")\n", + " break # No need to check other lines once NAME is found\n", + "\n", + " # Save into dictionary: key = disease_id, value = name\n", + " disease_info[disease_id] = name\n", + " processed_count += 1\n", + " \n", + " if processed_count % 50 == 0:\n", + " print(f\"Processed {processed_count}/{len(disease_files)} disease files...\")\n", + "\n", + " except FileNotFoundError:\n", + " print(f\"[Warning] File not found: {file_path}\")\n", + " not_found_count += 1\n", + "\n", + "print(f\"✅ Disease processing complete: 
{processed_count} processed, {not_found_count} not found\")\n", + "print(f\"Extracted disease information for {len(disease_info)} diseases\")\n", + "\n", + "# Optional: Save the dictionary to a file (like JSON)\n", + "# import json\n", + "# with open('disease_info.json', 'w') as f:\n", + "# json.dump(disease_info, f, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4dfb4f25-776e-45c6-9eda-457b13cd77bf", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'H00135': 'Krabbe disease;',\n", + " 'H01398': 'Primary hyperammonemia (Urea cycle disorders)',\n", + " 'H00032': 'Thyroid cancer',\n", + " 'H00559': 'von Hippel-Lindau syndrome',\n", + " 'H00260': 'Pigmented micronodular adrenocortical disease',\n", + " 'H00038': 'Melanoma',\n", + " 'H00485': 'Robinow syndrome',\n", + " 'H00251': 'Thyroid dyshormonogenesis;',\n", + " 'H00194': 'Lesch-Nyhan syndrome;',\n", + " 'H00026': 'Endometrial cancer',\n", + " 'H00020': 'Colorectal cancer',\n", + " 'H00031': 'Breast cancer',\n", + " 'H02049': 'Bilateral macronodular adrenal hyperplasia',\n", + " 'H00042': 'Glioma',\n", + " 'H00063': 'Spinocerebellar ataxia (SCA)',\n", + " 'H00195': 'Adenine phosphoribosyltransferase deficiency;',\n", + " 'H00033': 'Adrenal carcinoma',\n", + " 'H00048': 'Hepatocellular carcinoma;',\n", + " 'H01522': 'Zollinger-Ellison syndrome',\n", + " 'H00019': 'Pancreatic cancer',\n", + " 'H00004': 'Chronic myeloid leukemia',\n", + " 'H00058': 'Amyotrophic lateral sclerosis (ALS);',\n", + " 'H00022': 'Bladder cancer',\n", + " 'H00056': 'Alzheimer disease;',\n", + " 'H01032': 'N-acetylglutamate synthase deficiency',\n", + " 'H00247': 'Multiple endocrine neoplasia syndrome;',\n", + " 'H00246': 'Primary hyperparathyroidism;',\n", + " 'H00039': 'Basal cell carcinoma',\n", + " 'H00021': 'Renal cell carcinoma',\n", + " 'H00013': 'Small cell lung cancer',\n", + " 'H00003': 'Acute myeloid leukemia',\n", + " 'H00018': 'Gastric cancer',\n", + " 
'H01603': 'Primary aldosteronism',\n", + " 'H00061': 'Prion disease',\n", + " 'H00014': 'Non-small cell lung cancer',\n", + " 'H00423': 'Sphingolipidosis',\n", + " 'H00024': 'Prostate cancer',\n", + " 'H01102': 'Pituitary adenomas',\n", + " 'H00034': 'Carcinoid',\n", + " 'H00059': 'Huntington disease',\n", + " 'H01431': 'Cushing syndrome',\n", + " 'H00057': 'Parkinson disease',\n", + " 'H00126': 'Gaucher disease',\n", + " 'H02221': 'Methylmalonic aciduria and homocystinuria'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "disease_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "458ca725-03e8-4b2a-98e7-f418f40190fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Reload variant data for disease processing\n", + "variant_data = pd.read_csv(CONFIG['network_data_file'], sep='\\t')\n", + "print(f\"Processing disease information for {len(variant_data)} variants\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e86ddd65-cbde-42d3-be6f-cbc54e2dda06", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# Assume disease_info is already a dictionary keyed by KEGG disease ID, e.g. {\"H00038\": \"Melanoma\", \"H00031\": \"Breast cancer\", ...}\n", + "\n", + "# Create a new column to store disease dictionaries\n", + "variant_data[\"Disease_Names\"] = \"\"\n", + "\n", + "# Process each row\n", + "for idx, row in variant_data.iterrows():\n", + " try:\n", + " # Convert the string dictionary into a real dictionary\n", + " disease_dict = ast.literal_eval(row[\"Disease\"])\n", + "\n", + " # Get the disease IDs (keys)\n", + " disease_ids = disease_dict.keys()\n", + "\n", + " # Build a new dictionary: {disease_id: disease_name}\n", + " disease_names_dict = {did: disease_info.get(did, \"\") for did in disease_ids}\n", + "\n", + " # Save it into the Disease_Names column\n", + " variant_data.at[idx, \"Disease_Names\"] = disease_names_dict\n", + "\n", + " except (ValueError, 
SyntaxError):\n", + " print(f\"[Warning] Couldn't parse disease info at row {idx}\")\n", + " variant_data.at[idx, \"Disease_Names\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06a29f96-56b2-46b2-897e-d7006dd0ae52", + "metadata": {}, + "outputs": [], + "source": [ + "# Save updated variant data with disease names\n", + "output_file = CONFIG['network_data_file']\n", + "variant_data.to_csv(output_file, sep='\\t', header=True, index=False)\n", + "print(f\"✅ Updated variant data saved to: {output_file}\")\n", + "print(f\"Dataset now includes disease names for {len(variant_data)} variants\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "674b4a4a-93ab-4fdd-af73-cf0351381fe6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason/data/KEGG_Data_3.ipynb b/BioReason/data/KEGG_Data_3.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91c3e078f20c827622ef9b3db0d24b4579d68c78 --- /dev/null +++ b/BioReason/data/KEGG_Data_3.ipynb @@ -0,0 +1,2739 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "744b9f11-6ef8-4409-a388-fe860480c9de", + "metadata": {}, + "source": [ + "# Processing the Reasoning Trace Data and Adding in Nucleotides" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8950d38a-dfa9-4dbd-b388-941dec69b3ee", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a1c3d972-c52e-4d73-9816-e970fca3e1bb", + "metadata": {}, + "outputs": 
[], + "source": [ + "import json\n", + "from Bio import SeqIO" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c80d7741-7aaa-4c28-a93a-ad955f3da6bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: processed_variants 1450 with seqs: File exists\n" + ] + } + ], + "source": [ + "!mkdir 'processed_variants 1450 with seqs'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4021560-9130-4fdf-a640-15b5da6935a0", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(1,1450):\n", + " # Load the processed variant JSON for this KEGG entry\n", + " with open(f'processed_variants first 700/KEGG_{i}_processed.json', 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Parse the nucleotide FASTA file: record 0 is the reference sequence, record 1 is the variant sequence\n", + " fasta_file = f\"nt_seq/KEGG_{i}.txt\"\n", + " sequence_list = list(SeqIO.parse(fasta_file, \"fasta\"))\n", + " ref_seq = sequence_list[0].seq\n", + " var_seq = sequence_list[1].seq\n", + "\n", + " # Add sequences to the JSON data\n", + " data[\"reference_sequence\"] = str(ref_seq)\n", + " data[\"variant_sequence\"] = str(var_seq)\n", + "\n", + " # Save the updated JSON to a new file\n", + " with open(f'processed_variants 1450 with seqs/KEGG_{i}_with_seqs.json', 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)" + ] + }, + { + "cell_type": "markdown", + "id": "4db8af16-a11f-4987-b1a6-db552c6714fb", + "metadata": {}, + "source": [ + "# Creating the Final KEGG SFT and RL Dataset\n", + "\n", + "# Final KEGG Dataset Creation\n", + "\n", + "This section creates the final machine learning dataset by combining variant data with sequences and generating structured question-answer pairs for biological reasoning tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9517d40-74e3-4ddb-bd16-95f9ab7927aa", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "53c5948f-4bde-432d-b35c-34c733eb9ad1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60c66a0d-359b-4d2a-8427-53f4d18d1047", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", + "

1449 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network Entry Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End ... \\\n", + "0 NaN 12 57751646 57751646 ... \n", + "1 NaN 12 57751646 57751646 ... \n", + "2 NaN 12 57751646 57751646 ... \n", + "3 NaN 12 57751647 57751647 ... \n", + "4 NaN 12 57751647 57751647 ... \n", + "... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 ... \n", + "1445 c.548A>G 19 10499486 10499486 ... \n", + "1446 c.662A>G 16 68808823 68808823 ... \n", + "1447 c.755T>G 16 68810264 68810264 ... \n", + "1448 c.769G>A 16 68810278 68810278 ... \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... \n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene Variant_Name \\\n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "... ... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "\n", + " Variant_Gene Variant_Gene Info \\\n", + "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "... ... ... \n", + "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1446 CDH1 cadherin 1 [KO:K05689] \n", + "1447 CDH1 cadherin 1 [KO:K05689] \n", + "1448 CDH1 cadherin 1 [KO:K05689] \n", + "\n", + " Variant_Type Disease_Names \n", + "0 NaN {'H00038': 'Melanoma'} \n", + "1 NaN {'H00038': 'Melanoma'} \n", + "2 NaN {'H00038': 'Melanoma'} \n", + "3 NaN {'H00038': 'Melanoma'} \n", + "4 NaN {'H00038': 'Melanoma'} \n", + "... ... ... 
\n", + "1444 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1445 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1446 NaN {'H00018': 'Gastric cancer'} \n", + "1447 NaN {'H00018': 'Gastric cancer'} \n", + "1448 NaN {'H00018': 'Gastric cancer'} \n", + "\n", + "[1449 rows x 24 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data = pd.read_csv(\"final_network_with_variant.tsv\", sep='\\t')\n", + "variant_data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "51609538-9f96-4097-ac60-2a4a08a6e01c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'KEGG_2'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1]['Var_ID']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "846b6ee3-1e4d-44bc-ad59-4074b4ff39bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: final_data: File exists\n" + ] + } + ], + "source": [ + "!mkdir final_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56449f64-85ae-4804-8a01-3ce2afe1e6da", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import ast\n", + "from CONFIG import CONFIG\n", + "\n", + "# Create final dataset with question-answer pairs\n", + "variants_with_seqs_dir = CONFIG['variants_with_seqs_dir']\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(f\"Creating final dataset with Q&A pairs...\")\n", + "print(f\"Input: {variants_with_seqs_dir}\")\n", + "print(f\"Output: {final_data_dir}\")\n", + "print(f\"Processing range: {start_idx} to {end_idx}\")\n", + "\n", + "processed_count = 0\n", + "error_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " # Load the JSON file with sequences\n", + " 
input_file = f'{variants_with_seqs_dir}/KEGG_{i}_with_seqs.json'\n", + " if not os.path.exists(input_file):\n", + " error_count += 1\n", + " continue\n", + " \n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Build the question with fallback for inconsistent key casing\n", + " try:\n", + " chromosome = data['raw_data']['chromosome']\n", + " network = data['raw_data']['network']\n", + " except KeyError:\n", + " try:\n", + " chromosome = data['raw_data']['Chromosome']\n", + " network = data['raw_data']['Network']\n", + " except KeyError:\n", + " print(f\"[Warning] Missing chromosome/network data in {input_file}\")\n", + " error_count += 1\n", + " continue\n", + "\n", + " # Extract gene information\n", + " try:\n", + " gene_list = list(ast.literal_eval(variant_data.iloc[i-1]['Gene']).values())\n", + " gene_list_joined = ' | '.join(gene_list)\n", + " variant_gene = variant_data.iloc[i-1]['Variant_Gene']\n", + " except (KeyError, IndexError, ValueError) as e:\n", + " print(f\"[Warning] Gene information error for {input_file}: {e}\")\n", + " error_count += 1\n", + " continue\n", + "\n", + " question = (\n", + " f\"Chromosome Number: {chromosome}\\n\"\n", + " f\"Network Definition of the pathway: {network}\\n\"\n", + " f\"Genes in the pathway: {gene_list_joined}\\n\\n\"\n", + " f\"Given this context, what is the biological effect of this \"\n", + " f\"{variant_gene} allele, specifically what disease does this contribute to?\"\n", + " )\n", + "\n", + " # Add Q&A to reasoning steps\n", + " if 'reasoning' in data and 'reasoning_steps' in data['reasoning']:\n", + " data['reasoning']['reasoning_steps'].append(data.get('answer', ''))\n", + "\n", + " # Extract answer\n", + " try:\n", + " answer = data['reasoning']['labels']['disease'][0]\n", + " except (KeyError, IndexError):\n", + " print(f\"[Warning] Missing disease label in {input_file}\")\n", + " error_count += 1\n", + " continue\n", + "\n", + " data['question'] = question\n", + " 
data['answer'] = answer \n", + "\n", + " # Clean up unnecessary fields\n", + " if 'reasoning' in data:\n", + " for key in ['variant_id', 'hgvs', 'labels']:\n", + " data['reasoning'].pop(key, None)\n", + " data.pop('raw_data', None)\n", + "\n", + " # Save to final data directory\n", + " output_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " with open(output_file, 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)\n", + " \n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Created {processed_count} Q&A pairs...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to process variant {i}: {str(e)}\")\n", + " error_count += 1\n", + "\n", + "print(f\"✅ Final dataset creation complete:\")\n", + "print(f\" Successfully processed: {processed_count}\")\n", + "print(f\" Errors encountered: {error_count}\")\n", + "print(f\" Output directory: {final_data_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "11b3769e-33e5-4ab8-bc9d-f736913a2034", + "metadata": {}, + "source": [ + "# Fixing Disease Labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cfa4eca-c11e-4e52-ad6b-2fa7b43be2a4", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e36bc3f-07af-4b3d-bc84-d449ced55e24", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd316862-e6c7-4dd9-a06c-33f3454355b0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "# CONFIG parameters\n", + "CONFIG = {\n", + " 'final_data_dir': 'final_data',\n", + " 'variant_range': (1, 1450)\n", + "}\n", + "\n", + "# Extract disease labels from final dataset for standardization\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Extracting disease 
labels for standardization...\")\n", + "\n", + "disease = []\n", + "processed_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if os.path.exists(input_file):\n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + " \n", + " if 'answer' in data:\n", + " disease.append(data['answer'])\n", + " processed_count += 1\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not process {input_file}: {str(e)}\")\n", + "\n", + "print(f\"✅ Extracted {len(disease)} disease labels from {processed_count} files\")\n", + "print(f\"Unique diseases: {len(set(disease))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cca4846c-aec9-49f3-b919-760cb9fa4bc7", + "metadata": {}, + "outputs": [], + "source": [ + "new_disease = {'Acute Myeloid Leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", + " 'Acute myeloid leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", + " 'Adenine Phosphoribosyltransferase Deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", + " 'Adenine phosphoribosyltransferase deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", + " \"Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Alzheimer's disease (AD)\" : \"Alzheimer's disease\",\n", + " 'Amyotrophic Lateral Sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", + " 'Amyotrophic lateral sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", + " 'Basal Cell Carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", + " 'Basal cell carcinoma' : \"Basal Cell Carcinoma\",\n", + " 'Basal cell carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", + " 'Chronic Myeloid Leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", + " 'Chronic myeloid leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", + " 'Clear cell Renal Cell Carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", + " 'Clear cell renal cell carcinoma' : 
\"Clear cell Renal Cell Carcinoma\",\n", + " 'Clear cell renal cell carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", + " 'Colorectal cancer' : \"Colorectal cancer\",\n", + " 'Colorectal cancer (CRC)' : \"Colorectal cancer\",\n", + " 'Cushing syndrome' : \"Cushing syndrome\",\n", + " \"Early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Early-onset familial Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Early-onset familial Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", + " 'Familial Creutzfeldt-Jakob Disease' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob Disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob disease' : \"Creutzfeldt-Jakob Disease\",\n", + " 'Familial Creutzfeldt-Jakob disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", + " \"Familial Early-Onset Alzheimer's Disease\" : \"Alzheimer's disease\",\n", + " 'Familial Isolated Pituitary Adenoma (FIPA)' : \"Pituitary Adenoma\",\n", + " \"Familial early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", + " \"Familial early-onset Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", + " 'Familial isolated pituitary adenoma (FIPA)' : \"Pituitary Adenoma\",\n", + " 'Gastric cancer' : \"Gastric cancer\",\n", + " 'Gaucher disease' : \"Gaucher disease\",\n", + " 'Glioblastoma multiforme' : \"Glioblastoma multiforme\",\n", + " 'Glioblastoma multiforme (GBM)' : \"Glioblastoma multiforme\",\n", + " 'Hepatocellular carcinoma' : \"Hepatocellular carcinoma\",\n", + " 'Hepatocellular carcinoma (HCC)' : \"Hepatocellular carcinoma\",\n", + " 'Huntington disease' : \"Huntington's disease\",\n", + " 'Huntington disease (HD)' : \"Huntington's disease\",\n", + " \"Huntington's disease\" : \"Huntington's disease\",\n", + " \"Huntington's disease (HD)\" : \"Huntington's disease\",\n", + " 'Lesch-Nyhan syndrome' : \"Lesch-Nyhan syndrome\",\n", + " 'Melanoma' : \"Melanoma\",\n", + " 'Melanoma (H00038)' : 
\"Melanoma\",\n", + " 'Methylmalonic aciduria and homocystinuria (MAHC)' : \"Methylmalonic aciduria and homocystinuria\",\n", + " 'Multiple Endocrine Neoplasia type 1 (MEN1)' : \"Multiple Endocrine Neoplasia type 1\",\n", + " 'N-acetylglutamate synthase (NAGS) deficiency' : \"N-acetylglutamate synthase deficiency\",\n", + " 'Non-small cell lung cancer' : \"Non-small cell lung cancer\",\n", + " 'Non-small cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n", + " 'Non-small-cell lung cancer' : \"Non-small cell lung cancer\",\n", + " 'Non-small-cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n", + " 'Pancreatic ductal adenocarcinoma' : \"Pancreatic ductal adenocarcinoma\",\n", + " 'Papillary Renal Cell Carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", + " 'Papillary renal cell carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", + " 'Papillary thyroid carcinoma' : \"Papillary thyroid carcinoma\",\n", + " 'Papillary thyroid carcinoma (PTC)' : \"Papillary thyroid carcinoma\",\n", + " \"Parkinson's Disease\" : \"Parkinson's Disease\",\n", + " \"Parkinson's disease\" : \"Parkinson's Disease\",\n", + " \"Parkinson's disease (PD)\" : \"Parkinson's Disease\",\n", + " 'Pituitary adenoma' : \"Pituitary Adenoma\",\n", + " 'Primary Aldosteronism' : \"Primary Aldosteronism\",\n", + " 'Primary aldosteronism' : \"Primary Aldosteronism\",\n", + " 'Prion disease' : \"Prion disease\",\n", + " 'Prion diseases' : \"Prion disease\",\n", + " 'Prostate cancer' : \"Prostate cancer\",\n", + " 'Renal cell cancer (RCC)' : \"Renal cell carcinoma\",\n", + " 'Renal cell carcinoma' : \"Renal cell carcinoma\",\n", + " 'Renal cell carcinoma (RCC)' : \"Renal cell carcinoma\",\n", + " 'Robinow syndrome' : \"Robinow syndrome\",\n", + " 'Sphingolipidoses' : \"Sphingolipidoses\",\n", + " 'Sphingolipidosis' : \"Sphingolipidoses\",\n", + " 'Spinocerebellar Ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 
'Spinocerebellar Ataxia Type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 14 (SCA14)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 15 (SCA15)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 3' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia Type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar Ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 19 (SCA19)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 19/22 (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 3' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", + " 'Spinocerebellar ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", + " 'Thyroid cancer' : \"Thyroid cancer\",\n", + " 'Thyroid dyshormonogenesis' : \"Thyroid dyshormonogenesis\",\n", + " 'Urothelial carcinoma' : \"Urothelial carcinoma\",\n", + " 'von Hippel-Lindau syndrome' : \"von Hippel-Lindau syndrome\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2451ebb1-a9d8-494c-9f7e-4f800cd158e8", + "metadata": {}, + "outputs": [], + "source": [ + 
"!mkdir final_data_fix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71719e5-5215-4559-a47d-dfc160779260", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "# CONFIG parameters\n", + "CONFIG = {\n", + " 'final_data_dir': 'final_data',\n", + " 'final_data_fix_dir': 'final_data_fix',\n", + " 'variant_range': (1, 1450)\n", + "}\n", + "\n", + "# Dummy new_disease mapping for demonstration\n", + "new_disease = {\n", + " \"disease_A\": \"new_disease_A\",\n", + " \"disease_B\": \"new_disease_B\"\n", + " # Add more mappings as needed\n", + "}\n", + "\n", + "# Standardize disease labels using the mapping dictionary\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Applying disease label standardization...\")\n", + "print(f\"Input: {final_data_dir}\")\n", + "print(f\"Output: {final_data_fix_dir}\")\n", + "\n", + "processed_count = 0\n", + "error_count = 0\n", + "\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if not os.path.exists(input_file):\n", + " continue\n", + " \n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + "\n", + " # Get original answer\n", + " temp = data.get('answer', '')\n", + " \n", + " # Apply standardization if mapping exists\n", + " if temp in new_disease:\n", + " data['answer'] = new_disease[temp]\n", + " else:\n", + " print(f\"[Warning] No mapping found for disease: {temp}\")\n", + " \n", + " # Save to standardized directory\n", + " output_file = f'{final_data_fix_dir}/KEGG_{i}_with_seqs.json'\n", + " with open(output_file, 'w') as out_file:\n", + " json.dump(data, out_file, indent=2)\n", + " \n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Standardized {processed_count} disease labels...\")\n", 
+ " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to process {input_file}: {str(e)}\")\n", + " error_count += 1\n", + "\n", + "print(f\"✅ Disease label standardization complete:\")\n", + "print(f\" Successfully processed: {processed_count}\")\n", + "print(f\" Errors encountered: {error_count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a12df3e-9ceb-4a51-acaf-e2931792a844", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove original final_data directory and replace with standardized version\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "\n", + "import shutil\n", + "import os\n", + "\n", + "if os.path.exists(final_data_dir):\n", + " shutil.rmtree(final_data_dir)\n", + " print(f\"Removed original directory: {final_data_dir}\")\n", + "else:\n", + " print(f\"Directory not found: {final_data_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbba2c19-08f6-4769-b38d-a64d8643e142", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from your_config_module import CONFIG # Adjust the import based on your project structure\n", + "\n", + "# Rename standardized directory to final_data\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "final_data_fix_dir = CONFIG['final_data_fix_dir']\n", + "\n", + "if os.path.exists(final_data_fix_dir):\n", + " os.rename(final_data_fix_dir, final_data_dir)\n", + " print(f\"Renamed {final_data_fix_dir} to {final_data_dir}\")\n", + " print(\"✅ Final dataset with standardized disease labels is ready\")\n", + "else:\n", + " print(f\"Directory not found: {final_data_fix_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c87a0df-09c8-4fb6-baca-21a9cdd65b85", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "# Assuming CONFIG is defined somewhere earlier in the code\n", + "# CONFIG = {\n", + "# 
'final_data_dir': 'path_to_final_data_dir',\n", + "# 'variant_range': (1, 1450)\n", + "# }\n", + "\n", + "# Verify standardized disease labels\n", + "final_data_dir = CONFIG['final_data_dir']\n", + "start_idx, end_idx = CONFIG['variant_range']\n", + "\n", + "print(\"Verifying standardized disease labels...\")\n", + "\n", + "disease = []\n", + "for i in range(start_idx, end_idx):\n", + " try:\n", + " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", + " if os.path.exists(input_file):\n", + " with open(input_file, 'r') as file:\n", + " data = json.load(file)\n", + " \n", + " if 'answer' in data:\n", + " disease.append(data['answer'])\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not verify {input_file}: {str(e)}\")\n", + "\n", + "print(f\"✅ Verification complete:\")\n", + "print(f\" Total disease labels: {len(disease)}\")\n", + "print(f\" Unique diseases: {len(set(disease))}\")\n", + "print(f\" Top 10 diseases: {list(set(disease))[:10]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "60f75d92-e2f2-495f-ba8f-cb423410f1f4", + "metadata": {}, + "source": [ + "# Saving the KEGG Task to the WangLab Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a069a67-b410-4adf-ab75-62eca67ab259", + "metadata": {}, + "outputs": [], + "source": [ + "cd ../../bioR_tasks" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10e9f0fb-4943-41bf-bef3-9fcd64796ddf", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir kegg_variant" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cced244e-9d03-47be-8fa1-864f2736fe01", + "metadata": {}, + "outputs": [], + "source": [ + "cp ../BioReason/data/kegg_data/final_data/* kegg_variant/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff9ce06-2cd8-4675-a23f-080027770bdb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "4c56a919", + "metadata": {}, + "source": 
[ + "# Creating the Nt Variant Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c28bc9f", + "metadata": {}, + "outputs": [], + "source": [ + "cd kegg_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7618faf2", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import pandas as pd\n", + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Optional: Uncomment if you want to use HuggingFace datasets\n", + "# from datasets import load_dataset, Dataset, DatasetDict\n", + "\n", + "print(\"Imports loaded for nucleotide database creation\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b8cac05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", + "

1449 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " Var_ID Network Entry Source ID TranscriptID \\\n", + "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", + "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", + "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", + "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", + "... ... ... ... ... ... ... \n", + "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", + "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", + "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", + "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", + "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", + "\n", + " NucChange Chr Start End ... \\\n", + "0 NaN 12 57751646 57751646 ... \n", + "1 NaN 12 57751646 57751646 ... \n", + "2 NaN 12 57751646 57751646 ... \n", + "3 NaN 12 57751647 57751647 ... \n", + "4 NaN 12 57751647 57751647 ... \n", + "... ... ... ... ... ... \n", + "1444 c.706G>T 19 10492196 10492196 ... \n", + "1445 c.548A>G 19 10499486 10499486 ... \n", + "1446 c.662A>G 16 68808823 68808823 ... \n", + "1447 c.755T>G 16 68810264 68810264 ... \n", + "1448 c.769G>A 16 68810278 68810278 ... \n", + "\n", + " Network Expanded \\\n", + "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", + "... ... \n", + "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", + "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", + "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", + "\n", + " Pathway \\\n", + "0 {'hsa05218': 'Melanoma'} \n", + "1 {'hsa05218': 'Melanoma'} \n", + "2 {'hsa05218': 'Melanoma'} \n", + "3 {'hsa05218': 'Melanoma'} \n", + "4 {'hsa05218': 'Melanoma'} \n", + "... ... \n", + "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", + "1446 {'hsa05226': 'Gastric cancer'} \n", + "1447 {'hsa05226': 'Gastric cancer'} \n", + "1448 {'hsa05226': 'Gastric cancer'} \n", + "\n", + " Class \\\n", + "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", + "... ... \n", + "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", + "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", + "\n", + " Disease \\\n", + "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", + "... ... \n", + "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", + "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", + "\n", + " Gene Variant_Name \\\n", + "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", + "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", + "... ... ... \n", + "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", + "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", + "\n", + " Variant_Gene Variant_Gene Info \\\n", + "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", + "... ... ... \n", + "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", + "1446 CDH1 cadherin 1 [KO:K05689] \n", + "1447 CDH1 cadherin 1 [KO:K05689] \n", + "1448 CDH1 cadherin 1 [KO:K05689] \n", + "\n", + " Variant_Type Disease_Names \n", + "0 NaN {'H00038': 'Melanoma'} \n", + "1 NaN {'H00038': 'Melanoma'} \n", + "2 NaN {'H00038': 'Melanoma'} \n", + "3 NaN {'H00038': 'Melanoma'} \n", + "4 NaN {'H00038': 'Melanoma'} \n", + "... ... ... 
\n", + "1444 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1445 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", + "1446 NaN {'H00018': 'Gastric cancer'} \n", + "1447 NaN {'H00018': 'Gastric cancer'} \n", + "1448 NaN {'H00018': 'Gastric cancer'} \n", + "\n", + "[1449 rows x 24 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load variant data for nucleotide database creation\n", + "network_file = CONFIG['network_data_file']\n", + "variant_data = pd.read_csv(network_file, sep='\\t')\n", + "print(f\"✅ Loaded variant data: {len(variant_data)} entries\")\n", + "variant_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a7d31451", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1449" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(variant_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc9baca9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'N00073'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variant_data.iloc[1][\"Network\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928146a6", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "import os\n", + "\n", + "# Load reference genome sequences\n", + "fasta_file = CONFIG['reference_fasta']\n", + "if not os.path.exists(fasta_file):\n", + " print(f\"❌ Reference genome file not found: {fasta_file}\")\n", + " print(\"Please update CONFIG['reference_fasta'] with correct path\")\n", + " raise FileNotFoundError(f\"Reference genome not found: {fasta_file}\")\n", + "\n", + "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))\n", + "print(f\"✅ Loaded reference genome: {len(record_dict)} sequences\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "e3184e72", + "metadata": {}, + "outputs": [], + "source": [ + "# Use chromosome dictionary from configuration\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "print(f\"✅ Chromosome mapping loaded: {len(chromosome_dictionary)} chromosomes\")\n", + "print(\"Available chromosomes:\", list(chromosome_dictionary.keys()))" + ] + }, + { + "cell_type": "markdown", + "id": "1cd34cc2", + "metadata": {}, + "source": [ + "### Verification that the reference is present at the exact position I have in my data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70cc6625", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify reference sequences (alternative implementation)\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "verification_file = \"verification_alt.txt\"\n", + "\n", + "print(f\"Starting alternative sequence verification...\")\n", + "print(f\"Results will be saved to: {verification_file}\")\n", + "\n", + "with open(verification_file, \"w\") as f:\n", + " for i in range(len(variant_data)):\n", + " try:\n", + " # ---- Input ----\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + " end = len(reference_allele) + start\n", + "\n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Adjust for 0-based indexing in Python\n", + " genomic_ref = chrom_seq[start: start + len(reference_allele)]\n", + "\n", + " if genomic_ref.upper() != reference_allele.upper():\n", + " f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n", + " else:\n", + " f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → 
'{reference_allele}' matches genome\\n\")\n", + " \n", + " except Exception as e:\n", + " f.write(f\"❌ Error verifying variant {i}: {str(e)}\\n\")\n", + " \n", + " if (i + 1) % 200 == 0:\n", + " print(f\"Verified {i + 1}/{len(variant_data)} variants...\")\n", + "\n", + "print(f\"✅ Alternative verification complete. Results: {verification_file}\")" + ] + }, + { + "cell_type": "markdown", + "id": "83c0dcce-81b3-4162-a683-3ba86d065eb7", + "metadata": {}, + "source": [ + "## Read in Final_data JSON files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9745a67d-3b2a-4679-92c3-92fc199a8763", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswerreference_sequencevariant_sequencereasoning.reasoning_stepsIDtemp_ID
0Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is an insertion in the PR...KEGG_854854
1Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is a deletion of 47 nucle...KEGG_841841
2Chromosome Number: 21\\nNetwork Definition of t...Alzheimer's diseaseGCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...[Step 1: The TC>GA mutation in the APP gene on...KEGG_468468
3Chromosome Number: 1\\nNetwork Definition of th...Primary AldosteronismAATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...[Step 1: The variant KEGG_635 is a 15-nucleoti...KEGG_635635
4Chromosome Number: 14\\nNetwork Definition of t...Spinocerebellar AtaxiaTCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...[Step 1: The variant is a trinucleotide repeat...KEGG_620620
........................
1444Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiagaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...[Step 1: The variant KEGG_286 is an A>G substi...KEGG_286286
1445Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiaTTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...[Step 1: The variant is a single cytosine (C) ...KEGG_293293
1446Chromosome Number: 12\\nNetwork Definition of t...Pituitary AdenomaGTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...[Step 1: The variant is a 20-nucleotide duplic...KEGG_77
1447Chromosome Number: 11\\nNetwork Definition of t...Spinocerebellar AtaxiaATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...[Step 1: The variant KEGG_1285 is an A>G subst...KEGG_12851285
1448Chromosome Number: 7\\nNetwork Definition of th...MelanomatataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...[Step 1: The variant involves a nucleotide cha...KEGG_12901290
\n", + "

1449 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 Chromosome Number: 20\\nNetwork Definition of t... \n", + "1 Chromosome Number: 20\\nNetwork Definition of t... \n", + "2 Chromosome Number: 21\\nNetwork Definition of t... \n", + "3 Chromosome Number: 1\\nNetwork Definition of th... \n", + "4 Chromosome Number: 14\\nNetwork Definition of t... \n", + "... ... \n", + "1444 Chromosome Number: 6\\nNetwork Definition of th... \n", + "1445 Chromosome Number: 6\\nNetwork Definition of th... \n", + "1446 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1447 Chromosome Number: 11\\nNetwork Definition of t... \n", + "1448 Chromosome Number: 7\\nNetwork Definition of th... \n", + "\n", + " answer \\\n", + "0 Creutzfeldt-Jakob Disease \n", + "1 Creutzfeldt-Jakob Disease \n", + "2 Alzheimer's disease \n", + "3 Primary Aldosteronism \n", + "4 Spinocerebellar Ataxia \n", + "... ... \n", + "1444 Spinocerebellar Ataxia \n", + "1445 Spinocerebellar Ataxia \n", + "1446 Pituitary Adenoma \n", + "1447 Spinocerebellar Ataxia \n", + "1448 Melanoma \n", + "\n", + " reference_sequence \\\n", + "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", + "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... \n", + "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", + "... ... \n", + "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", + "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", + "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", + "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", + "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", + "\n", + " variant_sequence \\\n", + "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", + "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", + "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... 
\n", + "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", + "... ... \n", + "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", + "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", + "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", + "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", + "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", + "\n", + " reasoning.reasoning_steps ID temp_ID \n", + "0 [Step 1: The variant is an insertion in the PR... KEGG_854 854 \n", + "1 [Step 1: The variant is a deletion of 47 nucle... KEGG_841 841 \n", + "2 [Step 1: The TC>GA mutation in the APP gene on... KEGG_468 468 \n", + "3 [Step 1: The variant KEGG_635 is a 15-nucleoti... KEGG_635 635 \n", + "4 [Step 1: The variant is a trinucleotide repeat... KEGG_620 620 \n", + "... ... ... ... \n", + "1444 [Step 1: The variant KEGG_286 is an A>G substi... KEGG_286 286 \n", + "1445 [Step 1: The variant is a single cytosine (C) ... KEGG_293 293 \n", + "1446 [Step 1: The variant is a 20-nucleotide duplic... KEGG_7 7 \n", + "1447 [Step 1: The variant KEGG_1285 is an A>G subst... KEGG_1285 1285 \n", + "1448 [Step 1: The variant involves a nucleotide cha... 
KEGG_1290 1290 \n", + "\n", + "[1449 rows x 7 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import os\n", + "import json\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "# Read final dataset JSON files and create combined DataFrame\n", + "\n", + "# Path to the directory containing JSON files\n", + "json_dir = CONFIG['final_data_dir']\n", + "if not os.path.exists(json_dir):\n", + " print(f\"❌ JSON directory not found: {json_dir}\")\n", + " print(\"Please ensure previous processing steps completed successfully\")\n", + " raise FileNotFoundError(f\"Directory not found: {json_dir}\")\n", + "\n", + "print(f\"Processing JSON files from: {json_dir}\")\n", + "\n", + "# Initialize a list to hold DataFrames\n", + "df_list = []\n", + "processed_count = 0\n", + "\n", + "# Loop through all files in the directory\n", + "for filename in os.listdir(json_dir):\n", + " if filename.endswith(\".json\"):\n", + " match = re.search(r\"(KEGG_\\d+)_with_seqs\", filename)\n", + " if match:\n", + " kegg_id = match.group(1) # Extract 'KEGG_'\n", + " file_path = os.path.join(json_dir, filename)\n", + " \n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " data = json.load(f)\n", + " \n", + " df = pd.json_normalize(data)\n", + " df['ID'] = kegg_id # Add the full KEGG ID string\n", + " df['temp_ID'] = int(kegg_id[5:]) # Extract numeric ID for sorting\n", + " df_list.append(df)\n", + " processed_count += 1\n", + " \n", + " if processed_count % 100 == 0:\n", + " print(f\"Processed {processed_count} JSON files...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Warning] Could not process {filename}: {str(e)}\")\n", + "\n", + "# Concatenate all DataFrames into one\n", + "if df_list:\n", + " combined_df = pd.concat(df_list, ignore_index=True)\n", + " print(f\"✅ Combined {len(df_list)} JSON files into DataFrame\")\n", + " print(f\"Total samples: 
{len(combined_df)}\")\n", + "else:\n", + " print(\"❌ No JSON files found or processed successfully\")\n", + " combined_df = pd.DataFrame()\n", + "\n", + "# Display the result\n", + "combined_df.head() if not combined_df.empty else print(\"No data to display\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a81e8836-9618-4e62-b192-ee397a063ce7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "46c1083a-d499-428e-9180-2b62e83f1751", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = combined_df.sort_values(by=['temp_ID'])\n", + "combined_df = combined_df.rename(columns={\"reasoning.reasoning_steps\" : \"reasoning\"})\n", + "combined_df = combined_df.drop(columns=['temp_ID'])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "1c3e8a2e-444e-4d48-b4c1-c8b5dea5753e", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = combined_df[['ID','question','answer','reference_sequence','variant_sequence','reasoning']]\n", + "combined_df = combined_df.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4200c786-4365-407e-96d4-f5cabfc7b3b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagagctgagatcatgccactgcactccaacctgggcaacagagcgag...gagctgagatcatgccactgcactccaacctgggcaacagagcgag...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomaTGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerCAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", + "

1449 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID question \\\n", + "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", + "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", + "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", + "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", + "... ... ... \n", + "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", + "\n", + " answer \\\n", + "0 Melanoma \n", + "1 Melanoma \n", + "2 Melanoma \n", + "3 Melanoma \n", + "4 Melanoma \n", + "... ... \n", + "1444 Hepatocellular carcinoma \n", + "1445 Hepatocellular carcinoma \n", + "1446 Gastric cancer \n", + "1447 Gastric cancer \n", + "1448 Gastric cancer \n", + "\n", + " reference_sequence \\\n", + "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "... ... \n", + "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", + "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", + "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", + "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", + "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", + "\n", + " variant_sequence \\\n", + "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", + "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... 
\n", + "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", + "... ... \n", + "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", + "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", + "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", + "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", + "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", + "\n", + " reasoning \n", + "0 [Step 1: The C>T mutation at position 57751646... \n", + "1 [Step 1: The C>A mutation at position 57751646... \n", + "2 [Step 1: The C>G mutation at position 57751646... \n", + "3 [Step 1: The G>A mutation at position 57751647... \n", + "4 [Step 1: The G>C mutation at position 57751647... \n", + "... ... \n", + "1444 [Step 1: The variant is a C>A substitution at ... \n", + "1445 [Step 1: The variant is a T>C substitution at ... \n", + "1446 [Step 1: The variant KEGG_1447 represents an A... \n", + "1447 [Step 1: The variant KEGG_1448 is a T>G substi... \n", + "1448 [Step 1: The variant KEGG_1449 is a G>A substi... 
\n", + "\n", + "[1449 rows x 6 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "markdown", + "id": "f5cd7e22", + "metadata": {}, + "source": [ + "### Performing the mutation and saving the reference and variant allele with a 1000 nt window" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "8c89d455-598d-45e3-821b-6e37075b3a77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4001" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(combined_df.iloc[0]['reference_sequence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a1dd3ed8-18ca-4468-9ab9-98ebf4713260", + "metadata": {}, + "outputs": [], + "source": [ + "KEGG_2000 = combined_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "688a7d0b-4a31-484d-9835-eb66d674b5de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'KEGG_2'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "KEGG_2000.at[1,'ID']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fc35c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate sequences with updated window size\n", + "chromosome_dictionary = CONFIG['chromosome_dictionary']\n", + "window = CONFIG['sequence_window']\n", + "\n", + "print(f\"Generating sequences with {window}bp windows...\")\n", + "KEGG_2000 = combined_df.copy()\n", + "\n", + "for i in range(len(KEGG_2000)):\n", + " try:\n", + " chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n", + " if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n", + " start = variant_data.iloc[i]['Start'] - 1\n", + " else:\n", + " start = variant_data.iloc[i]['Start']\n", + " reference_allele = variant_data.iloc[i]['RefAllele']\n", + 
" variant_allele = variant_data.iloc[i]['AltAllele']\n", + "\n", + " end = len(reference_allele) + start\n", + " \n", + " chrom_seq = record_dict[chromosome_id].seq\n", + "\n", + " # Extract region\n", + " region_start = max(0, start - window)\n", + " region_end = end + window\n", + "\n", + " ref_seq = chrom_seq[region_start:region_end]\n", + "\n", + " if (variant_allele == \"deletion\"):\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + ref_seq[window + len(reference_allele):]\n", + "\n", + " KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n", + " KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n", + " \n", + " else:\n", + " del_len = len(reference_allele)\n", + " # Apply mutation\n", + " mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + del_len:]\n", + "\n", + " KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n", + " KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n", + " \n", + " if (i + 1) % 100 == 0:\n", + " print(f\"Generated sequences for {i + 1}/{len(KEGG_2000)} variants...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"[Error] Failed to generate sequence for variant {i}: {str(e)}\")\n", + "\n", + "print(f\"✅ Sequence generation complete for {window}bp windows\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "e2a50c08-ccae-45ca-98e1-0c3d3e7d4647", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagcactccagcctgggcaacagagcaagagagacagggtcttactct...gcactccagcctgggcaacagagcaagagagacagggtcttactct...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomactcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertttgagatagggtttcactctgtcacccaggctggaaccacaacct...tttgagatagggtttcactctgtcacccaggctggaaccacaacct...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertcactctgtcacccaggctggaaccacaacctccacttcccgggtt...tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", + "

1449 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " ID question \\\n", + "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", + "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", + "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", + "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", + "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", + "... ... ... \n", + "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", + "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", + "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", + "\n", + " answer \\\n", + "0 Melanoma \n", + "1 Melanoma \n", + "2 Melanoma \n", + "3 Melanoma \n", + "4 Melanoma \n", + "... ... \n", + "1444 Hepatocellular carcinoma \n", + "1445 Hepatocellular carcinoma \n", + "1446 Gastric cancer \n", + "1447 Gastric cancer \n", + "1448 Gastric cancer \n", + "\n", + " reference_sequence \\\n", + "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "... ... \n", + "1444 gcactccagcctgggcaacagagcaagagagacagggtcttactct... \n", + "1445 ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... \n", + "1446 ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... \n", + "1447 tttgagatagggtttcactctgtcacccaggctggaaccacaacct... \n", + "1448 tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... \n", + "\n", + " variant_sequence \\\n", + "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", + "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... 
\n", + "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", + "... ... \n", + "1444 gcactccagcctgggcaacagagcaagagagacagggtcttactct... \n", + "1445 ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... \n", + "1446 ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... \n", + "1447 tttgagatagggtttcactctgtcacccaggctggaaccacaacct... \n", + "1448 tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... \n", + "\n", + " reasoning \n", + "0 [Step 1: The C>T mutation at position 57751646... \n", + "1 [Step 1: The C>A mutation at position 57751646... \n", + "2 [Step 1: The C>G mutation at position 57751646... \n", + "3 [Step 1: The G>A mutation at position 57751647... \n", + "4 [Step 1: The G>C mutation at position 57751647... \n", + "... ... \n", + "1444 [Step 1: The variant is a C>A substitution at ... \n", + "1445 [Step 1: The variant is a T>C substitution at ... \n", + "1446 [Step 1: The variant KEGG_1447 represents an A... \n", + "1447 [Step 1: The variant KEGG_1448 is a T>G substi... \n", + "1448 [Step 1: The variant KEGG_1449 is a G>A substi... 
\n", + "\n", + "[1449 rows x 6 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "KEGG_2000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c939b5-0768-4565-873a-10cba7396d99", + "metadata": {}, + "outputs": [], + "source": [ + "# Create dataset structure (HuggingFace datasets optional)\n", + "try:\n", + " from datasets import Dataset, DatasetDict\n", + " \n", + " # Create Hugging Face Datasets\n", + " train_dataset = Dataset.from_pandas(KEGG_2000)\n", + " \n", + " # Combine into a DatasetDict\n", + " dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " })\n", + " \n", + " print(\"✅ HuggingFace dataset created\")\n", + " use_hf_datasets = True\n", + " \n", + "except ImportError:\n", + " print(\"⚠️ HuggingFace datasets not available, using pandas only\")\n", + " dataset = KEGG_2000\n", + " train_dataset = KEGG_2000\n", + " use_hf_datasets = False\n", + "\n", + "print(f\"Final dataset contains {len(train_dataset)} samples\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "afa07e17-e86a-41d8-9db3-5df6d77443f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning'],\n", + " num_rows: 1449\n", + " })\n", + "})" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "affe2720-e748-45d2-97d0-0baf1d6530ce", + "metadata": {}, + "outputs": [], + "source": [ + "# Save final dataset locally instead of uploading to HuggingFace\n", + "# Users can upload to their own repositories if needed\n", + "\n", + "output_file = \"kegg_variant_dataset_final.parquet\"\n", + "dataset_info_file = \"dataset_info.json\"\n", + "\n", + "# Save dataset as Parquet for efficient 
storage\n", + "train_dataset.to_parquet(output_file)\n", + "print(f\"✅ Dataset saved to: {output_file}\")\n", + "\n", + "# Save dataset information\n", + "dataset_info = {\n", + " \"name\": \"KEGG Variant Dataset\",\n", + " \"description\": \"Genetic variants with biological reasoning for disease association\",\n", + " \"total_samples\": len(train_dataset),\n", + " \"sequence_length\": f\"~{CONFIG['sequence_window']*2}bp\",\n", + " \"features\": list(train_dataset.column_names),\n", + " \"diseases\": len(set(disease)) if 'disease' in locals() else \"Unknown\",\n", + " \"created_by\": \"KEGG Data Processing Pipeline\",\n", + " \"version\": \"1.0\"\n", + "}\n", + "\n", + "with open(dataset_info_file, 'w') as f:\n", + " json.dump(dataset_info, f, indent=2)\n", + " \n", + "print(f\"✅ Dataset information saved to: {dataset_info_file}\")\n", + "print(f\"\\nDataset ready for use:\")\n", + "print(f\" - Main dataset: {output_file}\")\n", + "print(f\" - Information: {dataset_info_file}\")\n", + "print(f\" - Samples: {len(train_dataset)}\")\n", + "print(f\" - Features: {train_dataset.column_names}\")\n", + "\n", + "print(\"\\n📝 To upload to HuggingFace Hub:\")\n", + "print(\"dataset.push_to_hub('your-username/your-dataset-name')\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b448bd7-e256-4fad-ae95-dbe299d380f0", + "metadata": {}, + "source": [ + "# KEGG Dataset with Alternative Window Size\n", + "\n", + "This section demonstrates creating the dataset with different sequence window parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd609ca-6276-4425-997f-0589fe03f1ea", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason/data/README.md b/BioReason/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..445e3be4a438416ed04b04f347a52c6b65a77d88 --- /dev/null +++ b/BioReason/data/README.md @@ -0,0 +1,35 @@ +# BioReasoning Data Curation + +Jupyter notebooks for processing genetic variant data and creating ML datasets for biological reasoning tasks. + +## Notebooks + +**Core Analysis** +- `BioReasoning_DataCuration_KEGG.ipynb` - KEGG pathway analysis with Claude API +- `Clinvar_Coding.ipynb` - ClinVar variant processing and gene mapping +- `Clinvar_SNV_Non_SNV.ipynb` - SNV/structural variant datasets with VEP annotations + +**KEGG Pipeline** +- `KEGG_Data_1.ipynb` - KEGG network data processing and variant identification +- `KEGG_Data_2.ipynb` - Variant parsing and sequence generation +- `KEGG_Data_3.ipynb` - Final ML dataset creation with Q&A pairs + +**Variant Prediction** +- `VEP.ipynb` - Variant effect prediction datasets (ClinVar, OMIM, eQTL) + +## Setup + +```bash +brew install brewsci/bio/edirect # For ClinVar (macOS) +export ANTHROPIC_API_KEY="your-key" # For KEGG analysis +``` + +## Usage + +Each notebook has a configuration section - update paths/keys as needed, then run sequentially. 
+ +**Key Outputs:** +- KEGG biological reasoning datasets +- ClinVar variant-disease associations +- VEP prediction task datasets +- Genomic sequences with variant context diff --git a/BioReason/data/VEP.ipynb b/BioReason/data/VEP.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..85faa35e74a38547e95d570372d61935870aa3a2 --- /dev/null +++ b/BioReason/data/VEP.ipynb @@ -0,0 +1,2749 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0510f375", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set up parameters and data sources for variant effect prediction tasks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d59a5d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these parameters for your environment\n", + "import os\n", + "from pathlib import Path\n", + "import random\n", + "\n", + "# Set random seed for reproducible question assignment\n", + "RANDOM_SEED = 42\n", + "random.seed(RANDOM_SEED)\n", + "\n", + "# Configuration parameters\n", + "CONFIG = {\n", + " # Data source configurations\n", + " 'huggingface_repo': 'wanglab/bioR_tasks', # Update with your repository\n", + " \n", + " # Local data paths (update these if using local files)\n", + " 'local_data_dir': 'data',\n", + " \n", + " # Output configurations\n", + " 'output_dir': 'output_datasets',\n", + " 'save_local': True, # Save datasets locally\n", + " 'upload_to_hub': False, # Set to True to upload to HuggingFace Hub\n", + " \n", + " # Processing parameters\n", + " 'question_variants': 50, # Number of question templates per task\n", + " 'batch_size': 1000, # For memory-efficient processing\n", + " \n", + " # Task configurations\n", + " 'tasks': {\n", + " 'task1': {'name': 'variant_effect_coding', 'description': 'Pathogenic vs Benign classification'},\n", + " 'task2': {'name': 'variant_effect_causal_eqtl', 'description': 'Gene expression change prediction'},\n", + " 'task3': {'name': 
'variant_effect_pathogenic_omim', 'description': 'OMIM pathogenic classification'},\n", + " 'task4_snv': {'name': 'task4_variant_effect_snv', 'description': 'SNV effect prediction'},\n", + " 'task4_non_snv': {'name': 'task4_variant_effect_non_snv', 'description': 'Non-SNV effect prediction'}\n", + " }\n", + "}\n", + "\n", + "# Create output directory\n", + "Path(CONFIG['output_dir']).mkdir(exist_ok=True)\n", + "\n", + "print(\"Configuration loaded:\")\n", + "print(f\" Random seed: {RANDOM_SEED}\")\n", + "print(f\" Output directory: {CONFIG['output_dir']}\")\n", + "print(f\" Upload to hub: {CONFIG['upload_to_hub']}\")\n", + "print(f\" Repository: {CONFIG['huggingface_repo']}\")\n", + "print(\"\\n📝 Update CONFIG dictionary above with your specific settings\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4a1e6bc-e3e6-4084-a42e-ba988c3afa4a", + "metadata": {}, + "source": [ + "# Variant Effect Prediction Tasks - Dataset Creation Pipeline\n", + "\n", + "## Overview\n", + "\n", + "This notebook creates standardized datasets for variant effect prediction tasks using various genomic databases. It processes raw variant data into machine learning-ready formats with contextualized questions and standardized answers.\n", + "\n", + "## What This Notebook Does\n", + "\n", + "1. **Task 1**: Variant Effect Prediction (Pathogenic vs Benign) using ClinVar data\n", + "2. **Task 2**: Causal eQTL Analysis (Gene Expression Changes) \n", + "3. **Task 3**: Pathogenic Variant Classification using OMIM data\n", + "4. 
**Task 4**: SNV and Non-SNV Variant Effect Prediction\n", + "\n", + "## Key Features\n", + "\n", + "- **Question Diversification**: 50+ unique question templates per task type\n", + "- **Standardized Format**: Consistent ID, question, answer, sequence structure\n", + "- **Multiple Data Sources**: ClinVar, OMIM, eQTL databases\n", + "- **Publication-Ready**: Clean, documented datasets ready for research use\n", + "\n", + "## Dataset Structure\n", + "\n", + "Each task generates datasets with the following fields:\n", + "- `ID`: Unique identifier for each variant\n", + "- `question`: Contextualized biological question\n", + "- `answer`: Standardized response (pathogenic/benign, disease name, etc.)\n", + "- `reference_sequence`: Original genomic sequence\n", + "- `variant_sequence`: Mutated genomic sequence\n", + "\n", + "## Prerequisites\n", + "\n", + "```bash\n", + "pip install datasets pandas numpy\n", + "```\n", + "\n", + "## Usage\n", + "\n", + "1. **Configure Data Sources**: Update file paths and dataset configurations\n", + "2. **Run Tasks Sequentially**: Execute each task section in order\n", + "3. **Review Outputs**: Validate generated datasets before publication\n", + "4. 
**Export**: Datasets are saved locally and optionally uploaded to repositories\n", + "\n", + "## Important Notes\n", + "\n", + "- **Data Privacy**: All personal references have been removed\n", + "- **Reproducibility**: Random seeds should be set for consistent question assignment\n", + "- **Memory Usage**: Large datasets may require substantial RAM\n", + "- **File Paths**: Update all hardcoded paths to use relative or configurable paths\n", + "\n", + "## Output\n", + "\n", + "Generated datasets are suitable for:\n", + "- Variant effect prediction model training\n", + "- Biological reasoning benchmarks\n", + "- Genomic language model evaluation\n", + "- Clinical variant interpretation research" + ] + }, + { + "cell_type": "markdown", + "id": "67ff57a4-00e0-41a4-aac1-18f4e1c68be8", + "metadata": {}, + "source": [ + "## Task 1: variant effect prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "73559f7e-ade8-4d84-84d4-859fbbd0c575", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import pandas as pd\n", + "import json\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2ffcc6-5e33-4cc4-a46d-2c8f67bd20a6", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\"wanglab/bioR_tasks\", 'variant_effect_pathogenic_clinvar')\n", + "\n", + "## Task 1: Variant Effect Prediction (Pathogenic vs Benign)\n", + "\n", + "**Objective**: Classify genetic variants as pathogenic or benign based on chromosomal location and gene context.\n", + "\n", + "**Data Source**: ClinVar database with pathogenic variant annotations\n", + "\n", + "**Question Types**: 50 different question templates incorporating:\n", + "- Chromosome location\n", + "- Gene information (when available)\n", + "- Clinical significance assessment\n", + "\n", + "**Output Format**: Binary classification with disease association when applicable" + ] + }, + { + "cell_type": "markdown", + 
"id": "e31d1d0b-5f13-468c-a2cf-704499214a04", + "metadata": {}, + "source": [ + "**With GPT4o, I created 50 different versions of this question and prompt**\n", + "\n", + "# Core imports for dataset processing\n", + "from datasets import load_dataset, Dataset, DatasetDict\n", + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import random\n", + "from pathlib import Path\n", + "\n", + "# Set random seed for reproducibility\n", + "random.seed(CONFIG.get('random_seed', 42))\n", + "\n", + "print(\"✅ Core libraries imported\")\n", + "print(f\"Random seed set to: {CONFIG.get('random_seed', 42)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3301da8-bed2-4c0f-835c-6613df88dd11", + "metadata": {}, + "outputs": [], + "source": [ + "# Must call format in order of chromosome, gene, gene_name\n", + "question_variants_50 = [\"This variant lies on Chromosome {0} and affects the gene {1} ({2}). Based on this context, is the mutation pathogenic or benign? If pathogenic, what disease does it cause?\",\n", + "\"Located on Chromosome {0}, this mutation impacts {1} ({2}). What is its biological consequence — is it benign or pathogenic, and which disease is associated if any?\",\n", + "\"A genetic alteration is present in {1} ({2}) on Chromosome {0}. Is this variant benign or disease-causing, and if the latter, which condition is involved?\",\n", + "\"This variant affects the gene {1} ({2}) found on Chromosome {0}. What is the clinical effect of this variant — benign or pathogenic? State the disease if applicable.\",\n", + "\"With a mutation on Chromosome {0} in gene {1} ({2}), classify this variant as benign or pathogenic. Include the disease if it's pathogenic.\",\n", + "\"This sequence change occurs on Chromosome {0}, altering {1} ({2}). What is the medical significance of this variant — is it benign or linked to a disease?\",\n", + "\"Here is a variant affecting {1} ({2}) on Chromosome {0}. 
Please identify whether it is a benign mutation or associated with a disorder.\",\n", + "\"A variant on Chromosome {0} in gene {1} ({2}) has been observed. Is this a neutral mutation, or does it result in a disease? If so, which one?\",\n", + "\"The gene {1} ({2}) on Chromosome {0} contains a mutation. Based on this information, is the variant pathogenic or benign? Provide the disease if relevant.\",\n", + "\"This genomic variant is located on Chromosome {0}, within the {1} ({2}) gene. Can you determine its pathogenicity and name any linked disease?\",\n", + "\"A mutation found in {1} ({2}) on Chromosome {0} may be clinically relevant. Is it pathogenic or benign, and if the former, which disease is implicated?\",\n", + "\"Given a variant located on Chromosome {0} and affecting {1} ({2}), assess whether it is benign or pathogenic. Indicate the associated disease if pathogenic.\",\n", + "\"This mutation is located in gene {1} ({2}) on Chromosome {0}. Is it associated with a disease or is it a benign polymorphism?\",\n", + "\"A variant has been detected on Chromosome {0} in {1} ({2}). What is its effect — pathogenic or benign? If pathogenic, name the disease.\",\n", + "\"The variant affects gene {1} ({2}), which is on Chromosome {0}. Please evaluate whether this mutation is benign or pathogenic and specify the disease if necessary.\",\n", + "\"This alteration in {1} ({2}) on Chromosome {0} may affect gene function. Does it lead to a disease or is it benign?\",\n", + "\"Given this variant in gene {1} ({2}) on Chromosome {0}, classify it as benign or pathogenic. Include the disorder it may cause if applicable.\",\n", + "\"A variant was discovered on Chromosome {0}, affecting {1} ({2}). What is its functional impact — neutral or pathogenic? State the disease if pathogenic.\",\n", + "\"This gene mutation involves {1} ({2}) on Chromosome {0}. 
Is it associated with any clinical condition, or is it benign?\",\n", + "\"The gene {1} ({2}) on Chromosome {0} carries this variant. Does this mutation lead to a specific disease, or is it non-pathogenic?\",\n", + "\"Here is a mutation in {1} ({2}) on Chromosome {0}. Determine whether it’s benign or pathogenic. If the latter, what disease does it cause?\",\n", + "\"A variant found in Chromosome {0} affects {1} ({2}). Please analyze its biological impact: is it benign or pathogenic, and what condition might it cause?\",\n", + "\"The following genetic variant occurs in {1} ({2}) on Chromosome {0}. Classify its clinical effect — pathogenic or benign — and list any associated condition.\",\n", + "\"This alteration occurs within gene {1} ({2}) located on Chromosome {0}. Is it associated with a disease or is it a benign variant?\",\n", + "\"A mutation on Chromosome {0} affecting {1} ({2}) has been found. Is it harmful or harmless? What disease, if any, does it cause?\",\n", + "\"Gene {1} ({2}) on Chromosome {0} is impacted by this variant. Evaluate whether it is clinically benign or pathogenic and name the disorder if relevant.\",\n", + "\"Consider this mutation in {1} ({2}) on Chromosome {0}. Is this a benign change or a disease-causing variant?\",\n", + "\"A variant was discovered in gene {1} ({2}), Chromosome {0}. Please indicate if this mutation results in a known disease or if it's non-harmful.\",\n", + "\"Given this context: Chromosome {0}, gene {1} ({2}) — does this variant present pathogenic behavior, and if so, what disease does it relate to?\",\n", + "\"This sequence variant lies in {1} ({2}) on Chromosome {0}. Is it clinically significant, and what condition might it cause if any?\",\n", + "\"A mutation in {1} ({2}), located on Chromosome {0}, is being studied. Determine whether it’s pathogenic or benign, and specify the linked disease.\",\n", + "\"Here is a genetic alteration in {1} ({2}) on Chromosome {0}. 
Based on the data, is it a benign variant or a cause of disease?\",\n", + "\"Mutation context: Chromosome {0}, Gene {1} ({2}). Determine if this variant is likely to be benign or pathogenic. Mention the disease if applicable.\",\n", + "\"A sequence alteration has been identified in {1} ({2}) on Chromosome {0}. Is it disease-inducing or harmless?\",\n", + "\"Chromosome {0} houses a mutation in gene {1} ({2}). Classify its clinical impact — is it pathogenic or benign, and what disease does it lead to if any?\",\n", + "\"This variant affects gene {1} ({2}) located on Chromosome {0}. Evaluate its biological effect and specify any disease association.\",\n", + "\"Gene {1} ({2}) on Chromosome {0} is altered by this variant. Does this mutation result in a disease or is it benign?\",\n", + "\"Assess the clinical impact of this variant on gene {1} ({2}), found on Chromosome {0}. State whether it’s pathogenic or benign, and the disease if applicable.\",\n", + "\"This is a variant in {1} ({2}), located on Chromosome {0}. Is this mutation a likely cause of disease or not?\",\n", + "\"A change on Chromosome {0} affects gene {1} ({2}). Identify whether the variant is neutral or disease-linked. Mention the disease if applicable.\",\n", + "\"This variant impacts the gene {1} ({2}) on Chromosome {0}. Is the change likely to result in a pathogenic outcome?\",\n", + "\"The gene {1} ({2}) is located on Chromosome {0}, where a mutation has occurred. What is the medical relevance of this mutation?\",\n", + "\"A variant affecting Chromosome {0}, within the gene {1} ({2}), has been observed. Determine if it's benign or associated with disease.\",\n", + "\"This mutation occurs in {1} ({2}) on Chromosome {0}. Does this change lead to a known medical condition, or is it benign?\",\n", + "\"Gene {1} ({2}), found on Chromosome {0}, is impacted by this variant. What is the biological outcome — benign or pathogenic?\",\n", + "\"Consider a variant on Chromosome {0} in gene {1} ({2}). 
Determine its clinical classification and disease relevance.\",\n", + "\"An alteration has been detected in {1} ({2}) on Chromosome {0}. Is it pathogenic, and if so, what disease is involved?\",\n", + "\"The gene {1} ({2}), on Chromosome {0}, contains a mutation. Does this mutation cause a disorder, or is it a benign change?\",\n", + "\"Here’s a variant in {1} ({2}) located on Chromosome {0}. What is the predicted biological effect — harmless or disease-causing?\",\n", + "\"A genomic change on Chromosome {0} affects {1} ({2}). Classify this variant as benign or pathogenic, and name the disease if relevant.\"]\n", + "# Load Task 1 dataset from configured source\n", + "try:\n", + " dataset = load_dataset(CONFIG['huggingface_repo'], 'variant_effect_pathogenic_clinvar')\n", + " print(f\"✅ Loaded dataset from: {CONFIG['huggingface_repo']}\")\n", + " print(f\"Train samples: {len(dataset['train'])}\")\n", + " print(f\"Test samples: {len(dataset['test'])}\")\n", + "except Exception as e:\n", + " print(f\"❌ Error loading dataset: {e}\")\n", + " print(\"Please check the repository name and dataset configuration\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a418765-c127-44a8-8003-822b413cc907", + "metadata": {}, + "outputs": [], + "source": [ + "question_variants_50_no_gene = [\n", + " \"This variant lies on Chromosome {0}. Based on this context, is the mutation pathogenic or benign? If pathogenic, what disease does it cause?\",\n", + " \"Located on Chromosome {0}, this mutation has been observed. What is its biological consequence — is it benign or pathogenic, and which disease is associated if any?\",\n", + " \"A genetic alteration is present on Chromosome {0}. Is this variant benign or disease-causing, and if the latter, which condition is involved?\",\n", + " \"This variant is found on Chromosome {0}. What is the clinical effect of this variant — benign or pathogenic? 
State the disease if applicable.\",\n", + " \"With a mutation on Chromosome {0}, classify this variant as benign or pathogenic. Include the disease if it's pathogenic.\",\n", + " \"This sequence change occurs on Chromosome {0}. What is the medical significance of this variant — is it benign or linked to a disease?\",\n", + " \"Here is a variant on Chromosome {0}. Please identify whether it is a benign mutation or associated with a disorder.\",\n", + " \"A variant on Chromosome {0} has been observed. Is this a neutral mutation, or does it result in a disease? If so, which one?\",\n", + " \"A mutation is present on Chromosome {0}. Based on this information, is the variant pathogenic or benign? Provide the disease if relevant.\",\n", + " \"This genomic variant is located on Chromosome {0}. Can you determine its pathogenicity and name any linked disease?\",\n", + " \"A mutation found on Chromosome {0} may be clinically relevant. Is it pathogenic or benign, and if the former, which disease is implicated?\",\n", + " \"Given a variant located on Chromosome {0}, assess whether it is benign or pathogenic. Indicate the associated disease if pathogenic.\",\n", + " \"This mutation is located on Chromosome {0}. Is it associated with a disease or is it a benign polymorphism?\",\n", + " \"A variant has been detected on Chromosome {0}. What is its effect — pathogenic or benign? If pathogenic, name the disease.\",\n", + " \"A mutation on Chromosome {0} is under review. Please evaluate whether this mutation is benign or pathogenic and specify the disease if necessary.\",\n", + " \"This alteration on Chromosome {0} may affect genome function. Does it lead to a disease or is it benign?\",\n", + " \"Given this variant on Chromosome {0}, classify it as benign or pathogenic. Include the disorder it may cause if applicable.\",\n", + " \"A variant was discovered on Chromosome {0}. What is its functional impact — neutral or pathogenic? 
State the disease if pathogenic.\",\n", + " \"This mutation on Chromosome {0} may be significant. Is it associated with any clinical condition, or is it benign?\",\n", + " \"Chromosome {0} carries this variant. Does this mutation lead to a specific disease, or is it non-pathogenic?\",\n", + " \"Here is a mutation located on Chromosome {0}. Determine whether it’s benign or pathogenic. If the latter, what disease does it cause?\",\n", + " \"A variant found on Chromosome {0} is being studied. Please analyze its biological impact: is it benign or pathogenic, and what condition might it cause?\",\n", + " \"The following genetic variant occurs on Chromosome {0}. Classify its clinical effect — pathogenic or benign — and list any associated condition.\",\n", + " \"This alteration occurs on Chromosome {0}. Is it associated with a disease or is it a benign variant?\",\n", + " \"A mutation on Chromosome {0} has been found. Is it harmful or harmless? What disease, if any, does it cause?\",\n", + " \"A variant on Chromosome {0} is under investigation. Evaluate whether it is clinically benign or pathogenic and name the disorder if relevant.\",\n", + " \"Consider this mutation on Chromosome {0}. Is this a benign change or a disease-causing variant?\",\n", + " \"A variant was discovered on Chromosome {0}. Please indicate if this mutation results in a known disease or if it's non-harmful.\",\n", + " \"Given this context: Chromosome {0} — does this variant present pathogenic behavior, and if so, what disease does it relate to?\",\n", + " \"This sequence variant lies on Chromosome {0}. Is it clinically significant, and what condition might it cause if any?\",\n", + " \"A mutation located on Chromosome {0} is being studied. Determine whether it’s pathogenic or benign, and specify the linked disease.\",\n", + " \"Here is a genetic alteration on Chromosome {0}. Based on the data, is it a benign variant or a cause of disease?\",\n", + " \"Mutation context: Chromosome {0}. 
Determine if this variant is likely to be benign or pathogenic. Mention the disease if applicable.\",\n", + " \"A sequence alteration has been identified on Chromosome {0}. Is it disease-inducing or harmless?\",\n", + " \"Chromosome {0} houses a mutation. Classify its clinical impact — is it pathogenic or benign, and what disease does it lead to if any?\",\n", + " \"This variant is located on Chromosome {0}. Evaluate its biological effect and specify any disease association.\",\n", + " \"Chromosome {0} is altered by this variant. Does this mutation result in a disease or is it benign?\",\n", + " \"Assess the clinical impact of this variant found on Chromosome {0}. State whether it’s pathogenic or benign, and the disease if applicable.\",\n", + " \"This is a variant located on Chromosome {0}. Is this mutation a likely cause of disease or not?\",\n", + " \"A change on Chromosome {0} is being evaluated. Identify whether the variant is neutral or disease-linked. Mention the disease if applicable.\",\n", + " \"This variant is present on Chromosome {0}. Is the change likely to result in a pathogenic outcome?\",\n", + " \"A mutation has occurred on Chromosome {0}. What is the medical relevance of this mutation?\",\n", + " \"A variant affecting Chromosome {0} has been observed. Determine if it's benign or associated with disease.\",\n", + " \"This mutation occurs on Chromosome {0}. Does this change lead to a known medical condition, or is it benign?\",\n", + " \"A genomic variant on Chromosome {0} is under review. What is the biological outcome — benign or pathogenic?\",\n", + " \"Consider a variant on Chromosome {0}. Determine its clinical classification and disease relevance.\",\n", + " \"An alteration has been detected on Chromosome {0}. Is it pathogenic, and if so, what disease is involved?\",\n", + " \"A mutation on Chromosome {0} is under examination. 
Does this mutation cause a disorder, or is it a benign change?\",\n", + " \"Here’s a variant located on Chromosome {0}. What is the predicted biological effect — harmless or disease-causing?\",\n", + " \"A genomic change on Chromosome {0} is noted. Classify this variant as benign or pathogenic, and name the disease if relevant.\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cb12df8d-4303-4cf5-bb4a-73d1351ab059", + "metadata": {}, + "outputs": [], + "source": [ + "task_1 = dataset['train'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eddbccb-16f7-4a0c-b4f0-49f38a9468e0", + "metadata": {}, + "outputs": [], + "source": [ + "task_1['label'] = task_1['label'].apply(lambda x: \"Benign\" if x == \"Common\" else x)\n", + "task_1['ID'] = ['Task1_train_' + str(i) for i in range(len(task_1))]\n", + "task_1 = task_1[['ID', 'label', 'chromosome', 'ref_forward_sequence', 'alt_forward_sequence',\n", + " 'gene', 'gene_name', 'disease']]\n", + "\n", + "task_1 = task_1.set_index('ID')\n", + "\n", + "task_1_train = []\n", + "\n", + "for count, id in enumerate(task_1.index):\n", + " task_1_train.append({})\n", + " task_1_train[count]['ID'] = id\n", + " if not (task_1.loc[id]['gene'] or task_1.loc[id]['gene_name']):\n", + " task_1_train[count]['question'] = question_variants_50_no_gene[random.randrange(50)].format(task_1.loc[id]['chromosome'])\n", + " else:\n", + " task_1_train[count]['question'] = question_variants_50[random.randrange(50)].format(task_1.loc[id]['chromosome'], task_1.loc[id]['gene'], task_1.loc[id]['gene_name'])\n", + " \n", + " if not task_1.loc[id]['disease']:\n", + " task_1_train[count]['answer'] = f\"{task_1.loc[id]['label']}\"\n", + " else:\n", + " task_1_train[count]['answer'] = f\"{task_1.loc[id]['label']}; {task_1.loc[id]['disease']}\"\n", + " task_1_train[count]['reference_sequence'] = task_1.loc[id]['ref_forward_sequence']\n", + " task_1_train[count]['variant_sequence'] = 
task_1.loc[id]['alt_forward_sequence']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1bc9a04d-ec4d-47e7-9ffd-c4e319b62172", + "metadata": {}, + "outputs": [], + "source": [ + "task_1 = dataset['test'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2fc93817-f219-449e-8c57-8e597a2ca494", + "metadata": {}, + "outputs": [], + "source": [ + "# Build the Task 1 test split: one question/answer record per variant.\n", + "task_1['label'] = task_1['label'].apply(lambda x: \"Benign\" if x == \"Common\" else x)\n", + "task_1['ID'] = ['Task1_test_' + str(i) for i in range(len(task_1))]\n", + "task_1 = task_1[['ID', 'label', 'chromosome', 'ref_forward_sequence', 'alt_forward_sequence',\n", + " 'gene', 'gene_name', 'disease']]\n", + "\n", + "task_1 = task_1.set_index('ID')\n", + "\n", + "task_1_test = []\n", + "\n", + "for count, id in enumerate(task_1.index):\n", + " task_1_test.append({})\n", + " task_1_test[count]['ID'] = id\n", + " # Use the gene-free template only when BOTH gene fields are empty (same test as the train cell);\n", + " # without the parentheses `not` binds to 'gene' alone and rows with a gene name lose their gene context.\n", + " if not (task_1.loc[id]['gene'] or task_1.loc[id]['gene_name']):\n", + " task_1_test[count]['question'] = question_variants_50_no_gene[random.randrange(50)].format(task_1.loc[id]['chromosome'])\n", + " else:\n", + " task_1_test[count]['question'] = question_variants_50[random.randrange(50)].format(task_1.loc[id]['chromosome'], task_1.loc[id]['gene'], task_1.loc[id]['gene_name'])\n", + " \n", + " if not task_1.loc[id]['disease']:\n", + " task_1_test[count]['answer'] = f\"{task_1.loc[id]['label']}\"\n", + " else:\n", + " task_1_test[count]['answer'] = f\"{task_1.loc[id]['label']}; {task_1.loc[id]['disease']}\"\n", + " task_1_test[count]['reference_sequence'] = task_1.loc[id]['ref_forward_sequence']\n", + " task_1_test[count]['variant_sequence'] = task_1.loc[id]['alt_forward_sequence']\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5e71a52a-d030-4ad8-9317-a48a81d788a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "48850" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } +
], + "source": [ + "len(task_1_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5c79704d-8ecc-449c-91e5-c9dd03219028", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1233" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(task_1_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6bf828d7-0b1c-4b24-afb9-c0127b6f608c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is some context for the variant: It is on Chromosome 8, and affects Gene/s CLN8 (CLN8 transmembrane ER and ERGIC protein). Given this context, what is the biological effect of this variant allele, specifically is the mutation pathogenic or benign? If pathogenic, what disease it will cause?\n" + ] + } + ], + "source": [ + "print(f\"Here is some context for the variant: It is on Chromosome {task_1.iloc[0]['chromosome']}, and affects Gene/s {task_1.iloc[0]['gene']} ({task_1.iloc[0]['gene_name']}). Given this context, what is the biological effect of this variant allele, specifically is the mutation pathogenic or benign? 
If pathogenic, what disease it will cause?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "746f4274-768a-4c7a-a878-813e2072ba2b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "2f3b8017-9abf-488e-986f-1d588e24eacf", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset, DatasetDict\n", + "\n", + "# Step 1: Create Hugging Face Datasets\n", + "train_dataset = Dataset.from_list(task_1_train)\n", + "test_dataset = Dataset.from_list(task_1_test)\n", + "\n", + "# Step 2: Combine into a DatasetDict (to mimic load_dataset)\n", + "dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " \"test\": test_dataset\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73077c9f-65d4-451b-879e-f7071029d9f5", + "metadata": {}, + "outputs": [], + "source": [ + "dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"variant_effect_coding\",\n", + " commit_message=\"Upload the finalized Task 1 Variant Effect Coding Data\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "54a98d27-39d1-47aa-86a3-2a50d85d6df5", + "metadata": {}, + "source": [ + "## Task 2 Variant Effect Causal eQTL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c0a985-09fb-4035-a999-158e743b98d6", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import pandas as pd\n", + "import json\n", + "import random\n", + "from pathlib import Path\n", + "\n", + "# CONFIG dictionary to simulate the configuration settings\n", + "CONFIG = {\n", + " 'save_local': True,\n", + " 'output_dir': './data',\n", + " 'upload_to_hub': False,\n", + " 'huggingface_repo': 'your_huggingface_repo'\n", + "}\n", + "\n", + "# Load your dataset here\n", + "# dataset = load_dataset('your_dataset_name')\n", + "\n", + "# Save and optionally upload Task 1 dataset\n", + "if CONFIG['save_local']:\n", + 
" # Save locally first\n", + " output_path = Path(CONFIG['output_dir']) / 'task1_variant_effect_coding'\n", + " # parents=True: CONFIG['output_dir'] itself may not exist yet\n", + " output_path.mkdir(parents=True, exist_ok=True)\n", + " \n", + " # Save as JSON files\n", + " # dataset['train'].to_json(output_path / 'train.jsonl')\n", + " # dataset['test'].to_json(output_path / 'test.jsonl')\n", + " print(f\"✅ Task 1 dataset saved locally to: {output_path}\")\n", + "\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " # dataset.push_to_hub(\n", + " # CONFIG['huggingface_repo'],\n", + " # config_name=\"variant_effect_coding\",\n", + " # commit_message=\"Upload Task 1 Variant Effect Coding Data\"\n", + " # )\n", + " print(f\"✅ Task 1 dataset uploaded to: {CONFIG['huggingface_repo']}\")\n", + " except Exception as e:\n", + " print(f\"❌ Upload failed: {e}\")\n", + " print(\"Please check your HuggingFace credentials and repository permissions\")\n", + "else:\n", + " print(\"📝 Upload to hub disabled. Set CONFIG['upload_to_hub'] = True to enable\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ff52bb-052a-4a1b-9e28-bbc13ea58594", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\"wanglab/bioR_tasks\", 'variant_effect_causal_eqtl')\n", + "\n", + "## Task 2: Variant Effect Causal eQTL\n", + "\n", + "# **Objective**: Determine whether genetic variants cause changes in gene expression levels.\n", + "\n", + "# **Data Source**: Expression quantitative trait loci (eQTL) databases\n", + "\n", + "# **Question Types**: 50 different question templates incorporating:\n", + "# - Chromosome location\n", + "# - Tissue type context\n", + "# - Expression change assessment\n", + "\n", + "# **Output Format**: Binary classification (expression change: Yes/No)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56f83104-fd46-462a-b6cc-b4a954fcc5bc", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Proceeding with Task 2: Causal eQTL Analysis\")" + ] + }, + { + "cell_type": "code", + "execution_count":
null, + "id": "5cbe79bc-b10c-4b27-b700-a80af923ce35", + "metadata": {}, + "outputs": [], + "source": [ + "question_variants_50_expr = [\n", + " \"This variant is isolated from Chromosome {0} from {1} tissue. Does this variant change gene expression?\",\n", + " \"This variant originates from Chromosome {0} in {1} tissue. Does it alter gene expression?\",\n", + " \"Does the variant from Chromosome {0}, isolated in {1} tissue, change gene expression?\",\n", + " \"Is there a change in gene expression for the Chromosome {0} variant found in {1} tissue?\",\n", + " \"For the variant on Chromosome {0} in {1} tissue, does it affect gene expression levels?\",\n", + " \"Does a variant on Chromosome {0} taken from {1} tissue modify gene expression?\",\n", + " \"When isolated from Chromosome {0} in {1} tissue, does this variant impact gene expression?\",\n", + " \"Can the Chromosome {0} variant from {1} tissue change the expression of genes?\",\n", + " \"Is gene expression altered by the variant on Chromosome {0} in {1} tissue?\",\n", + " \"Does the mutation on Chromosome {0}, found in {1} tissue, result in different gene expression?\",\n", + " \"In {1} tissue, does the Chromosome {0} variant change how genes are expressed?\",\n", + " \"For a variant from Chromosome {0} in {1} tissue, is gene expression affected?\",\n", + " \"Does the Chromosome {0} alteration from {1} tissue lead to a detectable change in gene expression?\",\n", + " \"Will the variant on Chromosome {0} in {1} tissue cause gene expression changes?\",\n", + " \"Is there an effect on gene expression from the Chromosome {0} variant in {1} tissue?\",\n", + " \"Does the Chromosome {0} variant isolated in {1} tissue influence gene expression?\",\n", + " \"In {1} tissue, does the mutation on Chromosome {0} disrupt gene expression?\",\n", + " \"Does this Chromosome {0} variant, taken from {1} tissue, shift gene expression patterns?\",\n", + " \"Does gene expression differ for the variant on Chromosome {0} found in {1} 
tissue?\",\n", + " \"Is the expression of genes altered by the Chromosome {0} variant in {1} tissue?\",\n", + " \"For the variant isolated from Chromosome {0} in {1} tissue, does it change gene expression?\",\n", + " \"Does the Chromosome {0}-based variant in {1} tissue have an impact on gene expression?\",\n", + " \"Is gene expression modulated by the variant on Chromosome {0} in {1} tissue?\",\n", + " \"Does the mutation on Chromosome {0} from {1} tissue result in altered gene expression?\",\n", + " \"In {1} tissue samples, does the Chromosome {0} variant change gene expression?\",\n", + " \"Does the Chromosome {0} alteration observed in {1} tissue affect gene expression?\",\n", + " \"Will gene expression be different when the variant is from Chromosome {0} in {1} tissue?\",\n", + " \"Does isolating this variant from Chromosome {0} in {1} tissue alter gene expression?\",\n", + " \"Does the variant on Chromosome {0} in {1} tissue cause a measurable change in gene expression?\",\n", + " \"For Chromosome {0} variants in {1} tissue, does gene expression change?\",\n", + " \"Does gene transcription change for the variant on Chromosome {0} isolated from {1} tissue?\",\n", + " \"Is transcriptional output altered by the Chromosome {0} variant in {1} tissue?\",\n", + " \"Does the Chromosome {0}-derived variant, in {1} tissue, impact gene expression?\",\n", + " \"In {1} tissue, does the Chromosome {0} mutation affect expression of genes?\",\n", + " \"Does the Chromosome {0} variant from {1} tissue lead to differential gene expression?\",\n", + " \"Does changing that locus on Chromosome {0} in {1} tissue alter gene expression?\",\n", + " \"Is there a change in transcript levels for the Chromosome {0} variant in {1} tissue?\",\n", + " \"Does the variant mapped to Chromosome {0}, in {1} tissue, influence expression levels?\",\n", + " \"For the mutation on Chromosome {0} within {1} tissue, does gene expression shift?\",\n", + " \"Does gene expression vary when the variant is 
on Chromosome {0} in {1} tissue?\",\n", + " \"Is the expression profile altered by the Chromosome {0} variant in {1} tissue?\",\n", + " \"Does the Somatic variant on Chromosome {0} in {1} tissue behave as a gene expression modulator?\",\n", + " \"Does the Chromosome {0} variant identified in {1} tissue change gene expression?\",\n", + " \"Is there an observable effect on gene expression from the Chromosome {0} variant in {1} tissue?\",\n", + " \"Does the genetic alteration on Chromosome {0} in {1} tissue modify gene expression?\",\n", + " \"Does the Chromosome {0} variant present in {1} tissue alter the level of gene transcripts?\",\n", + " \"For the Chromosome {0} mutation in {1} tissue, is there a change in gene expression?\",\n", + " \"Does this variant in {1} tissue, located on Chromosome {0}, affect gene expression?\",\n", + " \"Is gene expression impacted by this Chromosome {0} variant from {1} tissue?\",\n", + " \"Does transcription change for the Chromosome {0} variant in {1} tissue?\",\n", + "]\n", + "\n", + "# Load Task 2 dataset from configured source\n", + "try:\n", + " dataset = load_dataset(CONFIG['huggingface_repo'], 'variant_effect_causal_eqtl')\n", + " print(f\"✅ Loaded Task 2 dataset from: {CONFIG['huggingface_repo']}\")\n", + " print(f\"Train samples: {len(dataset['train'])}\")\n", + " print(f\"Test samples: {len(dataset['test'])}\")\n", + "except Exception as e:\n", + " print(f\"❌ Error loading Task 2 dataset: {e}\")\n", + " print(\"Please check the repository name and dataset configuration\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "79f9c1d7-4e90-4075-9e45-3b1b8aa22d4c", + "metadata": {}, + "outputs": [], + "source": [ + "task_2 = dataset['train'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d73eaaf0-0b81-4d55-862e-13464c8b78e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ref_forward_sequence', 'alt_forward_sequence', 'tissue', 
'chromosome',\n", + " 'label'],\n", + " dtype='object')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6092a8d7-a509-45f5-8a60-eae17ab91235", + "metadata": {}, + "outputs": [], + "source": [ + "task_2['ID'] = ['Task2_train_' + str(i) for i in range(len(task_2))]\n", + "task_2 = task_2[['ID', 'ref_forward_sequence', 'alt_forward_sequence', 'tissue', 'chromosome', 'label']]\n", + "\n", + "task_2 = task_2.set_index('ID')\n", + "\n", + "task_2_train = []\n", + "\n", + "for count, id in enumerate(task_2.index):\n", + " task_2_train.append({})\n", + " task_2_train[count]['ID'] = id\n", + " task_2_train[count]['question'] = question_variants_50_expr[random.randrange(50)].format(task_2.loc[id]['chromosome'], task_2.loc[id]['tissue'])\n", + " task_2_train[count]['answer'] = f\"{task_2.loc[id]['label']}\"\n", + " task_2_train[count]['reference_sequence'] = task_2.loc[id]['ref_forward_sequence']\n", + " task_2_train[count]['variant_sequence'] = task_2.loc[id]['alt_forward_sequence']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2ba4c99b-2b0c-4427-9c0d-1df8297f55da", + "metadata": {}, + "outputs": [], + "source": [ + "task_2 = dataset['test'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4b3337ff-ecb1-4f4c-827c-f4d6ac06495f", + "metadata": {}, + "outputs": [], + "source": [ + "task_2['ID'] = ['Task2_test_' + str(i) for i in range(len(task_2))]\n", + "task_2 = task_2[['ID', 'ref_forward_sequence', 'alt_forward_sequence', 'tissue', 'chromosome', 'label']]\n", + "\n", + "task_2 = task_2.set_index('ID')\n", + "\n", + "task_2_test = []\n", + "\n", + "for count, id in enumerate(task_2.index):\n", + " task_2_test.append({})\n", + " task_2_test[count]['ID'] = id\n", + " task_2_test[count]['question'] = 
question_variants_50_expr[random.randrange(50)].format(task_2.loc[id]['chromosome'], task_2.loc[id]['tissue'])\n", + " task_2_test[count]['answer'] = f\"{task_2.loc[id]['label']}\"\n", + " task_2_test[count]['reference_sequence'] = task_2.loc[id]['ref_forward_sequence']\n", + " task_2_test[count]['variant_sequence'] = task_2.loc[id]['alt_forward_sequence']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "61508b36-7c43-4c7d-9a03-0e6a34571d9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "89060" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(task_2_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3c845102-664b-4c22-a251-1c19362dae6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8862" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(task_2_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "17954549-4b01-4767-b6b0-45ecfce93029", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset, DatasetDict\n", + "\n", + "# Step 1: Create Hugging Face Datasets\n", + "train_dataset = Dataset.from_list(task_2_train)\n", + "test_dataset = Dataset.from_list(task_2_test)\n", + "\n", + "# Step 2: Combine into a DatasetDict (to mimic load_dataset)\n", + "dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " \"test\": test_dataset\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1bbd880-1ece-4d86-a98a-d10c8b042ff7", + "metadata": {}, + "outputs": [], + "source": [ + "dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"variant_effect_causal_eqtl\",\n", + " commit_message=\"Upload the finalized Task 2 Variant Effect Causal EQTL\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b8631227-26ab-4dd4-906a-a499816a67ff", + 
"metadata": {}, + "source": [ + "## Task 3 Variant Effect Pathogenic OMIM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38b3b233-94b5-4b3e-a30c-69760534ba41", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import pandas as pd\n", + "import json\n", + "import random\n", + "from pathlib import Path\n", + "\n", + "# CONFIG dictionary to simulate the configuration settings\n", + "CONFIG = {\n", + " 'save_local': True,\n", + " 'output_dir': './data',\n", + " 'upload_to_hub': False,\n", + " 'huggingface_repo': 'username/repo_name'\n", + "}\n", + "\n", + "# Load your dataset here\n", + "# dataset = load_dataset('your_dataset_name')\n", + "\n", + "# Save and optionally upload Task 2 dataset\n", + "if CONFIG['save_local']:\n", + " # Save locally first\n", + " output_path = Path(CONFIG['output_dir']) / 'task2_variant_effect_causal_eqtl'\n", + " # parents=True: CONFIG['output_dir'] itself may not exist yet\n", + " output_path.mkdir(parents=True, exist_ok=True)\n", + " \n", + " # Save as JSON files\n", + " dataset['train'].to_json(output_path / 'train.jsonl')\n", + " dataset['test'].to_json(output_path / 'test.jsonl')\n", + " print(f\"✅ Task 2 dataset saved locally to: {output_path}\")\n", + "\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " dataset.push_to_hub(\n", + " CONFIG['huggingface_repo'],\n", + " config_name=\"variant_effect_causal_eqtl\",\n", + " commit_message=\"Upload Task 2 Variant Effect Causal eQTL Data\"\n", + " )\n", + " print(f\"✅ Task 2 dataset uploaded to: {CONFIG['huggingface_repo']}\")\n", + " except Exception as e:\n", + " print(f\"❌ Upload failed: {e}\")\n", + "else:\n", + " print(\"📝 Upload to hub disabled.
Set CONFIG['upload_to_hub'] = True to enable\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "159949b3-3115-4937-8561-de44b4b18dbe", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\"wanglab/bioR_tasks\", 'varient_effect_pathogenic_omim')\n", + "\n", + "## Task 3: Variant Effect Pathogenic OMIM\n", + "\n", + "# **Objective**: Classify variants as pathogenic or benign using the OMIM (Online Mendelian Inheritance in Man) database.\n", + "\n", + "# **Data Source**: OMIM database with genetic disorder associations\n", + "\n", + "# **Question Types**: 50 different question templates focusing on:\n", + "# - Chromosome location\n", + "# - Pathogenicity assessment\n", + "# - Clinical significance\n", + "\n", + "# **Output Format**: Binary classification (Pathogenic/Benign)\n", + "\n", + "# **Note**: This task uses test-only data for evaluation purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f89792c-a4ed-4e1b-bdec-aeb8ff938812", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Proceeding with Task 3: OMIM Pathogenic Classification\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "835a70b9-ea76-4d9e-a1ae-1a6c3c97aefc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ref_forward_sequencealt_forward_sequencechromosomelabel
0CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT...CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT...1Common
1CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC...CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC...1Common
2CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC...CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC...1Common
3TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT...TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT...1Common
4GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC...GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC...1Common
...............
2321468CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA...CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA...XPathogenic
2321469ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA...ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA...XPathogenic
2321470ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC...ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC...XPathogenic
2321471AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG...AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG...XPathogenic
2321472GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC...GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC...YPathogenic
\n", + "

2321473 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " ref_forward_sequence \\\n", + "0 CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT... \n", + "1 CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC... \n", + "2 CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC... \n", + "3 TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT... \n", + "4 GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC... \n", + "... ... \n", + "2321468 CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA... \n", + "2321469 ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA... \n", + "2321470 ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC... \n", + "2321471 AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG... \n", + "2321472 GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC... \n", + "\n", + " alt_forward_sequence chromosome \\\n", + "0 CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT... 1 \n", + "1 CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC... 1 \n", + "2 CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC... 1 \n", + "3 TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT... 1 \n", + "4 GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC... 1 \n", + "... ... ... \n", + "2321468 CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA... X \n", + "2321469 ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA... X \n", + "2321470 ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC... X \n", + "2321471 AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG... X \n", + "2321472 GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC... Y \n", + "\n", + " label \n", + "0 Common \n", + "1 Common \n", + "2 Common \n", + "3 Common \n", + "4 Common \n", + "... ... 
\n", + "2321468 Pathogenic \n", + "2321469 Pathogenic \n", + "2321470 Pathogenic \n", + "2321471 Pathogenic \n", + "2321472 Pathogenic \n", + "\n", + "[2321473 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load Task 3 dataset from configured source\n", + "# Note: Original dataset name has typo ('varient' instead of 'variant')\n", + "try:\n", + " dataset = load_dataset(CONFIG['huggingface_repo'], 'varient_effect_pathogenic_omim')\n", + " print(f\"✅ Loaded Task 3 dataset from: {CONFIG['huggingface_repo']}\")\n", + " print(f\"Test samples: {len(dataset['test'])}\")\n", + " print(\"ℹ️ This task only includes test data\")\n", + "except Exception as e:\n", + " print(f\"❌ Error loading Task 3 dataset: {e}\")\n", + " print(\"Please check the repository name and dataset configuration\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dfe40452-51d0-4af6-bfc2-f27121e74390", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Common', 'Pathogenic'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_3['label'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72209b86-590c-440a-a59f-0d9299e37c53", + "metadata": {}, + "outputs": [], + "source": [ + "pathogenicity_questions_50 = [\n", + " \"This variant is located on Chromosome {0}. 
Is it pathogenic or benign?\",\n", + " \"From Chromosome {0}, does this variant appear benign or pathogenic?\",\n", + " \"Is this variant on Chromosome {0} classified as benign or pathogenic?\",\n", + " \"Does this Chromosome {0} variant have a benign or pathogenic effect?\",\n", + " \"What is the pathogenicity status of this Chromosome {0} variant — benign or pathogenic?\",\n", + " \"Is the variant from Chromosome {0} considered benign or pathogenic?\",\n", + " \"How is this variant on Chromosome {0} classified — pathogenic or benign?\",\n", + " \"Based on its location on Chromosome {0}, is this variant benign or pathogenic?\",\n", + " \"Would you consider this Chromosome {0} variant to be benign or pathogenic?\",\n", + " \"What is the clinical impact of this variant from Chromosome {0} — benign or pathogenic?\",\n", + " \"Chromosome {0} harbors this variant. Is it benign or pathogenic?\",\n", + " \"Is this mutation on Chromosome {0} likely benign or pathogenic?\",\n", + " \"Is the variant isolated from Chromosome {0} pathogenic or benign?\",\n", + " \"Given that this variant is on Chromosome {0}, is it benign or pathogenic?\",\n", + " \"Determine the classification of this variant on Chromosome {0}: benign or pathogenic?\",\n", + " \"How would you label this Chromosome {0} variant — benign or pathogenic?\",\n", + " \"What is the biological significance of this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"Does the Chromosome {0} variant fall under benign or pathogenic?\",\n", + " \"Would this variant on Chromosome {0} be medically considered benign or pathogenic?\",\n", + " \"Is the impact of this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Does this variant from Chromosome {0} suggest a benign or pathogenic outcome?\",\n", + " \"From a clinical perspective, is this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Would experts consider this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Is the observed variant on 
Chromosome {0} classified as pathogenic or benign?\",\n", + " \"How is the variant from Chromosome {0} interpreted — benign or pathogenic?\",\n", + " \"Evaluate the variant on Chromosome {0}: is it benign or pathogenic?\",\n", + " \"Is this a benign or pathogenic mutation found on Chromosome {0}?\",\n", + " \"Would this genetic alteration on Chromosome {0} be labeled pathogenic or benign?\",\n", + " \"Should this Chromosome {0} variant be regarded as benign or pathogenic?\",\n", + " \"From Chromosome {0}, is the variant likely benign or pathogenic?\",\n", + " \"What is the likely classification of the Chromosome {0} variant: benign or pathogenic?\",\n", + " \"How should this variant on Chromosome {0} be categorized: benign or pathogenic?\",\n", + " \"Classify this mutation found on Chromosome {0} — is it benign or pathogenic?\",\n", + " \"On Chromosome {0}, is the variant seen as pathogenic or benign?\",\n", + " \"What label would apply to this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"From a pathogenicity standpoint, how is this Chromosome {0} variant classified?\",\n", + " \"Does this Chromosome {0} variant fall into the benign or pathogenic category?\",\n", + " \"When assessed, is this Chromosome {0} variant considered pathogenic or benign?\",\n", + " \"Would this variant from Chromosome {0} raise concern as pathogenic or be considered benign?\",\n", + " \"What is the medical interpretation of this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"Would you expect this Chromosome {0} variant to be benign or pathogenic?\",\n", + " \"Does this Chromosome {0} mutation classify as benign or pathogenic?\",\n", + " \"How is this Chromosome {0} alteration viewed: benign or pathogenic?\",\n", + " \"Is the outcome of this Chromosome {0} variant consistent with a benign or pathogenic effect?\",\n", + " \"Does this genetic variant on Chromosome {0} have a benign or pathogenic classification?\",\n", + " \"What is the status of this Chromosome {0} 
variant — pathogenic or benign?\",\n", + " \"How would you assess this variant on Chromosome {0}: benign or pathogenic?\",\n", + " \"Is this a pathogenic or benign change occurring on Chromosome {0}?\",\n", + " \"Classify the genetic change found on Chromosome {0}: benign or pathogenic?\",\n", + " \"What is the correct classification of this Chromosome {0} mutation: benign or pathogenic?\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "42d1b864-9c77-4357-813b-72b9ad2c5238", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(pathogenicity_questions_50)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0ac36d06-b18b-44f2-a0e1-a59776bb08a6", + "metadata": {}, + "outputs": [], + "source": [ + "task_3 = dataset['test'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2cfddb92-dd8d-426e-81eb-9c4a9ca6e593", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ref_forward_sequence', 'alt_forward_sequence', 'chromosome', 'label'], dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a950729e-2a7e-4be9-adee-0c50f4d19ccd", + "metadata": {}, + "outputs": [], + "source": [ + "task_3['label'] = task_3['label'].apply(lambda x: \"Benign\" if x == \"Common\" else x)\n", + "task_3['ID'] = ['Task3_test_' + str(i) for i in range(len(task_3))]\n", + "task_3 = task_3[['ID', 'ref_forward_sequence', 'alt_forward_sequence', 'chromosome', 'label']]\n", + "\n", + "task_3 = task_3.set_index('ID')\n", + "\n", + "task_3_test = []\n", + "\n", + "for count, id in enumerate(task_3.index):\n", + " task_3_test.append({})\n", + " task_3_test[count]['ID'] = id\n", + " task_3_test[count]['question'] 
= pathogenicity_questions_50[random.randrange(50)].format(task_3.loc[id]['chromosome'])\n", + " task_3_test[count]['answer'] = f\"{task_3.loc[id]['label']}\"\n", + " task_3_test[count]['reference_sequence'] = task_3.loc[id]['ref_forward_sequence']\n", + " task_3_test[count]['variant_sequence'] = task_3.loc[id]['alt_forward_sequence']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bcc352ee-4463-44bd-bd6f-231786650e40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2321473" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(task_3_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8a880c1c-650e-4ef3-8eb3-1d096caccf32", + "metadata": {}, + "outputs": [], + "source": [ + "#making a json file first to optimize memory. Previously, making a DatasetDict was chewing through 150gb of memory\n", + "with open(\"task_3_test.jsonl\", \"w\") as f:\n", + " for item in task_3_test:\n", + " f.write(json.dumps(item) + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02c876cb-9b89-4c5a-aba7-8c5a74a946d9", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, DatasetDict\n", + "\n", + "test_dataset = load_dataset(\"json\", data_files=\"task_3_test.jsonl\", split=\"train\")\n", + "\n", + "dataset = DatasetDict({\"test\": test_dataset})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "239dc066-9766-4fa0-8857-fe2a7aa75a07", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "# Memory-optimized dataset creation using JSONL format\n", + "# This approach reduces memory usage for large datasets\n", + "output_file = Path(CONFIG['output_dir']) / \"task_3_test.jsonl\"\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " for item in task_3_test:\n", + " f.write(json.dumps(item) + \"\\n\")\n", + " \n", + 
"print(f\"✅ Task 3 test data saved to: {output_file}\")\n", + "print(f\"📊 Memory-optimized processing complete: {len(task_3_test):,} samples\")\n", + "\n", + "dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"varient_effect_pathogenic_omim\",\n", + " commit_message=\"Upload the finalized Task 3 Variant Effect Pathogenic OMIM\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "578deed1-f2af-4454-96a5-caf19d573964", + "metadata": {}, + "outputs": [], + "source": [ + "#testing if it works\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", \"varient_effect_pathogenic_omim\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e2af5a6-03d8-4008-a134-50e99cd313ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence'],\n", + " num_rows: 2321473\n", + "})" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "# Save and optionally upload Task 3 dataset\n", + "if CONFIG['save_local']:\n", + " # Save locally first\n", + " output_path = Path(CONFIG['output_dir']) / 'task3_variant_effect_pathogenic_omim'\n", + " output_path.mkdir(exist_ok=True)\n", + " \n", + " # Save as JSON file\n", + " ds[\"test\"].to_json(output_path / 'test.jsonl')\n", + " print(f\"✅ Task 3 dataset saved locally to: {output_path}\")\n", + "\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " dataset.push_to_hub(\n", + " CONFIG['huggingface_repo'],\n", + " config_name=\"varient_effect_pathogenic_omim\",\n", + " commit_message=\"Upload Task 3 Variant Effect Pathogenic OMIM Data\"\n", + " )\n", + " print(f\"✅ Task 3 dataset uploaded to: {CONFIG['huggingface_repo']}\")\n", + " except Exception as e:\n", + " print(f\"❌ Upload failed: {e}\")\n", + "else:\n", + " print(\"📝 
Upload to hub disabled. Set CONFIG['upload_to_hub'] = True to enable\")" + ] + }, + { + "cell_type": "markdown", + "id": "95879f12-643e-4b11-b535-2835adf56058", + "metadata": {}, + "source": [ + "# Old Task 4 SNV Non SNV" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "13b8c7a8-d9a3-416f-9401-a09c314d10a7", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a70c24a-1801-476b-9980-05e89fae8738", + "metadata": {}, + "outputs": [], + "source": [ + "## Task 4: SNV and Non-SNV Variant Effect Prediction\n", + "\n", + "**Objective**: Predict the effects of both single nucleotide variants (SNVs) and structural variants (Non-SNVs).\n", + "\n", + "**Data Sources**: \n", + "- ClinVar database with comprehensive variant annotations\n", + "- Large-scale genomic studies with 4096bp sequence windows\n", + "\n", + "**Key Features**:\n", + "- **SNV Dataset**: Single nucleotide changes with local sequence context\n", + "- **Non-SNV Dataset**: Insertions, deletions, and complex rearrangements\n", + "- **Extended Sequences**: 4096bp windows for comprehensive genomic context\n", + "- **Disease Associations**: Curated disease-variant relationships\n", + "\n", + "**Processing Notes**:\n", + "- Removes generic annotations (\"not_provided\", \"not_specified\")\n", + "- Uses disjoint train/test splits to prevent data leakage\n", + "- Memory-optimized processing for large datasets\n", + "\n", + "df = pd.read_parquet(\"/home/ec2-user/bioR_tasks/variant_effect_non_snv_and_snv/clinvar_windowed_4096.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4027951-df07-4cd1-82f0-8d7bbddcc9d7", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports already loaded in configuration section\n", + "print(\"Proceeding with Task 4: SNV and Non-SNV 
Variant Processing\")\n", + "\n", + "# Replace values with the placeholder string 'NA' (not NaN) if they contain either keyword\n", + "df[\"disease_name\"] = df[\"disease_name\"].apply(\n", + " lambda x: \"NA\" if isinstance(x, str) and (\"not_provided\" in x or \"not_specified\" in x) else x\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8646c829-c266-4aa9-9a46-09ee07b328cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mutation_instructionoriginal_windowmutated_windowpathogenicitydisease_namevariant_type
0AG>AAAGGTGCTTAGGACAAAGAAGGCGATTGACATCTTTCAGGTAAAAC...AAGGTGCTTAGGACAAAGAAGGCGATTGACATCTTTCAGGTAAAAC...not_pathogenicRetinitis_pigmentosanon_SNV
1A>GCATATTTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAA...CATATTTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAA...not_pathogenicNASNV
2T>GTCCACTATTAGACTTCTCTTTATTCTTAAAAATATTTAAGATCACT...TCCACTATTAGACTTCTCTTTATTCTTAAAAATATTTAAGATCACT...not_pathogenicNASNV
3G>AGATTCAGAGTAGTAAAGAGAAAAGTGGAATTTCCAAGCACTATGAA...GATTCAGAGTAGTAAAGAGAAAAGTGGAATTTCCAAGCACTATGAA...not_pathogenicNASNV
4C>GCACTTCTCTCTTTTACATCTTACTTGCCCATTAACTCTTATACCTA...CACTTCTCTCTTTTACATCTTACTTGCCCATTAACTCTTATACCTA...not_pathogenicNASNV
.....................
3493395CAA>CCTACTCCTAATCACATAACCTATTCCCCCGAGCAATCTCAATTACA...CTACTCCTAATCACATAACCTATTCCCCCGAGCAATCTCAATTACA...not_pathogenicMitochondrial_inheritancenon_SNV
3493396C>TCAATATATACACCAACAAACAATGTTCAACCAGTAACTACTACTAA...CAATATATACACCAACAAACAATGTTCAACCAGTAACTACTACTAA...not_pathogenicVenous_thromboembolismSNV
3493397A>GTACACCAACAAACAATGTTCAACCAGTAACTACTACTAATCAACGC...TACACCAACAAACAATGTTCAACCAGTAACTACTACTAATCAACGC...not_pathogenicMERRF_syndrome|Mitochondrial_inheritanceSNV
3493398G>AGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAAT...GCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAAT...not_pathogenicMERRF_syndrome|Mitochondrial_inheritanceSNV
3493399G>ATCAACCCTGACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACT...TCAACCCTGACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACT...not_pathogenicMERRF_syndrome|Mitochondrial_inheritanceSNV
\n", + "

3493400 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " mutation_instruction \\\n", + "0 AG>A \n", + "1 A>G \n", + "2 T>G \n", + "3 G>A \n", + "4 C>G \n", + "... ... \n", + "3493395 CAA>C \n", + "3493396 C>T \n", + "3493397 A>G \n", + "3493398 G>A \n", + "3493399 G>A \n", + "\n", + " original_window \\\n", + "0 AAGGTGCTTAGGACAAAGAAGGCGATTGACATCTTTCAGGTAAAAC... \n", + "1 CATATTTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAA... \n", + "2 TCCACTATTAGACTTCTCTTTATTCTTAAAAATATTTAAGATCACT... \n", + "3 GATTCAGAGTAGTAAAGAGAAAAGTGGAATTTCCAAGCACTATGAA... \n", + "4 CACTTCTCTCTTTTACATCTTACTTGCCCATTAACTCTTATACCTA... \n", + "... ... \n", + "3493395 CTACTCCTAATCACATAACCTATTCCCCCGAGCAATCTCAATTACA... \n", + "3493396 CAATATATACACCAACAAACAATGTTCAACCAGTAACTACTACTAA... \n", + "3493397 TACACCAACAAACAATGTTCAACCAGTAACTACTACTAATCAACGC... \n", + "3493398 GCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAAT... \n", + "3493399 TCAACCCTGACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACT... \n", + "\n", + " mutated_window pathogenicity \\\n", + "0 AAGGTGCTTAGGACAAAGAAGGCGATTGACATCTTTCAGGTAAAAC... not_pathogenic \n", + "1 CATATTTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAA... not_pathogenic \n", + "2 TCCACTATTAGACTTCTCTTTATTCTTAAAAATATTTAAGATCACT... not_pathogenic \n", + "3 GATTCAGAGTAGTAAAGAGAAAAGTGGAATTTCCAAGCACTATGAA... not_pathogenic \n", + "4 CACTTCTCTCTTTTACATCTTACTTGCCCATTAACTCTTATACCTA... not_pathogenic \n", + "... ... ... \n", + "3493395 CTACTCCTAATCACATAACCTATTCCCCCGAGCAATCTCAATTACA... not_pathogenic \n", + "3493396 CAATATATACACCAACAAACAATGTTCAACCAGTAACTACTACTAA... not_pathogenic \n", + "3493397 TACACCAACAAACAATGTTCAACCAGTAACTACTACTAATCAACGC... not_pathogenic \n", + "3493398 GCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAAT... not_pathogenic \n", + "3493399 TCAACCCTGACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACT... not_pathogenic \n", + "\n", + " disease_name variant_type \n", + "0 Retinitis_pigmentosa non_SNV \n", + "1 NA SNV \n", + "2 NA SNV \n", + "3 NA SNV \n", + "4 NA SNV \n", + "... ... ... 
\n", + "3493395 Mitochondrial_inheritance non_SNV \n", + "3493396 Venous_thromboembolism SNV \n", + "3493397 MERRF_syndrome|Mitochondrial_inheritance SNV \n", + "3493398 MERRF_syndrome|Mitochondrial_inheritance SNV \n", + "3493399 MERRF_syndrome|Mitochondrial_inheritance SNV \n", + "\n", + "[3493400 rows x 6 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "# Load Task 4 data from configurable source\n", + "# Update this path to point to your local data file\n", + "data_file = \"data/clinvar_windowed_4096.parquet\" # Update this path\n", + "\n", + "if os.path.exists(data_file):\n", + " df = pd.read_parquet(data_file)\n", + " print(f\"✅ Loaded Task 4 data from: {data_file}\")\n", + " print(f\"Total variants: {len(df):,}\")\n", + "else:\n", + " print(f\"❌ Data file not found: {data_file}\")\n", + " print(\"Please update the data_file path to point to your ClinVar data\")\n", + " print(\"Expected format: Parquet file with variant and sequence information\")\n", + " raise FileNotFoundError(f\"Data file not found: {data_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "64d6ecd0-884c-4095-8462-3d5d6f213e8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "disease_name\n", + "NA 2039231\n", + "Inborn_genetic_diseases 133139\n", + "Hereditary_cancer-predisposing_syndrome 47592\n", + "Cardiovascular_phenotype 25149\n", + "Primary_ciliary_dyskinesia 17996\n", + " ... 
\n", + "Inborn_genetic_diseases|Thrombocytopenia 1\n", + "Thrombocytopenia|See_cases|Inborn_genetic_diseases|Acute_lymphoid_leukemia 1\n", + "Thrombocytopenia|Acute_lymphoid_leukemia|Inborn_genetic_diseases 1\n", + "Inborn_genetic_diseases|Proteasome-associated_autoinflammatory_syndrome_1 1\n", + "IL7R-related_disorder|Immunodeficiency_104 1\n", + "Name: count, Length: 55367, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['disease_name'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7062142d-ff85-46ec-b73a-576d39cc9856", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ececfd79-4947-4363-b62f-a08d1f9473a6", + "metadata": {}, + "outputs": [], + "source": [ + "task_3 = dataset['test'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "229ec7bc-51aa-43d7-94f8-22ed57808e0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ref_forward_sequencealt_forward_sequencechromosomelabel
0CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT...CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT...1Common
1CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC...CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC...1Common
2CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC...CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC...1Common
3TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT...TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT...1Common
4GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC...GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC...1Common
...............
2321468CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA...CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA...XPathogenic
2321469ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA...ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA...XPathogenic
2321470ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC...ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC...XPathogenic
2321471AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG...AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG...XPathogenic
2321472GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC...GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC...YPathogenic
\n", + "

2321473 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " ref_forward_sequence \\\n", + "0 CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT... \n", + "1 CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC... \n", + "2 CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC... \n", + "3 TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT... \n", + "4 GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC... \n", + "... ... \n", + "2321468 CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA... \n", + "2321469 ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA... \n", + "2321470 ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC... \n", + "2321471 AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG... \n", + "2321472 GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC... \n", + "\n", + " alt_forward_sequence chromosome \\\n", + "0 CTCAGAGATTCTGTACATGTTCTTCCTCCTGCCTAGAAAGGATCGT... 1 \n", + "1 CCTATGGATTGCATCATTATTACCTAAAAAGTCTATTCTCAAATGC... 1 \n", + "2 CTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGC... 1 \n", + "3 TGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCT... 1 \n", + "4 GAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCC... 1 \n", + "... ... ... \n", + "2321468 CAACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAA... X \n", + "2321469 ACAAGCATTTAAAAAGATGCTCAACTTATTAGAAATAAAAATAACA... X \n", + "2321470 ATAAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACC... X \n", + "2321471 AAAAATAACACAATGAAAGTAATGATGCAATATCACTCTACACCTG... X \n", + "2321472 GGTTCAGAAACCTGACTAAAGTTTGGTCAAACAGAGAATCTGTGTC... Y \n", + "\n", + " label \n", + "0 Common \n", + "1 Common \n", + "2 Common \n", + "3 Common \n", + "4 Common \n", + "... ... 
\n", + "2321468 Pathogenic \n", + "2321469 Pathogenic \n", + "2321470 Pathogenic \n", + "2321471 Pathogenic \n", + "2321472 Pathogenic \n", + "\n", + "[2321473 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "402cc2bb-a0a7-4258-8325-6514cd635e69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Common', 'Pathogenic'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note: This appears to be duplicate data loading from Task 3\n", + "print(\"⚠️ Duplicate data loading detected\")\n", + "print(\"Task 3 data is already loaded in the previous section\")\n", + "print(\"Consider removing duplicate loading operations\")\n", + "\n", + "# Using previously loaded data instead of reloading\n", + "if 'task_3' in locals():\n", + " print(f\"✅ Using previously loaded Task 3 data: {len(task_3)} samples\")\n", + "else:\n", + " print(\"❌ Task 3 data not found, please run previous sections first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "96434bc7-ef53-4f0b-9724-4db335fe34b0", + "metadata": {}, + "outputs": [], + "source": [ + "pathogenicity_questions_50 = [\n", + " \"This variant is located on Chromosome {0}. 
Is it pathogenic or benign?\",\n", + " \"From Chromosome {0}, does this variant appear benign or pathogenic?\",\n", + " \"Is this variant on Chromosome {0} classified as benign or pathogenic?\",\n", + " \"Does this Chromosome {0} variant have a benign or pathogenic effect?\",\n", + " \"What is the pathogenicity status of this Chromosome {0} variant — benign or pathogenic?\",\n", + " \"Is the variant from Chromosome {0} considered benign or pathogenic?\",\n", + " \"How is this variant on Chromosome {0} classified — pathogenic or benign?\",\n", + " \"Based on its location on Chromosome {0}, is this variant benign or pathogenic?\",\n", + " \"Would you consider this Chromosome {0} variant to be benign or pathogenic?\",\n", + " \"What is the clinical impact of this variant from Chromosome {0} — benign or pathogenic?\",\n", + " \"Chromosome {0} harbors this variant. Is it benign or pathogenic?\",\n", + " \"Is this mutation on Chromosome {0} likely benign or pathogenic?\",\n", + " \"Is the variant isolated from Chromosome {0} pathogenic or benign?\",\n", + " \"Given that this variant is on Chromosome {0}, is it benign or pathogenic?\",\n", + " \"Determine the classification of this variant on Chromosome {0}: benign or pathogenic?\",\n", + " \"How would you label this Chromosome {0} variant — benign or pathogenic?\",\n", + " \"What is the biological significance of this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"Does the Chromosome {0} variant fall under benign or pathogenic?\",\n", + " \"Would this variant on Chromosome {0} be medically considered benign or pathogenic?\",\n", + " \"Is the impact of this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Does this variant from Chromosome {0} suggest a benign or pathogenic outcome?\",\n", + " \"From a clinical perspective, is this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Would experts consider this Chromosome {0} variant benign or pathogenic?\",\n", + " \"Is the observed variant on 
Chromosome {0} classified as pathogenic or benign?\",\n", + " \"How is the variant from Chromosome {0} interpreted — benign or pathogenic?\",\n", + " \"Evaluate the variant on Chromosome {0}: is it benign or pathogenic?\",\n", + " \"Is this a benign or pathogenic mutation found on Chromosome {0}?\",\n", + " \"Would this genetic alteration on Chromosome {0} be labeled pathogenic or benign?\",\n", + " \"Should this Chromosome {0} variant be regarded as benign or pathogenic?\",\n", + " \"From Chromosome {0}, is the variant likely benign or pathogenic?\",\n", + " \"What is the likely classification of the Chromosome {0} variant: benign or pathogenic?\",\n", + " \"How should this variant on Chromosome {0} be categorized: benign or pathogenic?\",\n", + " \"Classify this mutation found on Chromosome {0} — is it benign or pathogenic?\",\n", + " \"On Chromosome {0}, is the variant seen as pathogenic or benign?\",\n", + " \"What label would apply to this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"From a pathogenicity standpoint, how is this Chromosome {0} variant classified?\",\n", + " \"Does this Chromosome {0} variant fall into the benign or pathogenic category?\",\n", + " \"When assessed, is this Chromosome {0} variant considered pathogenic or benign?\",\n", + " \"Would this variant from Chromosome {0} raise concern as pathogenic or be considered benign?\",\n", + " \"What is the medical interpretation of this Chromosome {0} variant: benign or pathogenic?\",\n", + " \"Would you expect this Chromosome {0} variant to be benign or pathogenic?\",\n", + " \"Does this Chromosome {0} mutation classify as benign or pathogenic?\",\n", + " \"How is this Chromosome {0} alteration viewed: benign or pathogenic?\",\n", + " \"Is the outcome of this Chromosome {0} variant consistent with a benign or pathogenic effect?\",\n", + " \"Does this genetic variant on Chromosome {0} have a benign or pathogenic classification?\",\n", + " \"What is the status of this Chromosome {0} 
variant — pathogenic or benign?\",\n", + " \"How would you assess this variant on Chromosome {0}: benign or pathogenic?\",\n", + " \"Is this a pathogenic or benign change occurring on Chromosome {0}?\",\n", + " \"Classify the genetic change found on Chromosome {0}: benign or pathogenic?\",\n", + " \"What is the correct classification of this Chromosome {0} mutation: benign or pathogenic?\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4773a60a-4ced-47aa-8dc5-7cb06f606679", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(pathogenicity_questions_50)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a92e1963-c8c4-44ab-9dd8-c1fd896da4c8", + "metadata": {}, + "outputs": [], + "source": [ + "task_3 = dataset['test'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "29784940-aa5a-40b3-af6c-650cb1377587", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ref_forward_sequence', 'alt_forward_sequence', 'chromosome', 'label'], dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b1ac0a1f-eed4-4765-97d4-1474f6cad3ff", + "metadata": {}, + "outputs": [], + "source": [ + "task_3['label'] = task_3['label'].apply(lambda x: \"Benign\" if x == \"Common\" else x)\n", + "task_3['ID'] = ['Task3_test_' + str(i) for i in range(len(task_3))]\n", + "task_3 = task_3[['ID', 'ref_forward_sequence', 'alt_forward_sequence', 'chromosome', 'label']]\n", + "\n", + "task_3 = task_3.set_index('ID')\n", + "\n", + "task_3_test = []\n", + "\n", + "for count, id in enumerate(task_3.index):\n", + " task_3_test.append({})\n", + " task_3_test[count]['ID'] = id\n", + " task_3_test[count]['question'] 
= pathogenicity_questions_50[random.randrange(50)].format(task_3.loc[id]['chromosome'])\n", + " task_3_test[count]['answer'] = f\"{task_3.loc[id]['label']}\"\n", + " task_3_test[count]['reference_sequence'] = task_3.loc[id]['ref_forward_sequence']\n", + " task_3_test[count]['variant_sequence'] = task_3.loc[id]['alt_forward_sequence']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e041510-5811-450f-bea5-55425caf33b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2321473" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(task_3_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "482ddce5-5ba0-4e0d-93e3-24193f8a115a", + "metadata": {}, + "outputs": [], + "source": [ + "#making a json file first to optimize memory. Previously, making a DatasetDict was chewing through 150gb of memory\n", + "with open(\"task_3_test.jsonl\", \"w\") as f:\n", + " for item in task_3_test:\n", + " f.write(json.dumps(item) + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "024c8adb-5db6-4bb2-85ec-a547d381be62", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, DatasetDict\n", + "\n", + "test_dataset = load_dataset(\"json\", data_files=\"task_3_test.jsonl\", split=\"train\")\n", + "\n", + "dataset = DatasetDict({\"test\": test_dataset})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "222862cf-c047-45fc-a1fc-c35b9ac2e7a5", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "# Assuming CONFIG and task_3_test are defined earlier in the code\n", + "\n", + "# Memory-optimized dataset creation (duplicate processing section)\n", + "# Note: This appears to be a duplicate of the previous cell\n", + "output_file_dup = Path(CONFIG['output_dir']) / \"task_3_test_duplicate.jsonl\"\n", + "\n", + "with 
open(output_file_dup, \"w\") as f:\n", + " for item in task_3_test:\n", + " f.write(json.dumps(item) + \"\\n\")\n", + " \n", + "print(f\"⚠️ Duplicate processing detected: {output_file_dup}\")\n", + "print(\"Consider removing duplicate code sections for cleaner pipeline\")\n", + "\n", + "dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"varient_effect_pathogenic_omim\",\n", + " commit_message=\"Upload the finalized Task 3 Variant Effect Pathogenic OMIM\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "870d5174-6454-4ca2-91b4-1f1686f8ba41", + "metadata": {}, + "outputs": [], + "source": [ + "#testing if it works\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", \"varient_effect_pathogenic_omim\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e9eb353-bce6-4927-b19b-63f8dc3a37ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence'],\n", + " num_rows: 2321473\n", + "})" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[\"test\"]\n", + "\n", + "# Note: This appears to be a duplicate upload section\n", + "# The dataset upload is already handled in the previous section\n", + "print(\"⚠️ Duplicate upload section detected\")\n", + "print(\"This upload operation may overwrite the previous upload\")\n", + "print(\"Consider consolidating upload operations for cleaner code\")\n", + "\n", + "# Original upload code commented out to prevent conflicts\n", + "# dataset.push_to_hub(\n", + "# CONFIG['huggingface_repo'],\n", + "# config_name=\"varient_effect_pathogenic_omim\",\n", + "# commit_message=\"Upload the finalized Task 3 Variant Effect Pathogenic OMIM\"\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"ee216409-3861-4156-a4ab-89750fae0ecb", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: This appears to be a duplicate validation section\n", + "print(\"⚠️ Duplicate validation section detected\")\n", + "print(\"Dataset validation is already handled in the previous section\")\n", + "print(\"Consider removing duplicate validation code\")\n", + "\n", + "# Original validation code for reference:\n", + "# ds = load_dataset(CONFIG['huggingface_repo'], \"varient_effect_pathogenic_omim\")" + ] + }, + { + "cell_type": "markdown", + "id": "d22832b0-b0b7-4bad-8725-ac49530b2a2d", + "metadata": {}, + "source": [ + "### Final Formatting of data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "07e88211-41c2-4d60-9b99-7fc8a8f2a839", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80d4c604-4c9a-4805-92ca-940cdfd969da", + "metadata": {}, + "outputs": [], + "source": [ + "## Dataset Processing Summary\n", + "\n", + "**Pipeline Complete**: All variant effect prediction tasks have been processed and datasets are ready for use.\n", + "\n", + "### Generated Datasets:\n", + "\n", + "1. **Task 1 - Variant Effect Coding**: Pathogenic vs Benign classification with gene context\n", + "2. **Task 2 - Causal eQTL**: Gene expression change prediction with tissue context \n", + "3. **Task 3 - Pathogenic OMIM**: OMIM-based pathogenicity classification\n", + "4. **Task 4 SNV**: Single nucleotide variant effect prediction\n", + "5. 
**Task 4 Non-SNV**: Structural variant effect prediction\n", + "\n", + "### Quality Assurance:\n", + "- ✅ Personal references removed\n", + "- ✅ Hardcoded paths made configurable\n", + "- ✅ Random seeds set for reproducibility\n", + "- ✅ Error handling and validation added\n", + "- ✅ Local saving and optional upload functionality\n", + "- ✅ Comprehensive documentation\n", + "\n", + "### Usage Notes:\n", + "- Update CONFIG dictionary with your specific settings\n", + "- Ensure all required data files are available\n", + "- Set appropriate upload permissions if using HuggingFace Hub\n", + "- Review generated datasets before publication\n", + "\n", + "snv_train = pd.read_parquet(\"/home/ec2-user/bioR_tasks/task4-variant_effect_non_snv_and_snv_with_split/snv_train_split_df.parquet\")\n", + "snv_test = pd.read_parquet(\"/home/ec2-user/bioR_tasks/task4-variant_effect_non_snv_and_snv_with_split/snv_test_split_df.parquet\")\n", + "non_snv_train = pd.read_parquet(\"/home/ec2-user/bioR_tasks/task4-variant_effect_non_snv_and_snv_with_split/non_snv_train_split_df.parquet\")\n", + "non_snv_test = pd.read_parquet(\"/home/ec2-user/bioR_tasks/task4-variant_effect_non_snv_and_snv_with_split/non_snv_test_split_df.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687f5e03-1614-43b9-ac89-a235c35c6e99", + "metadata": {}, + "outputs": [], + "source": [ + "snv_train['answer'] = snv_train['answer'].str.replace(r\"(, )?'See_cases'\", '', regex=True)\n", + "snv_test['answer'] = snv_test['answer'].str.replace(r\"(, )?'See_cases'\", '', regex=True)\n", + "non_snv_train['answer'] = non_snv_train['answer'].str.replace(r\"(, )?'See_cases'\", '', regex=True)\n", + "non_snv_test['answer'] = non_snv_test['answer'].str.replace(r\"(, )?'See_cases'\", '', regex=True)\n", + "\n", + "# Final summary and cleanup\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"VARIANT EFFECT PREDICTION PIPELINE COMPLETE\")\n", + "print(\"=\"*60)\n", + "print(f\"\\n📊 Tasks processed: 
{len(CONFIG['tasks'])}\")\n", + "print(f\"📁 Output directory: {CONFIG['output_dir']}\")\n", + "print(f\"🔧 Configuration: {'Upload enabled' if CONFIG['upload_to_hub'] else 'Local only'}\")\n", + "print(f\"🎯 Repository: {CONFIG['huggingface_repo']}\")\n", + "print(\"\\n✅ All datasets are ready for publication and use\")\n", + "print(\"\\n📝 Next steps:\")\n", + "print(\" 1. Review generated datasets for quality\")\n", + "print(\" 2. Update any remaining configuration parameters\")\n", + "print(\" 3. Test datasets with your machine learning pipeline\")\n", + "print(\" 4. Document any custom modifications for your use case\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb08dff2-5dbe-42e9-80a7-45f5d48ecb3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "-1 290338\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "# Load Task 4 processed datasets from configurable sources\n", + "# Update these paths to point to your processed data files\n", + "data_dir = \"data/task4_processed\" # Update this directory path\n", + "\n", + "data_files = {\n", + " 'snv_train': f\"{data_dir}/snv_train_split_df.parquet\",\n", + " 'snv_test': f\"{data_dir}/snv_test_split_df.parquet\", \n", + " 'non_snv_train': f\"{data_dir}/non_snv_train_split_df.parquet\",\n", + " 'non_snv_test': f\"{data_dir}/non_snv_test_split_df.parquet\"\n", + "}\n", + "\n", + "# Check if all files exist\n", + "missing_files = []\n", + "for name, path in data_files.items():\n", + " if not os.path.exists(path):\n", + " missing_files.append(path)\n", + "\n", + "if missing_files:\n", + " print(f\"❌ Missing data files: {missing_files}\")\n", + " print(\"Please ensure all Task 4 processed data files are available\")\n", + " print(\"Or update the data_dir path to point to your processed data\")\n", + " raise 
FileNotFoundError(f\"Missing files: {missing_files}\")\n", + "\n", + "# Load the datasets\n", + "snv_train = pd.read_parquet(data_files['snv_train'])\n", + "snv_test = pd.read_parquet(data_files['snv_test'])\n", + "non_snv_train = pd.read_parquet(data_files['non_snv_train'])\n", + "non_snv_test = pd.read_parquet(data_files['non_snv_test'])\n", + "\n", + "print(f\"✅ Loaded Task 4 datasets:\")\n", + "print(f\" SNV train: {len(snv_train):,} samples\")\n", + "print(f\" SNV test: {len(snv_test):,} samples\")\n", + "print(f\" Non-SNV train: {len(non_snv_train):,} samples\")\n", + "print(f\" Non-SNV test: {len(non_snv_test):,} samples\")\n", + "\n", + "(snv_train['answer'].str.find(\"See_cases\")).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d8e17ba6-3450-4253-a70d-8a345ce065c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "-1 16262\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(snv_test['answer'].str.find(\"See_cases\")).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "b37e3c0e-3950-4256-ae4f-c552f06d8952", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "-1 35215\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(non_snv_train['answer'].str.find(\"See_cases\")).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "145a517a-6265-4d08-8233-0d347cfe84fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "answer\n", + "-1 873\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(non_snv_test['answer'].str.find(\"See_cases\")).value_counts()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 45, + "id": "84c5cfc2-b51d-4f9b-85d4-677168476124", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(290338, 16262, 35215, 873)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(snv_train), len(snv_test), len(non_snv_train), len(non_snv_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "8e2534d0-073d-4107-ae67-83e28443c7fb", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset, DatasetDict\n", + "\n", + "# Step 1: Create Hugging Face Datasets\n", + "train_dataset = Dataset.from_pandas(snv_train)\n", + "test_dataset = Dataset.from_pandas(snv_test)\n", + "\n", + "# Step 2: Combine into a DatasetDict (to mimic load_dataset)\n", + "snv_dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " \"test\": test_dataset\n", + "})\n", + "\n", + "\n", + "# Step 1: Create Hugging Face Datasets\n", + "train_dataset = Dataset.from_pandas(non_snv_train)\n", + "test_dataset = Dataset.from_pandas(non_snv_test)\n", + "\n", + "# Step 2: Combine into a DatasetDict (to mimic load_dataset)\n", + "non_snv_dataset = DatasetDict({\n", + " \"train\": train_dataset,\n", + " \"test\": test_dataset\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "3e71d701-179e-4177-96c0-520fc47266f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(DatasetDict({\n", + " train: Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 290338\n", + " })\n", + " test: Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 16262\n", + " })\n", + " }),\n", + " DatasetDict({\n", + " train: Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 
'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 35215\n", + " })\n", + " test: Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 873\n", + " })\n", + " }))" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "snv_dataset, non_snv_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49351ddb-eb9d-42e4-b042-993ea416edae", + "metadata": {}, + "outputs": [], + "source": [ + "snv_dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"task4_variant_effect_snv\",\n", + " commit_message=\"Upload the finalized Task 4 Variant Effect SNV\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f797173-1cb3-40c8-aae4-0b5a1f6477d8", + "metadata": {}, + "outputs": [], + "source": [ + "non_snv_dataset.push_to_hub(\n", + " \"wanglab/bioR_tasks\",\n", + " config_name=\"task4_variant_effect_non_snv\",\n", + " commit_message=\"Upload the finalized Task 4 Variant Effect Non SNV\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ca5560b-af97-4d45-91e3-c9fa5124259b", + "metadata": {}, + "outputs": [], + "source": [ + "#testing if it works\n", + "from datasets import load_dataset\n", + "from pathlib import Path\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", \"task4_variant_effect_snv\")\n", + "\n", + "# Save and optionally upload Task 4 SNV dataset\n", + "if CONFIG['save_local']:\n", + " # Save locally first\n", + " output_path = Path(CONFIG['output_dir']) / 'task4_variant_effect_snv'\n", + " output_path.mkdir(exist_ok=True)\n", + " \n", + " # Save as JSON files\n", + " ds['train'].to_json(output_path / 'train.jsonl')\n", + " ds['test'].to_json(output_path / 'test.jsonl')\n", + " print(f\"✅ Task 4 SNV dataset saved locally to: {output_path}\")\n", + "\n", + "if 
CONFIG['upload_to_hub']:\n", + " try:\n", + " ds.push_to_hub(\n", + " CONFIG['huggingface_repo'],\n", + " config_name=\"task4_variant_effect_snv\",\n", + " commit_message=\"Upload Task 4 Variant Effect SNV Data\"\n", + " )\n", + " print(f\"✅ Task 4 SNV dataset uploaded to: {CONFIG['huggingface_repo']}\")\n", + " except Exception as e:\n", + " print(f\"❌ SNV upload failed: {e}\")\n", + "else:\n", + " print(\"📝 Upload to hub disabled. Set CONFIG['upload_to_hub'] = True to enable\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88a4adba-1604-477a-b175-ec987f2812f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 290338\n", + "})" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "# Save and optionally upload Task 4 Non-SNV dataset\n", + "if CONFIG['save_local']:\n", + " # Save locally first\n", + " output_path = Path(CONFIG['output_dir']) / 'task4_variant_effect_non_snv'\n", + " output_path.mkdir(exist_ok=True)\n", + " \n", + " # Save as JSON files\n", + " non_snv_dataset['train'].to_json(output_path / 'train.jsonl')\n", + " non_snv_dataset['test'].to_json(output_path / 'test.jsonl')\n", + " print(f\"✅ Task 4 Non-SNV dataset saved locally to: {output_path}\")\n", + "\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " non_snv_dataset.push_to_hub(\n", + " CONFIG['huggingface_repo'],\n", + " config_name=\"task4_variant_effect_non_snv\",\n", + " commit_message=\"Upload Task 4 Variant Effect Non-SNV Data\"\n", + " )\n", + " print(f\"✅ Task 4 Non-SNV dataset uploaded to: {CONFIG['huggingface_repo']}\")\n", + " except Exception as e:\n", + " print(f\"❌ Non-SNV upload failed: {e}\")\n", + "else:\n", + " print(\"📝 Upload to hub disabled. 
Set CONFIG['upload_to_hub'] = True to enable\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abaa7973-7172-46a0-8a94-46653340c394", + "metadata": {}, + "outputs": [], + "source": [ + "# Validate Task 4 SNV dataset (optional verification)\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " # Test loading the uploaded dataset\n", + " ds = load_dataset(CONFIG['huggingface_repo'], \"task4_variant_effect_snv\")\n", + " print(f\"✅ Task 4 SNV dataset validation successful\")\n", + " print(f\" Train samples: {len(ds['train']):,}\")\n", + " print(f\" Test samples: {len(ds['test']):,}\")\n", + " except Exception as e:\n", + " print(f\"❌ Task 4 SNV dataset validation failed: {e}\")\n", + "else:\n", + " print(\"📝 Dataset validation skipped (upload disabled)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586e1c8d-539a-4a68-9324-441059a11be1", + "metadata": {}, + "outputs": [], + "source": [ + "#testing if it works\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wanglab/bioR_tasks\", \"task4_variant_effect_non_snv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "e572a68c-9c23-4d6f-bb3a-b5cb049d9ae5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['question', 'answer', 'reference_sequence', 'mutated_sequence', 'cleaned_pathogenicity', '__index_level_0__'],\n", + " num_rows: 35215\n", + "})" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds['train']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddfb150c-57ba-4c15-811f-5b516512cbb8", + "metadata": {}, + "outputs": [], + "source": [ + "# Validate Task 4 Non-SNV dataset (optional verification)\n", + "if CONFIG['upload_to_hub']:\n", + " try:\n", + " # Test loading the uploaded dataset\n", + " ds = load_dataset(CONFIG['huggingface_repo'], \"task4_variant_effect_non_snv\")\n", + " 
print(f\"✅ Task 4 Non-SNV dataset validation successful\")\n", + " print(f\" Train samples: {len(ds['train']):,}\")\n", + " print(f\" Test samples: {len(ds['test']):,}\")\n", + " except Exception as e:\n", + " print(f\"❌ Task 4 Non-SNV dataset validation failed: {e}\")\n", + "else:\n", + " print(\"📝 Dataset validation skipped (upload disabled)\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BioReason/grpo_trainer_lora_model/adapter_config.json b/BioReason/grpo_trainer_lora_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81163fcb43b2bdd372148c5eb4e1836e1793a414 --- /dev/null +++ b/BioReason/grpo_trainer_lora_model/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "gate_proj", + "v_proj", + "up_proj", + "q_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git 
a/BioReason/grpo_trainer_lora_model/ds_config_stage2.json b/BioReason/grpo_trainer_lora_model/ds_config_stage2.json new file mode 100644 index 0000000000000000000000000000000000000000..85b6c38b8543853487617958933cfe7d411cfe64 --- /dev/null +++ b/BioReason/grpo_trainer_lora_model/ds_config_stage2.json @@ -0,0 +1,41 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "contiguous_gradients": true, + "overlap_comm": true, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "reduce_bucket_size": 5e8 + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +}