Mutation_XAI / prefilter.py
nileshhanotia's picture
prefilter.py
62f19f0 verified
"""prefilter.py — PeVe v1.1"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
from config import VEP_CONSEQUENCE_MAP, L3_SUBSTITUTION_INVALID
@dataclass
class VariantClass:
    """Result of pre-filtering one variant, as built by ``classify_variant``.

    Instances are constructed positionally, so the field order below is part
    of the interface and must not be rearranged.
    """

    # Primary VEP consequence term, lower-cased and stripped.
    raw_consequence: str
    # Every supplied consequence term, each lower-cased and stripped.
    all_consequences: list[str]
    # Class label: a VEP_CONSEQUENCE_MAP value, a label inferred from the
    # alleles, or "mnv".
    variant_class: str
    # False when variant_class is listed in L3_SUBSTITUTION_INVALID
    # (and always False for MNVs).
    l3_substitution_valid: bool
    # True only for the "canonical_splice" class.
    rna_priority: bool
    # True only for the "substitution_missense" class.
    protein_priority: bool
    # True only for the "substitution_synonymous" class.
    protein_deprioritised: bool
    # True when the supplied consequences map to more than one class.
    transcript_conflict: bool
    # True for classes the classifier treats as out of scope
    # (e.g. "utr_regulatory", "mnv").
    out_of_scope: bool
    # Human-readable caveats attached during classification.
    flags: list[str] = field(default_factory=list)
def classify_variant(ref, alt, vep_consequence, all_vep_consequences=None):
    """Classify a variant from its alleles and VEP consequence term(s).

    Args:
        ref: Reference allele string.
        alt: Alternate allele string.
        vep_consequence: Primary VEP consequence term. It is lower-cased and
            stripped before being looked up in ``VEP_CONSEQUENCE_MAP``.
        all_vep_consequences: Consequence terms reported across transcripts.
            When omitted or empty, only ``vep_consequence`` is considered.

    Returns:
        A ``VariantClass`` carrying the class label, the priority and scope
        booleans derived from it, and a list of human-readable flags.
    """
    # Treat an explicit empty list like "not supplied" so the primary
    # consequence is always represented in all_consequences.
    if not all_vep_consequences:
        all_vep_consequences = [vep_consequence]
    cons = vep_consequence.lower().strip()
    all_cons = [c.lower().strip() for c in all_vep_consequences]

    # MNV detection: equal-length, multi-base alleles are returned early as
    # out of scope, before any consequence lookup.
    if len(ref) > 1 and len(alt) > 1 and len(ref) == len(alt):
        return VariantClass(
            raw_consequence=cons,
            all_consequences=all_cons,
            variant_class="mnv",
            l3_substitution_valid=False,
            rna_priority=False,
            protein_priority=False,
            protein_deprioritised=False,
            transcript_conflict=False,
            out_of_scope=True,
            flags=["MNV: single-variant assessment may be incomplete"],
        )

    variant_class = VEP_CONSEQUENCE_MAP.get(cons, "unknown")
    # Remember whether the class came from the map or had to be guessed from
    # allele lengths; a guessed class must not look authoritative downstream.
    inferred = variant_class == "unknown"
    if inferred:
        variant_class = _infer(ref, alt)

    # Unmapped terms count as their own "unknown" class here, so a mix of
    # mapped and unmapped transcripts is reported as a conflict.
    mapped = {VEP_CONSEQUENCE_MAP.get(c, "unknown") for c in all_cons}
    tx_conflict = len(mapped) > 1

    l3_valid = variant_class not in L3_SUBSTITUTION_INVALID
    rna_priority = variant_class == "canonical_splice"
    protein_priority = variant_class == "substitution_missense"
    protein_deprio = variant_class == "substitution_synonymous"
    out_of_scope = variant_class in {"utr_regulatory", "mnv", "unknown"}

    flags = []
    if variant_class == "utr_regulatory":
        flags.append("UTR/regulatory: no mechanism pathway in PeVe v1.1.")
    if variant_class in {"frameshift", "stop_gained", "start_lost"}:
        flags.append(f"{variant_class}: Layer 3 substitution metrics NOT APPLICABLE.")
    if variant_class == "in_frame_indel":
        flags.append("In-frame indel: substitution biochemistry NOT APPLICABLE.")
    if variant_class == "deep_intronic":
        flags.append("Deep intronic: RNA interpretation down-prioritised.")
    if variant_class == "substitution_synonymous":
        flags.append("Synonymous: context signal alone cannot classify pathogenic.")
    if tx_conflict:
        flags.append("Transcript conflict: consequence differs across transcripts.")
    if inferred:
        flags.append(
            f"VEP consequence {cons!r} not recognised: "
            f"class {variant_class!r} inferred from allele lengths."
        )
    # Defensive: only reachable if the allele-based fallback ever yields
    # "unknown" itself.
    if variant_class == "unknown":
        flags.append("Variant class unknown — outputs are exploratory only.")

    return VariantClass(
        cons,
        all_cons,
        variant_class,
        l3_valid,
        rna_priority,
        protein_priority,
        protein_deprio,
        tx_conflict,
        out_of_scope,
        flags,
    )
def _infer(ref, alt):
if len(ref) == 1 and len(alt) == 1:
return "substitution_missense"
diff = len(alt) - len(ref)
if diff % 3 == 0:
return "in_frame_indel"
return "frameshift"