File size: 2,593 Bytes
6d3b444 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
"""
Comprehensive validation module for antibody sequences.
Performs computational checks for various sequence properties and potential issues.
"""
# Standard library imports
import re
import json
import math
from typing import Dict, List, Tuple
class SequenceValidator:
    """Hold an amino acid sequence together with its validation settings.

    The constructor upper-cases the sequence and layers any caller-supplied
    configuration on top of the built-in defaults.
    """

    # pKa values for ionizable side chains and the two chain termini.
    # NOTE(review): the original comment states these match BioPython's
    # ProtParam implementation -- confirm against the BioPython source.
    pka_values = {
        'K': 10.0,  # Lysine
        'R': 12.0,  # Arginine
        'H': 6.0,  # Histidine
        'D': 4.0,  # Aspartic acid
        'E': 4.4,  # Glutamic acid
        'C': 8.5,  # Cysteine
        'Y': 10.0,  # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1  # C-terminus
    }

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        Args:
            sequence: The amino acid sequence to validate; stored upper-cased
                on ``self.sequence``.
            config: Optional configuration dictionary with validation
                parameters, keyed by section name. The dictionary passed in
                is never modified; a merged copy is stored on
                ``self.config``.
        """
        self.sequence = sequence.upper()

        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }

        # Merge provided config with defaults. Work on copies so the caller's
        # dictionary (and its nested section dictionaries) are left untouched
        # and can safely be reused for other validators.
        merged = dict(config) if config else {}
        for section, defaults in self.default_config.items():
            # A missing section, or one explicitly set to None, means
            # "no overrides": every default applies.
            overrides = merged.get(section) or {}
            # Caller-supplied values win; defaults fill in whatever is absent.
            merged[section] = {**defaults, **overrides}
        self.config = merged
def validate_binder(sequence: str, config: Dict = None) -> Dict:
    """
    Run the validation pipeline for one binder sequence.

    Args:
        sequence: Amino acid sequence to be validated.
        config: Optional dictionary of validation parameters, forwarded
            unchanged to ``SequenceValidator``.

    Checks listed by the original documentation:
        - Sequence length
        - Disorder prediction
        - Signal peptide presence (configurable)
        - Cysteine content and spacing
        - Glycosylation sites
        - Physicochemical properties
        - Sequence complexity

    Returns:
        Documented as a dict of comprehensive validation results.
        NOTE(review): the visible body only constructs the validator and has
        no ``return`` statement, so as written the call yields ``None`` --
        confirm whether the remainder of the function is missing.
    """
    # Build the validator that carries the sequence and its merged settings.
    validator = SequenceValidator(sequence=sequence, config=config)