healdette / modules /validate_sequences.py.tmp
Raiff1982's picture
Upload 55 files
6d3b444 verified
"""
Comprehensive validation module for antibody sequences.
Performs computational checks for various sequence properties and potential issues.
"""
# Standard library imports
import re
import json
import math
from typing import Dict, List, Tuple
class SequenceValidator:
    """Hold an amino acid sequence together with its validation configuration.

    Attributes:
        sequence: The input sequence converted to upper case.
        default_config: Built-in default validation parameters, grouped by
            section name (currently only ``"signal_peptide"``).
        config: Effective configuration: the defaults overlaid with any
            values supplied by the caller.
    """

    # pKa values for ionizable side chains (one-letter residue codes) and
    # for the two chain termini.
    # NOTE(review): the original comment states these match BioPython's
    # ProtParam implementation; confirm against Bio.SeqUtils before relying
    # on that.
    pka_values = {
        'K': 10.0,      # Lysine
        'R': 12.0,      # Arginine
        'H': 6.0,       # Histidine
        'D': 4.0,       # Aspartic acid
        'E': 4.4,       # Glutamic acid
        'C': 8.5,       # Cysteine
        'Y': 10.0,      # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1   # C-terminus
    }

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        The caller's ``config`` dictionary is never modified: the effective
        configuration is assembled in a new dictionary, so one config object
        can safely be reused across several validators.

        Args:
            sequence: The amino acid sequence to validate
            config: Optional configuration dictionary with validation
                parameters. Sections and parameters not supplied fall back
                to the defaults; unknown sections and parameters are kept.
        """
        self.sequence = sequence.upper()

        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }

        # Merge provided config with defaults. Work on copies so the
        # caller's dictionaries (top level and per-section) stay untouched.
        user_config = config or {}
        merged = dict(user_config)
        for key, default_values in self.default_config.items():
            # A section explicitly set to None is treated as "not provided".
            user_section = user_config.get(key) or {}
            # User-supplied values win over defaults; extra keys survive.
            merged[key] = {**default_values, **user_section}
        self.config = merged
def validate_binder(sequence: str, config: Dict = None) -> Dict:
"""
Perform comprehensive validation of a single binder sequence.
Args:
sequence: The amino acid sequence to validate
config: Optional configuration dictionary with validation parameters
Checks:
- Sequence length
- Disorder prediction
- Signal peptide presence (configurable)
- Cysteine content and spacing
- Glycosylation sites
- Physicochemical properties
- Sequence complexity
Returns:
Dict containing comprehensive validation results
"""
validator = SequenceValidator(sequence, config)