File size: 2,593 Bytes
6d3b444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Comprehensive validation module for antibody sequences.
Performs computational checks for various sequence properties and potential issues.
"""

# Standard library imports
import re
import json
import math
from typing import Dict, List, Tuple

class SequenceValidator:
    """Hold an amino acid sequence together with its validation configuration.

    The constructor upper-cases the sequence and merges any caller-supplied
    configuration with the built-in defaults, so that every default
    parameter is always present in ``self.config``.
    """

    # Class-level pKa values for ionisable side chains and the two termini.
    # NOTE(review): the original comment states these match BioPython's
    # ProtParam implementation -- confirm against Bio.SeqUtils before
    # relying on that equivalence.
    pka_values = {
        'K': 10.0,  # Lysine
        'R': 12.0,  # Arginine
        'H': 6.0,   # Histidine
        'D': 4.0,   # Aspartic acid
        'E': 4.4,   # Glutamic acid
        'C': 8.5,   # Cysteine
        'Y': 10.0,  # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1   # C-terminus
    }

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        Args:
            sequence: The amino acid sequence to validate. It is stored
                upper-cased in ``self.sequence``.
            config: Optional configuration dictionary with validation
                parameters, keyed by section name (e.g. ``"signal_peptide"``).
                Values given here override the defaults; parameters that are
                missing are filled in from ``self.default_config``. The
                caller's dictionary is never modified.
        """
        self.sequence = sequence.upper()

        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }

        # Merge provided config with defaults into a NEW dictionary. Writing
        # the defaults into the caller's dict (as a plain ``config or {}``
        # followed by item assignment would) leaks state between validators
        # that share one config object.
        user_config = config or {}
        merged = dict(user_config)  # keeps sections that have no defaults
        for key, default_values in self.default_config.items():
            # A section explicitly set to None is treated as "no overrides".
            overrides = user_config.get(key) or {}
            # Later entries win, so user-supplied parameters take precedence.
            merged[key] = {**default_values, **overrides}
        self.config = merged

def validate_binder(sequence: str, config: Dict = None) -> Dict:
    """
    Perform comprehensive validation of a single binder sequence.
    
    Args:
        sequence: The amino acid sequence to validate
        config: Optional configuration dictionary with validation parameters
    
    Checks:
    - Sequence length
    - Disorder prediction
    - Signal peptide presence (configurable)
    - Cysteine content and spacing
    - Glycosylation sites
    - Physicochemical properties
    - Sequence complexity
    
    Returns:
        Dict containing comprehensive validation results
    """
    validator = SequenceValidator(sequence, config)