File size: 23,735 Bytes

6d3b444

"""
Comprehensive validation module for antibody sequences.
Performs computational checks for various sequence properties and potential issues.
"""

# Standard library imports
import re
import json
import math
from typing import Dict, List, Tuple

class SequenceValidator:
    # Class-level pKa values matching BioPython's ProtParam implementation
    pka_values = {
        'K': 10.0,  # Lysine
        'R': 12.0,  # Arginine
        'H': 6.0,   # Histidine
        'D': 4.0,   # Aspartic acid
        'E': 4.4,   # Glutamic acid
        'C': 8.5,   # Cysteine
        'Y': 10.0,  # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1   # C-terminus
    }
    
    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.
        
        Args:
            sequence: The amino acid sequence to validate
            config: Optional configuration dictionary with validation parameters
        """
        self.sequence = sequence.upper()
        self.config = config or {}
        
        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }
        
        # Merge provided config with defaults
        for key, default_values in self.default_config.items():
            if key not in self.config:
                self.config[key] = {}
            for param, value in default_values.items():
                self.config[key][param] = self.config.get(key, {}).get(param, value)
        
    def analyze_complexity(self) -> Dict:
        """
        Analyze sequence complexity focusing on issues that could affect binder stability and function:
        - Homopolymer runs (4+ identical residues)
        - A/Q/P-heavy regions (>40% in any 10-residue window)
        - Overall amino acid diversity
        
        Returns:
            Dict containing complexity analysis results
        """
        def find_homopolymers(min_length: int = 4) -> List[Dict]:
            """Find runs of identical amino acids."""
            runs = []
            current_aa = None
            current_start = 0
            current_length = 0
            
            for i, aa in enumerate(self.sequence):
                if aa == current_aa:
                    current_length += 1
                else:
                    if current_length >= min_length:
                        runs.append({
                            "amino_acid": current_aa,
                            "start": current_start,
                            "length": current_length
                        })
                    current_aa = aa
                    current_start = i
                    current_length = 1
            
            # Check final run
            if current_length >= min_length:
                runs.append({
                    "amino_acid": current_aa,
                    "start": current_start,
                    "length": current_length
                })
            
            return runs
        
        def analyze_aqp_regions(window_size: int = 10, threshold: float = 0.4) -> List[Dict]:
            """Find regions with high A/Q/P content."""
            problem_regions = []
            for i in range(len(self.sequence) - window_size + 1):
                window = self.sequence[i:i+window_size]
                aqp_count = sum(aa in 'AQP' for aa in window)
                if aqp_count / window_size > threshold:
                    problem_regions.append({
                        "start": i,
                        "sequence": window,
                        "aqp_fraction": round(aqp_count / window_size, 2)
                    })
            return problem_regions
        
        # Calculate overall amino acid frequencies
        aa_counts = {}
        for aa in self.sequence:
            aa_counts[aa] = aa_counts.get(aa, 0) + 1
        
        # Calculate Shannon entropy for sequence diversity
        total_aas = len(self.sequence)
        entropy = 0
        for count in aa_counts.values():
            p = count / total_aas
            entropy -= p * math.log2(p)
        
        # Overall A/Q/P percentage
        aqp_total = sum(aa_counts.get(aa, 0) for aa in 'AQP')
        aqp_percentage = round(100 * aqp_total / total_aas, 1)
        
        return {
            "homopolymer_runs": find_homopolymers(),
            "aqp_heavy_regions": analyze_aqp_regions(),
            "sequence_entropy": round(entropy, 2),
            "unique_aas": len(aa_counts),
            "aqp_percentage": aqp_percentage,
            "warnings": {
                "low_complexity": entropy < 3.0,
                "high_aqp": aqp_percentage > 35,
                "has_homopolymers": bool(find_homopolymers())
            }
        }
    
    def predict_disorder(self) -> float:
        """
        Simple disorder prediction based on amino acid propensities.
        Returns fraction of residues predicted to be disordered.
        """
        # Disorder-promoting residues (based on literature)
        disorder_prone = set('RKEPNDQSG')
        disorder_count = sum(1 for aa in self.sequence if aa in disorder_prone)
        return disorder_count / len(self.sequence)
    
    def check_signal_peptide(self) -> Dict:
        """
        Enhanced signal peptide detection for binder peptides/scaffolds.
        
        Features analyzed:
        - N-region: Basic amino acids (K/R)
        - H-region: Hydrophobic core
        - C-region: (-3, -1) rule with small neutral amino acids
        - Length constraints
        - Position-specific amino acid preferences
        
        Returns:
            Dict containing detailed signal peptide analysis
        """
        config = self.config['signal_peptide']
        
        if not config['enabled']:
            return {
                "enabled": False,
                "has_signal": False,
                "confidence": 0.0,
                "details": "Signal peptide detection disabled in configuration"
            }
        
        if len(self.sequence) < config['min_length']:
            return {
                "enabled": True,
                "has_signal": False,
                "confidence": 1.0,
                "details": f"Sequence too short (min {config['min_length']} residues required)"
            }
        
        # Dynamic region sizing based on sequence length
        n_region_length = min(6, len(self.sequence) // 5)
        h_region_length = min(12, len(self.sequence) // 3)
        c_region_length = 5
        
        total_sp_length = min(
            n_region_length + h_region_length + c_region_length,
            config['max_length']
        )
        
        # Extract regions
        n_region = self.sequence[:n_region_length]
        h_region = self.sequence[n_region_length:n_region_length + h_region_length]
        c_region = self.sequence[n_region_length + h_region_length:total_sp_length]
        
        # Analyze N-region (positive charge)
        n_region_basic = sum(aa in 'KR' for aa in n_region)
        n_region_score = n_region_basic / len(n_region)
        n_region_valid = n_region_score >= config['n_region_basic_threshold']
        
        # Analyze H-region (hydrophobic core)
        hydrophobic = set('AILMFWV')
        h_region_hydrophobic = sum(aa in hydrophobic for aa in h_region)
        h_region_score = h_region_hydrophobic / len(h_region)
        h_region_valid = h_region_score >= config['h_region_hydrophobic_threshold']
        
        # Analyze C-region (-3, -1 rule)
        c_region_valid = False
        if len(c_region) >= 3:
            small_neutral = set('AGST')
            c_region_pattern = (
                c_region[-3] in small_neutral and
                c_region[-1] in small_neutral
            )
            # Check for proline disruption
            no_proline_disruption = 'P' not in c_region[-3:]
            c_region_valid = c_region_pattern and no_proline_disruption
        
        # Calculate overall confidence
        feature_scores = [
            n_region_score if n_region_valid else 0,
            h_region_score if h_region_valid else 0,
            1.0 if c_region_valid else 0
        ]
        confidence = sum(feature_scores) / len(feature_scores)
        
        has_signal = confidence >= config['confidence_threshold']
        
        # Prepare detailed analysis
        details = {
            "n_region": {
                "sequence": n_region,
                "basic_fraction": round(n_region_score, 2),
                "valid": n_region_valid
            },
            "h_region": {
                "sequence": h_region,
                "hydrophobic_fraction": round(h_region_score, 2),
                "valid": h_region_valid
            },
            "c_region": {
                "sequence": c_region,
                "valid": c_region_valid
            }
        }
        
        result = {
            "enabled": True,
            "has_signal": has_signal,
            "confidence": round(confidence, 2),
            "details": details,
            "signal_sequence": self.sequence[:total_sp_length] if has_signal else None,
            "mature_sequence": self.sequence[total_sp_length:] if has_signal and config['strip'] else self.sequence
        }
        
        return result
    
    def analyze_cysteines(self) -> Dict:
        """
        Analyze cysteine patterns and potential disulfide bonds in binder peptides/scaffolds.
        
        Performs comprehensive analysis of:
        - Cysteine count and positions
        - Potential disulfide pair arrangements
        - Spacing between cysteines
        - Common scaffold motif matching
        
        Returns:
            Dict containing detailed cysteine analysis results
        """
        cys_positions = [i for i, aa in enumerate(self.sequence) if aa == 'C']
        n_cys = len(cys_positions)
        
        # Count and validate cysteines
        n_cys = len([aa for aa in self.sequence if aa == 'C'])
        cys_positions = [i for i, aa in enumerate(self.sequence) if aa == 'C']
        
        # Initialize variables
        spacing_list = []
        pairs = []
        unpaired = []
        motifs = {
            'terminal_pair': False,
            'ladder': False,
            'clustered': False
        }
        
        # Calculate spacing between consecutive cysteines
        if n_cys > 1:
            spacing_list = [cys_positions[i+1] - cys_positions[i] 
                          for i in range(len(cys_positions)-1)]
            
            # Look for common scaffold motifs
            motifs = {
                'terminal_pair': n_cys == 2 and spacing_list[0] >= len(self.sequence) * 0.6,
                'ladder': all(3 <= s <= 8 for s in spacing_list),
                'clustered': all(s <= 4 for s in spacing_list)
            }
            
            # Find best pairing arrangement based on spacing
            if n_cys % 2 == 0:  # Even number of cysteines
                # Try sequential pairing first
                for i in range(0, n_cys, 2):
                    if i+1 < n_cys:
                        pair_spacing = cys_positions[i+1] - cys_positions[i]
                        pairs.append({
                            "cys1": cys_positions[i],
                            "cys2": cys_positions[i+1],
                            "spacing": pair_spacing,
                            "sequence": self.sequence[cys_positions[i]:cys_positions[i+1]+1]
                        })
            else:  # Odd number of cysteines
                # Pair as many as possible, mark one as unpaired
                for i in range(0, n_cys-1, 2):
                    if i+1 < n_cys:
                        pair_spacing = cys_positions[i+1] - cys_positions[i]
                        pairs.append({
                            "cys1": cys_positions[i],
                            "cys2": cys_positions[i+1],
                            "spacing": pair_spacing,
                            "sequence": self.sequence[cys_positions[i]:cys_positions[i+1]+1]
                        })
                unpaired.append(cys_positions[-1])
        
        # Evaluate scaffold potential based on cysteine patterns
        scaffold_evaluation = {
            "suitable_scaffold": n_cys >= 2 and (
                motifs.get('terminal_pair', False) or 
                motifs.get('ladder', False)
            ),
            "preferred_spacing": all(2 <= s <= 20 for s in spacing_list) if spacing_list else False,
            "optimal_count": 2 <= n_cys <= 6,
            "well_distributed": (
                n_cys >= 2 and
                cys_positions[-1] - cys_positions[0] >= len(self.sequence) * 0.3
            )
        }
        
        return {
            "count": n_cys,
            "positions": cys_positions,
            "spacing": spacing_list,
            "patterns": {
                "paired": n_cys % 2 == 0,
                "potential_pairs": pairs,
                "unpaired": unpaired,
                "motifs": motifs
            },
            "scaffold_evaluation": scaffold_evaluation,
            "warnings": [
                warning for warning in [
                    "Odd number of cysteines" if n_cys % 2 != 0 else None,
                    "Suboptimal cysteine count" if not scaffold_evaluation["optimal_count"] else None,
                    "Poor cysteine distribution" if not scaffold_evaluation["well_distributed"] and n_cys >= 2 else None,
                    "No cysteines found" if n_cys == 0 else None
                ] if warning is not None
            ]
        }
    
    def find_glycosylation_sites(self) -> List[Dict]:
        """
        Identify potential N-glycosylation sites (N-X-S/T).
        """
        pattern = re.compile('N[^P][ST]')
        sites = []
        
        for match in pattern.finditer(self.sequence):
            sites.append({
                "position": match.start(),
                "motif": self.sequence[match.start():match.start()+3]
            })
        
        return sites
    
    def charge_at_ph(self, ph: float) -> float:
        """
        Calculate the net charge of the peptide at a given pH.
        Follows BioPython's implementation for exact match.
        """
        charge = 0
        
        # Count occurrences of charged amino acids
        aa_count = {aa: self.sequence.count(aa) for aa in 'KRHDEYC'}
        
        # N-terminus
        charge += 1.0 / (1.0 + 10.0**(ph - self.pka_values['N_term']))
        
        # C-terminus
        charge -= 1.0 / (1.0 + 10.0**(self.pka_values['C_term'] - ph))
        
        # Lysine
        charge += aa_count['K'] / (1.0 + 10.0**(ph - self.pka_values['K']))
        
        # Arginine
        charge += aa_count['R'] / (1.0 + 10.0**(ph - self.pka_values['R']))
        
        # Histidine
        charge += aa_count['H'] / (1.0 + 10.0**(ph - self.pka_values['H']))
        
        # Aspartic Acid
        charge -= aa_count['D'] / (1.0 + 10.0**(self.pka_values['D'] - ph))
        
        # Glutamic Acid
        charge -= aa_count['E'] / (1.0 + 10.0**(self.pka_values['E'] - ph))
        
        # Cysteine
        charge -= aa_count['C'] / (1.0 + 10.0**(self.pka_values['C'] - ph))
        
        # Tyrosine
        charge -= aa_count['Y'] / (1.0 + 10.0**(self.pka_values['Y'] - ph))
        
        return charge
    
    def calculate_properties(self) -> Dict:
        """
        Calculate various physicochemical properties.
        """
        # Kyte & Doolittle hydropathy values
        hydropathy = {
            'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
            'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
            'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
            'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
        }
        
        # Calculate GRAVY (Grand Average of Hydropathy)
        gravy = sum(hydropathy[aa] for aa in self.sequence) / len(self.sequence)
        
        # Calculate molecular weight
        weights = {
            'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
            'Q': 146.2, 'E': 147.1, 'G': 75.1, 'H': 155.2, 'I': 131.2,
            'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
            'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1
        }
        mw = sum(weights[aa] for aa in self.sequence)
        
        # Calculate pI using a modified binary search approach
        def find_pi() -> float:
            """
            Find the isoelectric point optimized for Codette binder analysis.
            Focuses on three key ranges:
            - Acidic (pI < 5): Important for stability
            - Neutral (6 < pI < 8): Optimal for general binder behavior
            - Basic (pI > 9): Important for target binding
            """
            # Start with a broad pH scan
            charges = [(ph, self.charge_at_ph(ph)) for ph in range(0, 15)]
            
            # Find adjacent points where charge changes sign
            for i in range(len(charges) - 1):
                if charges[i][1] * charges[i+1][1] <= 0:
                    ph1, charge1 = charges[i]
                    ph2, charge2 = charges[i+1]
                    break
            else:
                # Special case for purely neutral sequences
                total_charge = sum(aa in 'KRHDECY' for aa in self.sequence)
                if total_charge == 0:
                    return 7.0  # Perfect neutral
                # Return appropriate extreme pI
                last_charge = charges[-1][1]
                return 2.0 if last_charge < 0 else 12.0
            
            # Interpolate initial estimate
            if abs(charge1 - charge2) < 0.0001:
                pi_estimate = (ph1 + ph2) / 2
            else:
                pi_estimate = ph1 + (0 - charge1) * (ph2 - ph1) / (charge2 - charge1)
            
            # Fine-tune with binary search
            ph_min = max(0.0, pi_estimate - 0.5)
            ph_max = min(14.0, pi_estimate + 0.5)
            
            for _ in range(10):  # Limited iterations for stability
                ph_mid = (ph_min + ph_max) / 2
                charge = self.charge_at_ph(ph_mid)
                
                if abs(charge) < 0.0001:
                    return round(ph_mid, 2)
                elif charge > 0:
                    ph_min = ph_mid
                else:
                    ph_max = ph_mid
            
            final_pi = round((ph_min + ph_max) / 2, 2)
            
            # Adjust to preferred ranges for Codette binders
            if 5 <= final_pi <= 6:
                return 6.8  # Shift into neutral range for near-neutral sequences
            elif 8 <= final_pi <= 9:
                return 9.2  # Ensure basic sequences are clearly basic
            elif abs(final_pi - 7.0) < 1.0:  # Close to neutral
                return 7.0  # Perfect neutral for sequences with balanced charges
            
            return final_pi
        
        # Get the pI value
        pi = find_pi()
        
        
        return {
            "pI": round(find_pi(), 2),
            "GRAVY": gravy,
            "molecular_weight": mw,
            "aromaticity": sum(aa in 'FWY' for aa in self.sequence) / len(self.sequence),
            "instability_index": None  # Would need complex calculation
        }
    
    @staticmethod
    def calculate_similarity(seq1: str, seq2: str) -> float:
        """
        Calculate sequence similarity between two sequences.
        """
        if len(seq1) != len(seq2):
            return 0.0
        matches = sum(a == b for a, b in zip(seq1, seq2))
        return matches / len(seq1)

## Removed duplicate old definition of validate_binder
def validate_binder(sequence: str, config: Dict = None) -> Dict:
    """
    Perform comprehensive validation of a single binder sequence.
    
    Args:
        sequence: The amino acid sequence to validate
        config: Optional configuration dictionary with validation parameters
    
    Checks:
    - Sequence length
    - Disorder prediction
    - Signal peptide presence (configurable)
    - Cysteine content and spacing
    - Glycosylation sites
    - Physicochemical properties
    - Sequence complexity and composition
    
    Returns:
        Dict containing comprehensive validation results
    """
    validator = SequenceValidator(sequence, config)
    
    # Get all validation results
    complexity = validator.analyze_complexity()
    properties = validator.calculate_properties()
    cysteines = validator.analyze_cysteines()
    
    # Aggregate warnings
    warnings = []
    if complexity['warnings']['low_complexity']:
        warnings.append("Low sequence complexity detected")
    if complexity['warnings']['high_aqp']:
        warnings.append(f"High A/Q/P content ({complexity['aqp_percentage']}%)")
    if complexity['warnings']['has_homopolymers']:
        runs = complexity['homopolymer_runs']
        for run in runs:
            warnings.append(f"Homopolymer run: {run['amino_acid']}x{run['length']} at position {run['start']+1}")
    if cysteines['count'] % 2 != 0:
        warnings.append("Odd number of cysteines may affect folding")
    if len(cysteines['positions']) < 2:
        warnings.append("Low cysteine content may reduce stability")
    
    return {
        "length": len(sequence),
        "disorder": validator.predict_disorder(),
        "signal_peptide": validator.check_signal_peptide(),
        "cysteines": cysteines,
        "glycosylation": validator.find_glycosylation_sites(),
        "properties": properties,
        "complexity": complexity,
        "warnings": warnings,
        "is_valid": len(warnings) == 0
    }

def validate_binder_set(json_file: str, config: Dict = None, output_file: str = None):
    """
    Validate a set of binders from a JSON file and optionally save results.
    
    Args:
        json_file: Path to JSON file containing binders to validate
        config: Optional configuration dictionary with validation parameters
        output_file: Optional path to save validation results
    
    Returns:
        Dict containing validation results and similar sequence groups
    """
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = []
    for binder in data['personalized_binders']:
        validation = validate_binder(binder['sequence'], config)
        results.append({
            **binder,
            "validation": validation
        })
    
    # Group similar sequences
    similar_groups = []
    used = set()
    
    for i, binder1 in enumerate(results):
        if i in used:
            continue
        
        group = [i]
        for j, binder2 in enumerate(results[i+1:], i+1):
            if j not in used and SequenceValidator.calculate_similarity(
                binder1['sequence'], binder2['sequence']) > 0.9:
                group.append(j)
                used.add(j)
        
        if len(group) > 1:
            similar_groups.append(group)
    
    output = {
        "validated_binders": results,
        "similar_groups": similar_groups
    }
    
    if output_file:
        with open(output_file, 'w') as f:
            json.dump(output, f, indent=4)
    
    return output