# Source path: healdette/modules/validate_sequences.py
# Hosting-page residue preserved as comments: "Raiff1982's picture",
# "Upload 55 files", "6d3b444 verified".
"""
Comprehensive validation module for antibody sequences.
Performs computational checks for various sequence properties and potential issues.
"""
# Standard library imports
import re
import json
import math
from typing import Dict, List, Tuple
class SequenceValidator:
    """
    Run computational sanity checks on a single amino-acid sequence.

    The sequence is upper-cased on construction. Each public method performs
    one independent analysis and returns plain Python data (dicts, lists,
    floats), so results can be serialized directly.
    """

    # Side-chain and terminal pKa values used by charge_at_ph().
    # NOTE(review): originally described as matching BioPython's ProtParam
    # implementation -- confirm against BioPython before relying on exact
    # numerical agreement.
    pka_values = {
        'K': 10.0,      # Lysine
        'R': 12.0,      # Arginine
        'H': 6.0,       # Histidine
        'D': 4.0,       # Aspartic acid
        'E': 4.4,       # Glutamic acid
        'C': 8.5,       # Cysteine
        'Y': 10.0,      # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1   # C-terminus
    }

    # Kyte & Doolittle hydropathy values for the 20 standard residues.
    _HYDROPATHY = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
    }

    # Masses (Da) of the free amino acids; one water is lost per peptide bond.
    _AA_WEIGHTS = {
        'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
        'Q': 146.2, 'E': 147.1, 'G': 75.1, 'H': 155.2, 'I': 131.2,
        'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
        'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1
    }

    # Mass (Da) of the water molecule released when a peptide bond forms.
    _WATER_WEIGHT = 18.015

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        Args:
            sequence: The amino acid sequence to validate (upper-cased here).
            config: Optional configuration dictionary with validation
                parameters. It is never modified; defaults are merged into
                a private copy stored on ``self.config``.
        """
        self.sequence = sequence.upper()
        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }
        # Merge into a fresh dict so the caller's config is left untouched;
        # user-supplied values win over defaults, unknown keys are kept.
        user_config = config or {}
        merged = dict(user_config)
        for section, defaults in self.default_config.items():
            overrides = user_config.get(section) or {}
            merged[section] = {**defaults, **overrides}
        self.config = merged

    def _find_homopolymers(self, min_length: int = 4) -> List[Dict]:
        """Return runs of at least ``min_length`` identical residues."""
        seq = self.sequence
        runs = []
        run_start = 0
        for i in range(1, len(seq) + 1):
            # A run ends at the end of the sequence or when the residue changes.
            if i == len(seq) or seq[i] != seq[run_start]:
                run_length = i - run_start
                if run_length >= min_length:
                    runs.append({
                        "amino_acid": seq[run_start],
                        "start": run_start,
                        "length": run_length
                    })
                run_start = i
        return runs

    def _find_aqp_regions(self, window_size: int = 10, threshold: float = 0.4) -> List[Dict]:
        """Return sliding windows whose A/Q/P fraction exceeds ``threshold``."""
        seq = self.sequence
        problem_regions = []
        for i in range(len(seq) - window_size + 1):
            window = seq[i:i + window_size]
            aqp_fraction = sum(aa in 'AQP' for aa in window) / window_size
            if aqp_fraction > threshold:
                problem_regions.append({
                    "start": i,
                    "sequence": window,
                    "aqp_fraction": round(aqp_fraction, 2)
                })
        return problem_regions

    def analyze_complexity(self) -> Dict:
        """
        Analyze sequence complexity:
        - Homopolymer runs (4+ identical residues)
        - A/Q/P-heavy regions (>40% in any 10-residue window)
        - Overall amino acid diversity (Shannon entropy, in bits)

        An empty sequence yields zero entropy and zero A/Q/P percentage
        instead of raising ZeroDivisionError.

        Returns:
            Dict with keys "homopolymer_runs", "aqp_heavy_regions",
            "sequence_entropy", "unique_aas", "aqp_percentage" and
            "warnings" (flags "low_complexity", "high_aqp",
            "has_homopolymers").
        """
        seq = self.sequence
        total_aas = len(seq)

        # Overall amino acid frequencies
        aa_counts = {}
        for aa in seq:
            aa_counts[aa] = aa_counts.get(aa, 0) + 1

        # Shannon entropy for sequence diversity
        entropy = 0.0
        for count in aa_counts.values():
            p = count / total_aas
            entropy -= p * math.log2(p)

        # Overall A/Q/P percentage
        aqp_total = sum(aa_counts.get(aa, 0) for aa in 'AQP')
        aqp_percentage = round(100 * aqp_total / total_aas, 1) if total_aas else 0.0

        # Computed once and reused for both the report and the warning flag.
        homopolymers = self._find_homopolymers()

        return {
            "homopolymer_runs": homopolymers,
            "aqp_heavy_regions": self._find_aqp_regions(),
            "sequence_entropy": round(entropy, 2),
            "unique_aas": len(aa_counts),
            "aqp_percentage": aqp_percentage,
            "warnings": {
                "low_complexity": entropy < 3.0,
                "high_aqp": aqp_percentage > 35,
                "has_homopolymers": bool(homopolymers)
            }
        }

    def predict_disorder(self) -> float:
        """
        Simple disorder estimate based on residue composition.

        Returns:
            Fraction of residues belonging to the set R, K, E, P, N, D, Q,
            S, G; 0.0 for an empty sequence.
        """
        if not self.sequence:
            return 0.0
        disorder_prone = set('RKEPNDQSG')
        disorder_count = sum(1 for aa in self.sequence if aa in disorder_prone)
        return disorder_count / len(self.sequence)

    def check_signal_peptide(self) -> Dict:
        """
        Heuristic N-terminal signal peptide detection.

        Features analyzed:
        - N-region: fraction of basic residues (K/R)
        - H-region: fraction of hydrophobic residues (A, I, L, M, F, W, V)
        - C-region: (-3, -1) rule with small residues (A, G, S, T) and no
          proline in the last three positions

        The confidence is the mean of the three feature scores, where a
        feature that fails its threshold contributes 0.

        Returns:
            Dict with "enabled", "has_signal", "confidence" and "details".
            When detection actually runs, "signal_sequence" and
            "mature_sequence" are included as well.
        """
        config = self.config['signal_peptide']
        seq = self.sequence

        if not config['enabled']:
            return {
                "enabled": False,
                "has_signal": False,
                "confidence": 0.0,
                "details": "Signal peptide detection disabled in configuration"
            }

        if len(seq) < config['min_length']:
            return {
                "enabled": True,
                "has_signal": False,
                "confidence": 1.0,
                "details": f"Sequence too short (min {config['min_length']} residues required)"
            }

        # Dynamic region sizing based on sequence length
        n_region_length = min(6, len(seq) // 5)
        h_region_length = min(12, len(seq) // 3)
        c_region_length = 5
        total_sp_length = min(
            n_region_length + h_region_length + c_region_length,
            config['max_length']
        )

        # Extract regions
        h_region_end = n_region_length + h_region_length
        n_region = seq[:n_region_length]
        h_region = seq[n_region_length:h_region_end]
        c_region = seq[h_region_end:total_sp_length]

        # N-region (positive charge). A small configured min_length can make
        # a region empty; score it 0.0 rather than dividing by zero.
        n_region_score = (
            sum(aa in 'KR' for aa in n_region) / len(n_region) if n_region else 0.0
        )
        n_region_valid = n_region_score >= config['n_region_basic_threshold']

        # H-region (hydrophobic core)
        hydrophobic = set('AILMFWV')
        h_region_score = (
            sum(aa in hydrophobic for aa in h_region) / len(h_region) if h_region else 0.0
        )
        h_region_valid = h_region_score >= config['h_region_hydrophobic_threshold']

        # C-region (-3, -1 rule)
        c_region_valid = False
        if len(c_region) >= 3:
            small_neutral = set('AGST')
            c_region_pattern = (
                c_region[-3] in small_neutral and
                c_region[-1] in small_neutral
            )
            no_proline_disruption = 'P' not in c_region[-3:]
            c_region_valid = c_region_pattern and no_proline_disruption

        # Overall confidence: failed features contribute 0
        feature_scores = [
            n_region_score if n_region_valid else 0,
            h_region_score if h_region_valid else 0,
            1.0 if c_region_valid else 0
        ]
        confidence = sum(feature_scores) / len(feature_scores)
        has_signal = confidence >= config['confidence_threshold']

        details = {
            "n_region": {
                "sequence": n_region,
                "basic_fraction": round(n_region_score, 2),
                "valid": n_region_valid
            },
            "h_region": {
                "sequence": h_region,
                "hydrophobic_fraction": round(h_region_score, 2),
                "valid": h_region_valid
            },
            "c_region": {
                "sequence": c_region,
                "valid": c_region_valid
            }
        }

        return {
            "enabled": True,
            "has_signal": has_signal,
            "confidence": round(confidence, 2),
            "details": details,
            "signal_sequence": seq[:total_sp_length] if has_signal else None,
            # The peptide is only removed when stripping is requested.
            "mature_sequence": seq[total_sp_length:] if has_signal and config['strip'] else seq
        }

    def analyze_cysteines(self) -> Dict:
        """
        Analyze cysteine patterns and potential disulfide bonds.

        Reports:
        - Cysteine count and 0-based positions
        - Spacing between consecutive cysteines
        - Sequential pairing (1st with 2nd, 3rd with 4th, ...); with an odd
          count the last cysteine is reported as unpaired
        - Simple motif flags ("terminal_pair", "ladder", "clustered")

        Returns:
            Dict with "count", "positions", "spacing", "patterns",
            "scaffold_evaluation" and "warnings".
        """
        seq = self.sequence
        cys_positions = [i for i, aa in enumerate(seq) if aa == 'C']
        n_cys = len(cys_positions)

        # Spacing between consecutive cysteines (empty for fewer than two)
        spacing_list = [
            later - earlier
            for earlier, later in zip(cys_positions, cys_positions[1:])
        ]

        motifs = {
            'terminal_pair': False,
            'ladder': False,
            'clustered': False
        }
        if n_cys > 1:
            motifs = {
                'terminal_pair': n_cys == 2 and spacing_list[0] >= len(seq) * 0.6,
                'ladder': all(3 <= s <= 8 for s in spacing_list),
                'clustered': all(s <= 4 for s in spacing_list)
            }

        # Sequential pairing; stopping at n_cys - 1 leaves the last cysteine
        # out when the count is odd.
        pairs = []
        for i in range(0, n_cys - 1, 2):
            first, second = cys_positions[i], cys_positions[i + 1]
            pairs.append({
                "cys1": first,
                "cys2": second,
                "spacing": second - first,
                "sequence": seq[first:second + 1]
            })
        unpaired = [cys_positions[-1]] if n_cys % 2 != 0 else []

        # Evaluate scaffold potential based on cysteine patterns
        scaffold_evaluation = {
            "suitable_scaffold": n_cys >= 2 and (
                motifs.get('terminal_pair', False) or
                motifs.get('ladder', False)
            ),
            "preferred_spacing": all(2 <= s <= 20 for s in spacing_list) if spacing_list else False,
            "optimal_count": 2 <= n_cys <= 6,
            "well_distributed": (
                n_cys >= 2 and
                cys_positions[-1] - cys_positions[0] >= len(seq) * 0.3
            )
        }

        candidate_warnings = [
            "Odd number of cysteines" if n_cys % 2 != 0 else None,
            "Suboptimal cysteine count" if not scaffold_evaluation["optimal_count"] else None,
            "Poor cysteine distribution" if not scaffold_evaluation["well_distributed"] and n_cys >= 2 else None,
            "No cysteines found" if n_cys == 0 else None
        ]

        return {
            "count": n_cys,
            "positions": cys_positions,
            "spacing": spacing_list,
            "patterns": {
                "paired": n_cys % 2 == 0,
                "potential_pairs": pairs,
                "unpaired": unpaired,
                "motifs": motifs
            },
            "scaffold_evaluation": scaffold_evaluation,
            "warnings": [w for w in candidate_warnings if w is not None]
        }

    def find_glycosylation_sites(self) -> List[Dict]:
        """
        Identify potential N-glycosylation sites (N-X-S/T, X != P).

        Returns:
            One dict per site with the 0-based "position" of the asparagine
            and the three-residue "motif". Overlapping sequons (e.g. both
            sites in "NNSS") are all reported.
        """
        # The lookahead consumes only the N, so a sequon that starts inside
        # the previous one is still found; a plain 3-character match would
        # skip it because finditer() returns non-overlapping matches.
        pattern = re.compile(r'N(?=[^P][ST])')
        return [
            {
                "position": match.start(),
                "motif": self.sequence[match.start():match.start() + 3]
            }
            for match in pattern.finditer(self.sequence)
        ]

    def charge_at_ph(self, ph: float) -> float:
        """
        Calculate the net charge of the peptide at a given pH.

        Sums the fractional charges of both termini and of the K, R, H
        (positive) and D, E, C, Y (negative) side chains using
        ``self.pka_values``.
        """
        pka = self.pka_values
        aa_count = {aa: self.sequence.count(aa) for aa in 'KRHDEYC'}

        # Termini: one protonatable amine and one carboxyl group
        charge = 1.0 / (1.0 + 10.0**(ph - pka['N_term']))
        charge -= 1.0 / (1.0 + 10.0**(pka['C_term'] - ph))
        # Basic side chains: positive when protonated
        for aa in 'KRH':
            charge += aa_count[aa] / (1.0 + 10.0**(ph - pka[aa]))
        # Acidic side chains: negative when deprotonated
        for aa in 'DECY':
            charge -= aa_count[aa] / (1.0 + 10.0**(pka[aa] - ph))
        return charge

    def _estimate_pi(self) -> float:
        """
        Estimate the isoelectric point from the charge curve.

        Scans integer pH values 0-14 for a sign change in net charge,
        interpolates linearly, then refines with ten bisection steps in a
        +/-0.5 pH window around the interpolated estimate.

        NOTE(review): after bisection the value is snapped to fixed numbers
        (5-6 -> 6.8, 8-9 -> 9.2, within 1.0 of 7 -> 7.0), and the early exit
        inside the bisection loop bypasses that snapping. This is kept
        as-is for compatibility, but the reported pI is therefore not
        always the true zero-charge pH -- confirm this is intended.
        """
        # Start with a broad pH scan
        charges = [(ph, self.charge_at_ph(ph)) for ph in range(0, 15)]

        # Find adjacent points where charge changes sign
        for i in range(len(charges) - 1):
            if charges[i][1] * charges[i + 1][1] <= 0:
                ph1, charge1 = charges[i]
                ph2, charge2 = charges[i + 1]
                break
        else:
            # No sign change anywhere in the scanned range
            total_charge = sum(aa in 'KRHDECY' for aa in self.sequence)
            if total_charge == 0:
                return 7.0
            last_charge = charges[-1][1]
            return 2.0 if last_charge < 0 else 12.0

        # Interpolate initial estimate
        if abs(charge1 - charge2) < 0.0001:
            pi_estimate = (ph1 + ph2) / 2
        else:
            pi_estimate = ph1 + (0 - charge1) * (ph2 - ph1) / (charge2 - charge1)

        # Fine-tune with bisection; charge decreases as pH rises
        ph_min = max(0.0, pi_estimate - 0.5)
        ph_max = min(14.0, pi_estimate + 0.5)
        for _ in range(10):
            ph_mid = (ph_min + ph_max) / 2
            charge = self.charge_at_ph(ph_mid)
            if abs(charge) < 0.0001:
                return round(ph_mid, 2)
            elif charge > 0:
                ph_min = ph_mid
            else:
                ph_max = ph_mid

        final_pi = round((ph_min + ph_max) / 2, 2)

        # Range snapping (see NOTE in the docstring)
        if 5 <= final_pi <= 6:
            return 6.8
        elif 8 <= final_pi <= 9:
            return 9.2
        elif abs(final_pi - 7.0) < 1.0:
            return 7.0
        return final_pi

    def calculate_properties(self) -> Dict:
        """
        Calculate physicochemical properties of the sequence.

        Returns:
            Dict with:
            - "pI": estimated isoelectric point (see _estimate_pi)
            - "GRAVY": mean Kyte & Doolittle hydropathy
            - "molecular_weight": sum of amino-acid masses minus one water
              per peptide bond, in Da
            - "aromaticity": fraction of F/W/Y residues
            - "instability_index": always None (not implemented)
            GRAVY, molecular weight and aromaticity are 0.0 for an empty
            sequence.

        Raises:
            KeyError: if the sequence contains a character outside the 20
                standard amino acids.
        """
        seq = self.sequence
        n_residues = len(seq)

        if n_residues:
            gravy = sum(self._HYDROPATHY[aa] for aa in seq) / n_residues
            # n residues are joined by n - 1 peptide bonds, each releasing
            # one water molecule.
            mw = (
                sum(self._AA_WEIGHTS[aa] for aa in seq)
                - (n_residues - 1) * self._WATER_WEIGHT
            )
            aromaticity = sum(aa in 'FWY' for aa in seq) / n_residues
        else:
            gravy = 0.0
            mw = 0.0
            aromaticity = 0.0

        return {
            "pI": round(self._estimate_pi(), 2),
            "GRAVY": gravy,
            "molecular_weight": mw,
            "aromaticity": aromaticity,
            "instability_index": None  # Would need complex calculation
        }

    @staticmethod
    def calculate_similarity(seq1: str, seq2: str) -> float:
        """
        Calculate the fraction of identical positions between two sequences.

        Returns:
            matches / length for equal-length sequences; 0.0 when the
            lengths differ or when both sequences are empty (no positions
            to compare).
        """
        if len(seq1) != len(seq2) or not seq1:
            return 0.0
        matches = sum(a == b for a, b in zip(seq1, seq2))
        return matches / len(seq1)
## Removed duplicate old definition of validate_binder
def validate_binder(sequence: str, config: Dict = None) -> Dict:
    """
    Run every SequenceValidator check on one binder sequence.

    Args:
        sequence: The amino acid sequence to validate
        config: Optional configuration dictionary with validation parameters

    Returns:
        Dict with the keys "length", "disorder", "signal_peptide",
        "cysteines", "glycosylation", "properties", "complexity",
        "warnings" (list of human-readable messages) and "is_valid"
        (True only when no warning was raised).
    """
    checker = SequenceValidator(sequence, config)

    # Analyses whose results feed the warning list below
    complexity_report = checker.analyze_complexity()
    property_report = checker.calculate_properties()
    cysteine_report = checker.analyze_cysteines()

    flags = complexity_report['warnings']
    issues = []

    if flags['low_complexity']:
        issues.append("Low sequence complexity detected")
    if flags['high_aqp']:
        issues.append(f"High A/Q/P content ({complexity_report['aqp_percentage']}%)")
    if flags['has_homopolymers']:
        # Positions are reported 1-based in the message
        issues.extend(
            f"Homopolymer run: {run['amino_acid']}x{run['length']} at position {run['start']+1}"
            for run in complexity_report['homopolymer_runs']
        )
    if cysteine_report['count'] % 2 != 0:
        issues.append("Odd number of cysteines may affect folding")
    if len(cysteine_report['positions']) < 2:
        issues.append("Low cysteine content may reduce stability")

    report = {
        "length": len(sequence),
        "disorder": checker.predict_disorder(),
        "signal_peptide": checker.check_signal_peptide(),
        "cysteines": cysteine_report,
        "glycosylation": checker.find_glycosylation_sites(),
        "properties": property_report,
        "complexity": complexity_report,
        "warnings": issues,
        "is_valid": len(issues) == 0
    }
    return report
def validate_binder_set(json_file: str, config: Dict = None, output_file: str = None) -> Dict:
    """
    Validate a set of binders from a JSON file and optionally save results.

    Args:
        json_file: Path to a JSON file whose top-level object has a
            "personalized_binders" list; each entry must have a "sequence".
        config: Optional configuration dictionary with validation parameters
        output_file: Optional path to save validation results as JSON

    Returns:
        Dict with:
        - "validated_binders": each input entry plus a "validation" key
        - "similar_groups": lists of indices into "validated_binders";
          every member is more than 90% identical to the group's first
          member

    Raises:
        KeyError: if "personalized_binders" or an entry's "sequence" is
            missing.
    """
    # Read and write as UTF-8 explicitly so results do not depend on the
    # platform's default locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []
    for binder in data['personalized_binders']:
        validation = validate_binder(binder['sequence'], config)
        results.append({
            **binder,
            "validation": validation
        })

    # Group similar sequences. Each sequence joins at most one group, and
    # candidates are compared only against the group's first member.
    similar_groups = []
    used = set()
    for i, binder1 in enumerate(results):
        if i in used:
            continue
        group = [i]
        for j in range(i + 1, len(results)):
            if j in used:
                continue
            similarity = SequenceValidator.calculate_similarity(
                binder1['sequence'], results[j]['sequence'])
            if similarity > 0.9:
                group.append(j)
                used.add(j)
        if len(group) > 1:
            similar_groups.append(group)

    output = {
        "validated_binders": results,
        "similar_groups": similar_groups
    }

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=4)

    return output