healdette / modules /validate_sequences.py

Raiff1982

Upload 55 files

6d3b444 verified 4 months ago

23.7 kB

	"""
	Comprehensive validation module for antibody sequences.
	Performs computational checks for various sequence properties and potential issues.
	"""

	# Standard library imports
	import re
	import json
	import math
	from typing import Dict, List, Tuple

	class SequenceValidator:
	    """Run computational checks on a single amino-acid sequence.

	    The sequence is upper-cased on construction and every analysis method
	    reads ``self.sequence``. Configuration is currently limited to the
	    ``signal_peptide`` section; missing values fall back to
	    ``self.default_config``.
	    """

	    # pKa values used by charge_at_ph() for ionisable side chains and termini.
	    # NOTE(review): the original comment states these match BioPython's
	    # ProtParam implementation - verify against BioPython before relying on it.
	    pka_values = {
	        'K': 10.0,      # Lysine
	        'R': 12.0,      # Arginine
	        'H': 6.0,       # Histidine
	        'D': 4.0,       # Aspartic acid
	        'E': 4.4,       # Glutamic acid
	        'C': 8.5,       # Cysteine
	        'Y': 10.0,      # Tyrosine
	        'N_term': 8.0,  # N-terminus
	        'C_term': 3.1   # C-terminus
	    }

	    # Kyte & Doolittle hydropathy values, used for GRAVY.
	    _HYDROPATHY = {
	        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
	        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
	        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
	        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
	    }

	    # Masses of the *free* amino acids (Da); one water is lost per peptide bond.
	    _AA_WEIGHTS = {
	        'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
	        'Q': 146.2, 'E': 147.1, 'G': 75.1, 'H': 155.2, 'I': 131.2,
	        'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
	        'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1
	    }
	    _WATER_WEIGHT = 18.015

	    # N-X-S/T sequon with X != P. The look-ahead consumes only the 'N', so
	    # overlapping sequons (e.g. "NNSS") are all reported.
	    _SEQUON = re.compile(r'N(?=[^P][ST])')

	    def __init__(self, sequence: str, config: Dict = None):
	        """
	        Initialize sequence validator with optional configuration.

	        Args:
	            sequence: The amino acid sequence to validate (stored upper-cased)
	            config: Optional configuration dictionary with validation
	                parameters. It is copied, never modified in place.
	        """
	        self.sequence = sequence.upper()

	        # Default configuration values
	        self.default_config = {
	            "signal_peptide": {
	                "enabled": True,
	                "min_length": 15,
	                "max_length": 30,
	                "required": False,
	                "strip": False,
	                "confidence_threshold": 0.6,
	                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
	                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
	            }
	        }

	        # Merge provided config over the defaults on copies, so a dict shared
	        # between several validators (or owned by the caller) stays untouched.
	        merged = dict(config or {})
	        for key, default_values in self.default_config.items():
	            user_values = merged.get(key) or {}
	            merged[key] = {**default_values, **user_values}
	        self.config = merged

	    def _find_homopolymers(self, min_length: int = 4) -> List[Dict]:
	        """Return runs of at least ``min_length`` identical residues."""
	        runs = []
	        current_aa = None
	        current_start = 0
	        current_length = 0

	        for i, aa in enumerate(self.sequence):
	            if aa == current_aa:
	                current_length += 1
	                continue
	            # Residue changed: close the previous run before starting a new one.
	            if current_length >= min_length:
	                runs.append({
	                    "amino_acid": current_aa,
	                    "start": current_start,
	                    "length": current_length
	                })
	            current_aa = aa
	            current_start = i
	            current_length = 1

	        # The final run is not closed inside the loop.
	        if current_length >= min_length:
	            runs.append({
	                "amino_acid": current_aa,
	                "start": current_start,
	                "length": current_length
	            })

	        return runs

	    def _find_aqp_regions(self, window_size: int = 10, threshold: float = 0.4) -> List[Dict]:
	        """Return every window whose A/Q/P fraction is strictly above ``threshold``."""
	        problem_regions = []
	        for i in range(len(self.sequence) - window_size + 1):
	            window = self.sequence[i:i + window_size]
	            aqp_fraction = sum(aa in 'AQP' for aa in window) / window_size
	            if aqp_fraction > threshold:
	                problem_regions.append({
	                    "start": i,
	                    "sequence": window,
	                    "aqp_fraction": round(aqp_fraction, 2)
	                })
	        return problem_regions

	    def analyze_complexity(self) -> Dict:
	        """
	        Analyze sequence complexity focusing on issues that could affect binder stability and function:
	        - Homopolymer runs (4+ identical residues)
	        - A/Q/P-heavy regions (>40% in any 10-residue window)
	        - Overall amino acid diversity (Shannon entropy, in bits)

	        An empty sequence yields zero entropy and 0.0 A/Q/P percentage
	        instead of raising ZeroDivisionError.

	        Returns:
	            Dict containing complexity analysis results
	        """
	        total_aas = len(self.sequence)

	        # Overall amino acid frequencies
	        aa_counts = {}
	        for aa in self.sequence:
	            aa_counts[aa] = aa_counts.get(aa, 0) + 1

	        # Shannon entropy for sequence diversity (loop is empty for "")
	        entropy = 0.0
	        for count in aa_counts.values():
	            p = count / total_aas
	            entropy -= p * math.log2(p)

	        # Overall A/Q/P percentage
	        aqp_total = sum(aa_counts.get(aa, 0) for aa in 'AQP')
	        aqp_percentage = round(100 * aqp_total / total_aas, 1) if total_aas else 0.0

	        # Scan once and reuse the result for both the listing and the flag.
	        homopolymer_runs = self._find_homopolymers()

	        return {
	            "homopolymer_runs": homopolymer_runs,
	            "aqp_heavy_regions": self._find_aqp_regions(),
	            "sequence_entropy": round(entropy, 2),
	            "unique_aas": len(aa_counts),
	            "aqp_percentage": aqp_percentage,
	            "warnings": {
	                "low_complexity": entropy < 3.0,
	                "high_aqp": aqp_percentage > 35,
	                "has_homopolymers": bool(homopolymer_runs)
	            }
	        }

	    def predict_disorder(self) -> float:
	        """
	        Simple disorder prediction based on amino acid composition.

	        Returns:
	            Fraction of residues belonging to the disorder-prone set
	            (R, K, E, P, N, D, Q, S, G); 0.0 for an empty sequence.
	        """
	        if not self.sequence:
	            return 0.0
	        disorder_prone = set('RKEPNDQSG')
	        disorder_count = sum(1 for aa in self.sequence if aa in disorder_prone)
	        return disorder_count / len(self.sequence)

	    def check_signal_peptide(self) -> Dict:
	        """
	        Heuristic signal peptide detection for binder peptides/scaffolds.

	        Features analyzed:
	        - N-region: fraction of basic amino acids (K/R)
	        - H-region: fraction of hydrophobic residues (A/I/L/M/F/W/V)
	        - C-region: (-3, -1) rule with small neutral amino acids (A/G/S/T)
	          and no proline in the last three positions
	        - Length constraints from the ``signal_peptide`` configuration

	        Returns:
	            Dict containing detailed signal peptide analysis. When detection
	            is disabled or the sequence is shorter than ``min_length`` only
	            ``enabled``, ``has_signal``, ``confidence`` and ``details`` are
	            present.
	        """
	        config = self.config['signal_peptide']

	        if not config['enabled']:
	            return {
	                "enabled": False,
	                "has_signal": False,
	                "confidence": 0.0,
	                "details": "Signal peptide detection disabled in configuration"
	            }

	        if len(self.sequence) < config['min_length']:
	            return {
	                "enabled": True,
	                "has_signal": False,
	                "confidence": 1.0,
	                "details": f"Sequence too short (min {config['min_length']} residues required)"
	            }

	        # Dynamic region sizing based on sequence length
	        n_region_length = min(6, len(self.sequence) // 5)
	        h_region_length = min(12, len(self.sequence) // 3)
	        c_region_length = 5

	        total_sp_length = min(
	            n_region_length + h_region_length + c_region_length,
	            config['max_length']
	        )

	        # Extract regions
	        h_region_end = n_region_length + h_region_length
	        n_region = self.sequence[:n_region_length]
	        h_region = self.sequence[n_region_length:h_region_end]
	        c_region = self.sequence[h_region_end:total_sp_length]

	        # N-region (positive charge). A configured min_length below 5 can
	        # leave this region empty; score it 0.0 rather than dividing by zero.
	        n_region_basic = sum(aa in 'KR' for aa in n_region)
	        n_region_score = n_region_basic / len(n_region) if n_region else 0.0
	        n_region_valid = n_region_score >= config['n_region_basic_threshold']

	        # H-region (hydrophobic core), guarded the same way.
	        hydrophobic = set('AILMFWV')
	        h_region_hydrophobic = sum(aa in hydrophobic for aa in h_region)
	        h_region_score = h_region_hydrophobic / len(h_region) if h_region else 0.0
	        h_region_valid = h_region_score >= config['h_region_hydrophobic_threshold']

	        # C-region (-3, -1 rule)
	        c_region_valid = False
	        if len(c_region) >= 3:
	            small_neutral = set('AGST')
	            c_region_pattern = (
	                c_region[-3] in small_neutral and
	                c_region[-1] in small_neutral
	            )
	            # Check for proline disruption
	            no_proline_disruption = 'P' not in c_region[-3:]
	            c_region_valid = c_region_pattern and no_proline_disruption

	        # Overall confidence: mean of the three feature scores, where a
	        # feature that fails its threshold contributes 0.
	        feature_scores = [
	            n_region_score if n_region_valid else 0,
	            h_region_score if h_region_valid else 0,
	            1.0 if c_region_valid else 0
	        ]
	        confidence = sum(feature_scores) / len(feature_scores)

	        has_signal = confidence >= config['confidence_threshold']

	        details = {
	            "n_region": {
	                "sequence": n_region,
	                "basic_fraction": round(n_region_score, 2),
	                "valid": n_region_valid
	            },
	            "h_region": {
	                "sequence": h_region,
	                "hydrophobic_fraction": round(h_region_score, 2),
	                "valid": h_region_valid
	            },
	            "c_region": {
	                "sequence": c_region,
	                "valid": c_region_valid
	            }
	        }

	        return {
	            "enabled": True,
	            "has_signal": has_signal,
	            "confidence": round(confidence, 2),
	            "details": details,
	            "signal_sequence": self.sequence[:total_sp_length] if has_signal else None,
	            # The mature sequence is only trimmed when stripping is requested.
	            "mature_sequence": self.sequence[total_sp_length:] if has_signal and config['strip'] else self.sequence
	        }

	    def analyze_cysteines(self) -> Dict:
	        """
	        Analyze cysteine patterns and potential disulfide bonds in binder peptides/scaffolds.

	        Reports:
	        - Cysteine count and 0-based positions
	        - Spacing between consecutive cysteines
	        - Sequential pairing (1st with 2nd, 3rd with 4th, ...); with an odd
	          count the last cysteine is reported as unpaired
	        - Simple scaffold motif flags and a scaffold suitability evaluation

	        Returns:
	            Dict containing detailed cysteine analysis results
	        """
	        cys_positions = [i for i, aa in enumerate(self.sequence) if aa == 'C']
	        n_cys = len(cys_positions)

	        spacing_list = []
	        motifs = {
	            'terminal_pair': False,
	            'ladder': False,
	            'clustered': False
	        }

	        if n_cys > 1:
	            # Spacing between consecutive cysteines
	            spacing_list = [
	                later - earlier
	                for earlier, later in zip(cys_positions, cys_positions[1:])
	            ]

	            # Look for common scaffold motifs
	            motifs = {
	                'terminal_pair': n_cys == 2 and spacing_list[0] >= len(self.sequence) * 0.6,
	                'ladder': all(3 <= s <= 8 for s in spacing_list),
	                'clustered': all(s <= 4 for s in spacing_list)
	            }

	        # Sequential pairing. Stopping at n_cys - 1 covers both parities: an
	        # even count pairs everything, an odd count leaves the last one out.
	        pairs = []
	        for i in range(0, n_cys - 1, 2):
	            first, second = cys_positions[i], cys_positions[i + 1]
	            pairs.append({
	                "cys1": first,
	                "cys2": second,
	                "spacing": second - first,
	                "sequence": self.sequence[first:second + 1]
	            })
	        unpaired = [cys_positions[-1]] if n_cys % 2 != 0 else []

	        # Evaluate scaffold potential based on cysteine patterns
	        scaffold_evaluation = {
	            "suitable_scaffold": n_cys >= 2 and (
	                motifs.get('terminal_pair', False) or
	                motifs.get('ladder', False)
	            ),
	            "preferred_spacing": all(2 <= s <= 20 for s in spacing_list) if spacing_list else False,
	            "optimal_count": 2 <= n_cys <= 6,
	            "well_distributed": (
	                n_cys >= 2 and
	                cys_positions[-1] - cys_positions[0] >= len(self.sequence) * 0.3
	            )
	        }

	        warnings = []
	        if n_cys % 2 != 0:
	            warnings.append("Odd number of cysteines")
	        if not scaffold_evaluation["optimal_count"]:
	            warnings.append("Suboptimal cysteine count")
	        if not scaffold_evaluation["well_distributed"] and n_cys >= 2:
	            warnings.append("Poor cysteine distribution")
	        if n_cys == 0:
	            warnings.append("No cysteines found")

	        return {
	            "count": n_cys,
	            "positions": cys_positions,
	            "spacing": spacing_list,
	            "patterns": {
	                "paired": n_cys % 2 == 0,
	                "potential_pairs": pairs,
	                "unpaired": unpaired,
	                "motifs": motifs
	            },
	            "scaffold_evaluation": scaffold_evaluation,
	            "warnings": warnings
	        }

	    def find_glycosylation_sites(self) -> List[Dict]:
	        """
	        Identify potential N-glycosylation sites (N-X-S/T, X != P).

	        Overlapping sequons are all reported: "NNSS" yields sites at
	        positions 0 and 1.

	        Returns:
	            List of dicts with the 0-based ``position`` of the asparagine
	            and the three-residue ``motif``.
	        """
	        sites = []
	        for match in self._SEQUON.finditer(self.sequence):
	            start = match.start()
	            sites.append({
	                "position": start,
	                "motif": self.sequence[start:start + 3]
	            })
	        return sites

	    def charge_at_ph(self, ph: float) -> float:
	        """
	        Calculate the net charge of the peptide at a given pH.

	        Each ionisable group contributes count / (1 + 10**(+/-(pH - pKa)))
	        using ``pka_values``: the N-terminus, K, R and H contribute positive
	        charge; the C-terminus, D, E, C and Y contribute negative charge.

	        Args:
	            ph: pH at which to evaluate the charge

	        Returns:
	            Net charge as a float
	        """
	        pka = self.pka_values
	        count = self.sequence.count

	        # (sign, number of groups, pKa), kept in the original summation order.
	        groups = [
	            (+1, 1.0, pka['N_term']),
	            (-1, 1.0, pka['C_term']),
	            (+1, count('K'), pka['K']),
	            (+1, count('R'), pka['R']),
	            (+1, count('H'), pka['H']),
	            (-1, count('D'), pka['D']),
	            (-1, count('E'), pka['E']),
	            (-1, count('C'), pka['C']),
	            (-1, count('Y'), pka['Y']),
	        ]

	        charge = 0
	        for sign, n_groups, group_pka in groups:
	            if sign > 0:
	                charge += n_groups / (1.0 + 10.0**(ph - group_pka))
	            else:
	                charge -= n_groups / (1.0 + 10.0**(group_pka - ph))
	        return charge

	    def _isoelectric_point(self) -> float:
	        """
	        Estimate the isoelectric point (pH of zero net charge).

	        Steps: scan integer pH 0..14 for a sign change, linearly interpolate
	        an estimate, then bisect within +/-0.5 pH of that estimate for at
	        most 10 iterations.

	        NOTE(review): results that do not converge early are snapped to
	        "preferred" values (5-6 -> 6.8, 8-9 -> 9.2, within 1 of 7 -> 7.0),
	        while an early-converged result is returned unsnapped. This is kept
	        exactly as in the original; confirm whether it is intended.
	        NOTE(review): the +/-0.5 window around the interpolated estimate is
	        not guaranteed to bracket the root found by the integer scan.
	        """
	        # Start with a broad pH scan
	        charges = [(ph, self.charge_at_ph(ph)) for ph in range(0, 15)]

	        # Find adjacent points where charge changes sign
	        for (ph1, charge1), (ph2, charge2) in zip(charges, charges[1:]):
	            if charge1 * charge2 <= 0:
	                break
	        else:
	            # No sign change found across the scanned range
	            ionisable = sum(aa in 'KRHDECY' for aa in self.sequence)
	            if ionisable == 0:
	                return 7.0
	            return 2.0 if charges[-1][1] < 0 else 12.0

	        # Interpolate initial estimate
	        if abs(charge1 - charge2) < 0.0001:
	            pi_estimate = (ph1 + ph2) / 2
	        else:
	            pi_estimate = ph1 + (0 - charge1) * (ph2 - ph1) / (charge2 - charge1)

	        # Fine-tune with binary search
	        ph_min = max(0.0, pi_estimate - 0.5)
	        ph_max = min(14.0, pi_estimate + 0.5)

	        for _ in range(10):
	            ph_mid = (ph_min + ph_max) / 2
	            charge = self.charge_at_ph(ph_mid)

	            if abs(charge) < 0.0001:
	                return round(ph_mid, 2)
	            if charge > 0:
	                ph_min = ph_mid
	            else:
	                ph_max = ph_mid

	        final_pi = round((ph_min + ph_max) / 2, 2)

	        # Snap to the preferred ranges (see NOTE above)
	        if 5 <= final_pi <= 6:
	            return 6.8
	        if 8 <= final_pi <= 9:
	            return 9.2
	        if abs(final_pi - 7.0) < 1.0:
	            return 7.0

	        return final_pi

	    def calculate_properties(self) -> Dict:
	        """
	        Calculate various physicochemical properties.

	        Returns:
	            Dict with:
	            - ``pI``: isoelectric point estimate (see ``_isoelectric_point``)
	            - ``GRAVY``: mean Kyte & Doolittle hydropathy
	            - ``molecular_weight``: sum of free amino-acid masses minus one
	              water per peptide bond
	            - ``aromaticity``: fraction of F/W/Y residues
	            - ``instability_index``: always None (not implemented)
	            GRAVY, molecular weight and aromaticity are 0.0 for an empty
	            sequence.

	        Raises:
	            KeyError: if the sequence contains a residue outside the 20
	                standard amino acids.
	        """
	        length = len(self.sequence)

	        if length:
	            gravy = sum(self._HYDROPATHY[aa] for aa in self.sequence) / length
	            # Each of the (length - 1) peptide bonds releases one water.
	            mw = (
	                sum(self._AA_WEIGHTS[aa] for aa in self.sequence)
	                - self._WATER_WEIGHT * (length - 1)
	            )
	            aromaticity = sum(aa in 'FWY' for aa in self.sequence) / length
	        else:
	            gravy = 0.0
	            mw = 0.0
	            aromaticity = 0.0

	        # Run the pI search once and reuse the value.
	        pi = self._isoelectric_point()

	        return {
	            "pI": round(pi, 2),
	            "GRAVY": gravy,
	            "molecular_weight": mw,
	            "aromaticity": aromaticity,
	            "instability_index": None  # Would need complex calculation
	        }

	    @staticmethod
	    def calculate_similarity(seq1: str, seq2: str) -> float:
	        """
	        Calculate position-wise identity between two sequences.

	        Args:
	            seq1: First sequence
	            seq2: Second sequence

	        Returns:
	            Fraction of positions holding the same character. Sequences of
	            different length, and two empty sequences, score 0.0.
	        """
	        if len(seq1) != len(seq2) or not seq1:
	            return 0.0
	        matches = sum(a == b for a, b in zip(seq1, seq2))
	        return matches / len(seq1)

	## Removed duplicate old definition of validate_binder
	def validate_binder(sequence: str, config: Dict = None) -> Dict:
	    """
	    Run every SequenceValidator check on one binder sequence and collect the results.

	    Args:
	        sequence: The amino acid sequence to validate
	        config: Optional configuration dictionary with validation parameters

	    The report covers:
	    - Sequence length
	    - Disorder prediction
	    - Signal peptide presence (configurable)
	    - Cysteine content and spacing
	    - Glycosylation sites
	    - Physicochemical properties
	    - Sequence complexity and composition

	    Returns:
	        Dict containing the individual results, the aggregated warning
	        messages, and ``is_valid`` (True only when no warning was raised)
	    """
	    validator = SequenceValidator(sequence, config)

	    # Analyses whose results feed the warning list
	    complexity = validator.analyze_complexity()
	    properties = validator.calculate_properties()
	    cysteines = validator.analyze_cysteines()

	    flags = complexity['warnings']
	    warnings = []

	    if flags['low_complexity']:
	        warnings.append("Low sequence complexity detected")

	    if flags['high_aqp']:
	        warnings.append(f"High A/Q/P content ({complexity['aqp_percentage']}%)")

	    if flags['has_homopolymers']:
	        # One message per run; positions are reported 1-based.
	        warnings.extend(
	            f"Homopolymer run: {run['amino_acid']}x{run['length']} at position {run['start']+1}"
	            for run in complexity['homopolymer_runs']
	        )

	    if cysteines['count'] % 2:
	        warnings.append("Odd number of cysteines may affect folding")

	    if len(cysteines['positions']) < 2:
	        warnings.append("Low cysteine content may reduce stability")

	    report = {
	        "length": len(sequence),
	        "disorder": validator.predict_disorder(),
	        "signal_peptide": validator.check_signal_peptide(),
	        "cysteines": cysteines,
	        "glycosylation": validator.find_glycosylation_sites(),
	        "properties": properties,
	        "complexity": complexity,
	        "warnings": warnings,
	        "is_valid": not warnings
	    }
	    return report

	def validate_binder_set(json_file: str, config: Dict = None, output_file: str = None):
	    """
	    Validate a set of binders from a JSON file and optionally save results.

	    The input must be a JSON object with a ``personalized_binders`` list
	    whose entries each carry a ``sequence`` key. Files are read and written
	    as UTF-8 so behaviour does not depend on the platform default encoding.

	    Args:
	        json_file: Path to JSON file containing binders to validate
	        config: Optional configuration dictionary with validation parameters
	        output_file: Optional path to save validation results

	    Returns:
	        Dict with ``validated_binders`` (each input entry plus a
	        ``validation`` report) and ``similar_groups`` (lists of indices
	        into ``validated_binders``)
	    """
	    with open(json_file, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    results = [
	        {**binder, "validation": validate_binder(binder['sequence'], config)}
	        for binder in data['personalized_binders']
	    ]

	    # Group similar sequences. Every later, not-yet-grouped sequence is
	    # compared with the first member of the group only, so grouping is
	    # anchored on that first member rather than transitive.
	    similar_groups = []
	    used = set()

	    for i, binder1 in enumerate(results):
	        if i in used:
	            continue

	        group = [i]
	        for j in range(i + 1, len(results)):
	            if j in used:
	                continue
	            similarity = SequenceValidator.calculate_similarity(
	                binder1['sequence'], results[j]['sequence'])
	            if similarity > 0.9:
	                group.append(j)
	                used.add(j)

	        # Singletons are not reported as groups.
	        if len(group) > 1:
	            similar_groups.append(group)

	    output = {
	        "validated_binders": results,
	        "similar_groups": similar_groups
	    }

	    if output_file:
	        with open(output_file, 'w', encoding='utf-8') as f:
	            json.dump(output, f, indent=4)

	    return output