File size: 9,895 Bytes
6d3b444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""
Unit tests for sequence validation module, including pI calculation and complexity analysis.
"""

import unittest
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import sys
import os

# Add the parent directory to the path so we can import our modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from modules.validate_sequences import SequenceValidator, validate_binder

class TestSequenceValidation(unittest.TestCase):
    """Unit tests for ``SequenceValidator`` and ``validate_binder``.

    Covers isoelectric point (pI) and net-charge calculation, sequence
    complexity warnings, cysteine pattern analysis and the end-to-end binder
    validation entry point.  Scenarios inside a test are wrapped in
    ``subTest`` so one failing scenario does not hide the others.
    """

    def setUp(self):
        """Build the named fixture sequences shared by all tests."""
        # Test sequences for different validation aspects
        self.test_sequences = {
            'acidic': 'DDDEEEDDDEEE',  # Should have low pI
            'basic': 'KKRRKKKRRKRK',   # Should have high pI
            'neutral': 'GGGGGGGGGGGG', # Should be near neutral
            'mixed': 'KDKEFGYWAPTS',   # Mix of amino acids
            'real_peptide': 'MKKSFWLVLLVALNLWIKANA',  # Realistic signal peptide
            # Sequences for complexity testing
            'homopolymer': 'MKAAAAATWLVLLVALNLWIKANA',  # Has AAAAA run
            'aqp_heavy': 'MAPQAPQAPQAPQAPQAPQTWLVL',    # High A/Q/P content
            'low_complexity': 'AQPAQPAQPAQPAQPAQPAQP',   # Very repetitive
            'good_binder': 'MKKSFWLVLLCALNLWIKANACR',    # Well-balanced sequence
            # Sequences for cysteine pattern testing
            'terminal_pair': 'MKKCFWLVLLVALNLWIKANACT',   # Terminal cysteine pair
            'ladder_motif': 'MKCDEFCGHICKLMCNOPQCR',     # Evenly spaced cysteines
            'odd_cysteines': 'MKCDEFCGHICKLMNOPQRS',     # Odd number of cysteines
            'optimal_scaffold': 'MKCDEFGHICKLMNOPQCRSTC' # Good scaffold pattern (4 cysteines)
        }

    def test_pi_calculation_range(self):
        """
        Test pI calculation for Codette binder requirements:
        - Acidic sequences (pI < 5): Important for stability in physiological conditions
        - Neutral sequences (5 <= pI <= 8): Broader neutral range used for Codette binders
        - Basic sequences (pI > 9): Important for target binding
        """
        # Each entry: fixture key and the predicate its pI must satisfy.
        test_cases = [
            ('acidic', lambda value: value < 5),
            ('neutral', lambda value: 5 <= value <= 8),
            ('basic', lambda value: value > 9),
        ]

        for category, in_expected_range in test_cases:
            seq = self.test_sequences[category]
            # subTest reports every failing category, not just the first one.
            with self.subTest(category=category, sequence=seq):
                validator = SequenceValidator(seq)
                pi = validator.calculate_properties()['pI']

                self.assertTrue(
                    in_expected_range(pi),
                    f"pI {pi} for {category} sequence {seq} is outside expected range"
                )

    def test_charge_ph_relationships(self):
        """Test specific charge/pH relationships required for Codette binders"""
        # At pH 7.4 (physiological), acidic sequences should have significant negative charge
        with self.subTest(category='acidic'):
            validator = SequenceValidator(self.test_sequences['acidic'])
            charge_phys = validator.charge_at_ph(7.4)
            self.assertLess(
                charge_phys,
                -5.0,
                f"Acidic sequence charge at pH 7.4 ({charge_phys}) not negative enough"
            )

        # Basic sequence at physiological pH
        with self.subTest(category='basic'):
            validator = SequenceValidator(self.test_sequences['basic'])
            charge_phys = validator.charge_at_ph(7.4)
            self.assertGreater(
                charge_phys,
                5.0,
                f"Basic sequence charge at pH 7.4 ({charge_phys}) not positive enough"
            )

    def test_sequence_complexity(self):
        """Test sequence complexity analysis"""
        # Test homopolymer detection
        with self.subTest(case='homopolymer'):
            validator = SequenceValidator(self.test_sequences['homopolymer'])
            complexity = validator.analyze_complexity()
            self.assertTrue(
                any(run['amino_acid'] == 'A' and run['length'] >= 5
                    for run in complexity['homopolymer_runs']),
                "Failed to detect AAAAA homopolymer run"
            )

        # Test A/Q/P-heavy regions
        with self.subTest(case='aqp_heavy'):
            validator = SequenceValidator(self.test_sequences['aqp_heavy'])
            complexity = validator.analyze_complexity()
            self.assertTrue(
                complexity['warnings']['high_aqp'],
                "Failed to detect high A/Q/P content"
            )
            self.assertGreater(
                len(complexity['aqp_heavy_regions']),
                0,
                "Failed to identify A/Q/P-heavy regions"
            )

        # Test sequence entropy
        with self.subTest(case='low_complexity'):
            validator = SequenceValidator(self.test_sequences['low_complexity'])
            complexity = validator.analyze_complexity()
            self.assertTrue(
                complexity['warnings']['low_complexity'],
                "Failed to detect low complexity sequence"
            )
            self.assertLess(
                complexity['sequence_entropy'],
                3.0,
                "Low complexity sequence has unexpectedly high entropy"
            )

        # Test well-balanced sequence
        with self.subTest(case='good_binder'):
            validator = SequenceValidator(self.test_sequences['good_binder'])
            complexity = validator.analyze_complexity()
            self.assertFalse(
                any(complexity['warnings'].values()),
                "Good binder sequence incorrectly flagged with warnings"
            )

    def test_cysteine_analysis(self):
        """Test enhanced cysteine pattern analysis for binder scaffolds"""
        # Test terminal pair pattern
        with self.subTest(case='terminal_pair'):
            validator = SequenceValidator(self.test_sequences['terminal_pair'])
            analysis = validator.analyze_cysteines()
            self.assertTrue(
                analysis['patterns']['motifs']['terminal_pair'],
                "Failed to detect terminal cysteine pair pattern"
            )
            self.assertTrue(
                analysis['scaffold_evaluation']['suitable_scaffold'],
                "Terminal pair pattern not recognized as suitable scaffold"
            )

        # Test ladder motif
        with self.subTest(case='ladder_motif'):
            validator = SequenceValidator(self.test_sequences['ladder_motif'])
            analysis = validator.analyze_cysteines()
            self.assertTrue(
                analysis['patterns']['motifs']['ladder'],
                "Failed to detect ladder-like cysteine pattern"
            )
            self.assertTrue(
                analysis['scaffold_evaluation']['suitable_scaffold'],
                "Ladder pattern not recognized as suitable scaffold"
            )

        # Test odd number of cysteines
        with self.subTest(case='odd_cysteines'):
            validator = SequenceValidator(self.test_sequences['odd_cysteines'])
            analysis = validator.analyze_cysteines()
            self.assertFalse(
                analysis['patterns']['paired'],
                "Odd number of cysteines incorrectly marked as paired"
            )
            self.assertTrue(
                any("Odd number of cysteines" in warning
                    for warning in analysis['warnings'] if warning),
                "No warning for odd number of cysteines"
            )

        # Test optimal scaffold
        with self.subTest(case='optimal_scaffold'):
            test_seq = self.test_sequences['optimal_scaffold']
            validator = SequenceValidator(test_seq)
            analysis = validator.analyze_cysteines()

            # Diagnostics are attached to the failure messages rather than
            # printed, so passing runs stay quiet.
            cys_positions = [i for i, aa in enumerate(test_seq) if aa == 'C']
            context = (
                f" (sequence: {test_seq}, cysteine count: {len(cys_positions)}, "
                f"cysteine positions: {cys_positions}, analysis: {analysis})"
            )

            self.assertTrue(
                analysis['scaffold_evaluation']['optimal_count'],
                "Optimal cysteine count not recognized" + context
            )
            self.assertTrue(
                analysis['scaffold_evaluation']['well_distributed'],
                "Well-distributed cysteines not recognized" + context
            )
            self.assertTrue(
                analysis['scaffold_evaluation']['suitable_scaffold'],
                "Optimal scaffold pattern not recognized" + context
            )
            # Falsy entries in the warnings list are ignored, as in the
            # odd-cysteine check above.
            active_warnings = [w for w in analysis['warnings'] if w]
            self.assertEqual(
                len(active_warnings),
                0,
                f"Unexpected warnings for optimal scaffold: {active_warnings}"
            )

    def test_comprehensive_validation(self):
        """Test the complete binder validation process"""
        # Test problematic sequence
        with self.subTest(case='bad_sequence'):
            bad_sequence = 'AAAAAQQQQPPPPPAAAQQP'
            result = validate_binder(bad_sequence)

            self.assertIn('warnings', result)
            self.assertGreater(len(result['warnings']), 0)
            self.assertFalse(result['is_valid'])

            # Warnings should mention specific issues
            warning_text = ' '.join(result['warnings']).lower()
            self.assertTrue(
                any(term in warning_text for term in ['homopolymer', 'low complexity', 'high a/q/p']),
                "Warnings don't specify sequence complexity issues"
            )

        # Test good sequence
        with self.subTest(case='good_sequence'):
            good_sequence = self.test_sequences['good_binder']
            result = validate_binder(good_sequence)

            self.assertIn('warnings', result)
            self.assertEqual(len(result['warnings']), 0)
            self.assertTrue(result['is_valid'])

    def test_charge_calculation(self):
        """Test charge calculation at specific pH values."""
        # Each entry: fixture key, pH, expected net charge.
        test_cases = [
            ('acidic', 7.0, -12),  # Acidic sequence at neutral pH
            ('basic', 7.0, 12),    # Basic sequence at neutral pH
            ('neutral', 7.0, 0),   # Neutral sequence
        ]

        for category, ph, expected_charge in test_cases:
            seq = self.test_sequences[category]
            with self.subTest(category=category, sequence=seq, ph=ph):
                validator = SequenceValidator(seq)
                charge = validator.charge_at_ph(ph)
                # places=0 rounds the difference to a whole unit, so small
                # fractional deviations from the integer charge are tolerated.
                self.assertAlmostEqual(
                    charge,
                    expected_charge,
                    places=0,
                    msg=f"Charge calculation incorrect for sequence {seq} at pH {ph}"
                )

# Run the test suite via unittest's command-line runner when this file is
# executed directly as a script (not when it is imported).
if __name__ == '__main__':
    unittest.main()