| """ |
| Reproducibility harness for Healdette pipeline. |
| Runs the full pipeline with validation and generates all artifacts. |
| """ |
|
|
| import argparse |
| import json |
| import sys |
| import os |
| import hashlib |
| from datetime import datetime |
| import numpy as np |
| import torch |
| import pandas as pd |
|
|
| from modules.validate_sequences import validate_binder_set |
|
|
def set_random_seeds(seed=42):
    """Seed the random number generators used for reproducible runs.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU generator plus every visible CUDA device).

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Local import: the file's import header does not bring in ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed all GPUs rather than only the current device so multi-GPU
        # runs are covered as well.
        torch.cuda.manual_seed_all(seed)
|
|
def calculate_sha256(filepath):
    """Return the hexadecimal SHA-256 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte chunks, so
    its full contents are never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()
|
|
def validate_criteria(results, criteria):
    """Check every validated binder against the pre-registered criteria.

    Args:
        results: Mapping with a ``'validated_binders'`` list; each binder
            carries a ``'sequence'`` and a ``'validation'`` mapping.
        criteria: Mapping providing ``'disorder_threshold'``,
            ``'signal_peptide'``, ``'cys_pairs'`` and ``'gravy_range'``.

    Returns:
        A list of human-readable failure messages, in binder order; empty
        when every binder satisfies all criteria.
    """
    problems = []

    for entry in results['validated_binders']:
        checks = entry['validation']

        def label(binder=entry):
            # Built lazily: the sequence is only read when a failure is
            # actually reported.
            return f"Sequence {binder['sequence'][:20]}..."

        # Disorder must not exceed the configured threshold.
        if checks['disorder'] > criteria['disorder_threshold']:
            problems.append(
                f"{label()} has high disorder: {checks['disorder']:.3f}"
            )

        # Signal peptides are only rejected when the criteria disallow them.
        if criteria['signal_peptide'] == 'disallow':
            if checks['signal_peptide']['has_signal']:
                problems.append(f"{label()} has signal peptide")

        # Paired cysteines are only demanded when marked as required.
        if criteria['cys_pairs'] == 'required':
            if not checks['cysteines']['patterns']['paired']:
                problems.append(f"{label()} lacks paired cysteines")

        # GRAVY must fall inside the inclusive [low, high] window.
        hydropathy = checks['properties']['GRAVY']
        bounds = criteria['gravy_range']
        inside = bounds[0] <= hydropathy <= bounds[1]
        if not inside:
            problems.append(
                f"{label()} has GRAVY {hydropathy:.3f} outside range"
            )

    return problems
|
|
def generate_triage_table(results):
    """Build a per-binder triage table sorted by personalization score.

    Args:
        results: Mapping with a ``'validated_binders'`` list; each binder
            provides ``'sequence'``, ``'personalization_score'`` and a
            ``'validation'`` mapping.

    Returns:
        A ``pandas.DataFrame`` with one row per binder, ordered by
        ``personalization_score`` descending. When there are no binders an
        empty frame with the same columns is returned.
    """
    columns = [
        'sequence_length',
        'personalization_score',
        'disorder',
        'cys_pairs',
        'glyco_sites',
        'gravy',
        'pI',
    ]
    rows = [
        {
            'sequence_length': len(binder['sequence']),
            'personalization_score': binder['personalization_score'],
            'disorder': binder['validation']['disorder'],
            # Pair count is derived from the raw cysteine count.
            'cys_pairs': binder['validation']['cysteines']['count'] // 2,
            'glyco_sites': len(binder['validation']['glycosylation']),
            'gravy': binder['validation']['properties']['GRAVY'],
            'pI': binder['validation']['properties']['pI'],
        }
        for binder in results['validated_binders']
    ]

    # Explicit columns keep the sort key present even with zero rows;
    # without them an empty input raises KeyError in sort_values.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values('personalization_score', ascending=False)
|
|
def main(args):
    """Run validation, write the triage table and checksums, then exit.

    Reads ``run_manifest.json`` from the current working directory,
    validates the binders in ``args.input_json``, writes
    ``triage_table.csv`` into ``args.output_dir`` and records SHA-256
    checksums of the run artifacts in ``checksums.sha256``.

    Args:
        args: Parsed command-line namespace providing ``input_json``,
            ``output_dir`` and ``deterministic``.

    Raises:
        SystemExit: Always. Exit code 1 when any binder violates the
            manifest's ``validation_criteria``, otherwise 0.
    """
    with open('run_manifest.json', 'r', encoding='utf-8') as f:
        manifest = json.load(f)

    if args.deterministic:
        set_random_seeds()

    results = validate_binder_set(args.input_json)

    # Honour --output-dir (previously the CSV always went to 'output/') and
    # make sure the directory exists before writing into it.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)
    triage_path = os.path.join(args.output_dir, 'triage_table.csv')
    triage_table = generate_triage_table(results)
    triage_table.to_csv(triage_path)

    failures = validate_criteria(results, manifest['validation_criteria'])

    # NOTE(review): the plot is not produced by this script; presumably an
    # earlier pipeline step writes it to this fixed location - confirm.
    artifacts = [args.input_json, triage_path, 'output/sequence_analysis.png']

    checksums = {}
    for filepath in artifacts:
        if not os.path.isfile(filepath):
            # A missing artifact must not abort the run before validation
            # failures are reported; flag it and carry on.
            print(f"Warning: artifact not found, checksum skipped: {filepath}",
                  file=sys.stderr)
            continue
        checksums[os.path.basename(filepath)] = calculate_sha256(filepath)

    with open('checksums.sha256', 'w') as f:
        for filename, checksum in checksums.items():
            f.write(f"{checksum} {filename}\n")

    if failures:
        print("\nValidation failures:")
        for failure in failures:
            print(f"- {failure}")
        sys.exit(1)

    print("\nValidation successful!")
    print(f"Results saved to {args.output_dir}")
    sys.exit(0)
|
|
if __name__ == "__main__":
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    cli = argparse.ArgumentParser(
        description='Run Healdette pipeline with validation',
    )
    option_specs = (
        ('--input-json', {
            'default': 'output/codette_antibody_designs_20250912_150658.json',
            'help': 'Input JSON file with antibody designs',
        }),
        ('--output-dir', {
            'default': 'output',
            'help': 'Output directory for results',
        }),
        ('--deterministic', {
            'action': 'store_true',
            'help': 'Run in deterministic mode with fixed seeds',
        }),
    )
    # Register options in declaration order so the help output is unchanged.
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    main(cli.parse_args())