Upload 55 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- .gitignore +41 -0
- CITATION.cff +22 -0
- DATA_ETHICS.md +26 -0
- Download codette_antibody_pipeline.json +60 -0
- LICENSE +21 -0
- LICENSES.md +22 -0
- README.md +0 -38
- RESULTS.md +45 -0
- binders.pdf +0 -0
- codette_antibody_pipeline_final_github.zip +3 -0
- environment.yaml +23 -0
- expirter.pdf +0 -0
- exporter 2.py +35 -0
- fuse_perspectives 2.py +35 -0
- fusedop.pdf +0 -0
- generate_binders 2.py +38 -0
- generate_triage_report.py +71 -0
- healdette_codette_upload.zip +3 -0
- main.pdf +0 -0
- main.py +42 -0
- modules/__init__.py +0 -0
- modules/__pycache__/__init__.cpython-313.pyc +0 -0
- modules/__pycache__/exporter.cpython-313.pyc +0 -0
- modules/__pycache__/extract_signature.cpython-313.pyc +0 -0
- modules/__pycache__/fuse_perspectives.cpython-313.pyc +0 -0
- modules/__pycache__/generate_binders.cpython-313.pyc +0 -0
- modules/__pycache__/personalize_binders.cpython-313.pyc +0 -0
- modules/__pycache__/run_simulations.cpython-313.pyc +0 -0
- modules/__pycache__/validate_ethics.cpython-313.pyc +0 -0
- modules/__pycache__/validate_sequences.cpython-313.pyc +0 -0
- modules/exporter.py +21 -0
- modules/extract_signature.py +25 -0
- modules/fuse_perspectives.py +35 -0
- modules/generate_binders.py +43 -0
- modules/personalize_binders.py +48 -0
- modules/run_simulations.py +43 -0
- modules/validate_ethics.py +20 -0
- modules/validate_sequences.py +628 -0
- modules/validate_sequences.py.tmp +78 -0
- output.pdf +0 -0
- output/codette_antibody_designs_20250912_150658.json +88 -0
- output/sequence_analysis.png +3 -0
- output/triage_table.csv +8 -0
- output/validation_results_20250912_152239.json +295 -0
- reproduce.sh +20 -0
- requirements.txt +6 -0
- requirements_full.txt +0 -0
- run_manifest.json +31 -0
- run_pipeline.py +124 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
output/sequence_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
rustup-init.exe filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
build/
|
| 9 |
+
develop-eggs/
|
| 10 |
+
dist/
|
| 11 |
+
downloads/
|
| 12 |
+
eggs/
|
| 13 |
+
.eggs/
|
| 14 |
+
lib/
|
| 15 |
+
lib64/
|
| 16 |
+
parts/
|
| 17 |
+
sdist/
|
| 18 |
+
var/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
.env
|
| 25 |
+
.venv
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
|
| 29 |
+
# IDE
|
| 30 |
+
.idea/
|
| 31 |
+
.vscode/
|
| 32 |
+
*.swp
|
| 33 |
+
*.swo
|
| 34 |
+
|
| 35 |
+
# Project specific
|
| 36 |
+
rustup-init.exe
|
| 37 |
+
*.bin
|
| 38 |
+
output/*.json
|
| 39 |
+
output/*.png
|
| 40 |
+
!sample_outputs/*.json
|
| 41 |
+
!sample_outputs/*.png
|
CITATION.cff
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cff-version: 1.2.0
|
| 2 |
+
message: "If you use this software, please cite it using these metadata."
|
| 3 |
+
title: "Healdette: Ancestry-Aware Antibody Design Pipeline"
|
| 4 |
+
version: "1.0.0"
|
| 5 |
+
date-released: "2025-09-12"
|
| 6 |
+
doi: "10.57967/hf/5917"
|
| 7 |
+
authors:
|
| 8 |
+
- family-names: "Light"
|
| 9 |
+
given-names: "Jonathan Harrison"
|
| 10 |
+
orcid: "https://orcid.org/0009-0003-7005-8187"
|
| 11 |
+
repository-code: "https://github.com/Raiff1982/healdette"
|
| 12 |
+
abstract: >
|
| 13 |
+
Healdette is an ethics-aware, ancestry-informed system for designing
|
| 14 |
+
antibodies and nanobodies. It combines real biophysical models,
|
| 15 |
+
transformer-based protein sequence generation, structural simulation,
|
| 16 |
+
and cultural personalization.
|
| 17 |
+
keywords:
|
| 18 |
+
- antibody design
|
| 19 |
+
- machine learning
|
| 20 |
+
- personalized medicine
|
| 21 |
+
- bioinformatics
|
| 22 |
+
license: MIT
|
DATA_ETHICS.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data & Ethics
|
| 2 |
+
|
| 3 |
+
## Ancestry-Aware Modeling
|
| 4 |
+
This software implements ancestry-aware antibody design, considering genetic and immunological variations across different populations. The model:
|
| 5 |
+
- Incorporates HLA type matching (currently supporting 2 matches)
|
| 6 |
+
- Considers ancestry-specific immune responses
|
| 7 |
+
- Adapts to metabolic variations
|
| 8 |
+
|
| 9 |
+
## Ethics and Oversight
|
| 10 |
+
- All sequences are logged with ancestry context
|
| 11 |
+
- Ethical validation performed via CoreConscience system
|
| 12 |
+
- Full traceability of design decisions
|
| 13 |
+
- Rejection memory maintains record of discarded designs
|
| 14 |
+
|
| 15 |
+
## Ethics Statement
|
| 16 |
+
This software is designed to promote inclusive and ethical antibody development. We are committed to preventing misuse and ensuring equitable benefit across populations.
|
| 17 |
+
|
| 18 |
+
## Concerns or Questions
|
| 19 |
+
For ethics-related inquiries, please contact:
|
| 20 |
+
Jonathan Harrison Light (ethics@healdette.org)
|
| 21 |
+
ORCID: 0009-0003-7005-8187
|
| 22 |
+
|
| 23 |
+
## Cross-References
|
| 24 |
+
- GitHub Release: [v1.0.0](https://github.com/Raiff1982/healdette/releases/tag/v1.0.0)
|
| 25 |
+
- Hugging Face Model: [healdette/protgpt2-ancestry](https://huggingface.co/healdette/protgpt2-ancestry)
|
| 26 |
+
- DOI: [10.57967/hf/5917](https://doi.org/10.57967/hf/5917)
|
Download codette_antibody_pipeline.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"pipeline_name": "Codette Antibody Generator",
|
| 3 |
+
"description": "Enhanced antibody and nanobody design system using multi-perspective AI, simulation, and ethics filters.",
|
| 4 |
+
"version": "1.0",
|
| 5 |
+
"stages": [
|
| 6 |
+
{
|
| 7 |
+
"id": "target_input",
|
| 8 |
+
"name": "Target Input",
|
| 9 |
+
"description": "Protein or pathogen target provided by the user.",
|
| 10 |
+
"input_type": "protein_sequence | pathogen_id",
|
| 11 |
+
"output_type": "target_signature"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"id": "perspective_fusion",
|
| 15 |
+
"name": "Perspective Fusion",
|
| 16 |
+
"description": "Fusion of logical (Newton), creative (Da Vinci), quantum, and ethical perspectives.",
|
| 17 |
+
"input_type": "target_signature",
|
| 18 |
+
"output_type": "multimodal_context"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"id": "candidate_generation",
|
| 22 |
+
"name": "Candidate Generation",
|
| 23 |
+
"description": "Zero/low-shot generation of initial binders using universal reasoning.",
|
| 24 |
+
"input_type": "multimodal_context",
|
| 25 |
+
"output_type": "binder_candidates"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"id": "simulation_loop",
|
| 29 |
+
"name": "Simulation Loop",
|
| 30 |
+
"description": "Binding affinity, fold stability, and interaction modeling via structure-function engines.",
|
| 31 |
+
"input_type": "binder_candidates",
|
| 32 |
+
"tools": [
|
| 33 |
+
"RosettaFold",
|
| 34 |
+
"AlphaFold"
|
| 35 |
+
],
|
| 36 |
+
"output_type": "validated_binders"
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"id": "ethics_filter",
|
| 40 |
+
"name": "Ethics & Anomaly Filter",
|
| 41 |
+
"description": "Filters for dual-use risk, anomaly detection, and recursive ethical validation.",
|
| 42 |
+
"input_type": "validated_binders",
|
| 43 |
+
"output_type": "safe_binders"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"id": "biokinetic_personalization",
|
| 47 |
+
"name": "Biokinetic Personalization",
|
| 48 |
+
"description": "Adaptation of binders to patient's unique immune profile and biokinetic markers.",
|
| 49 |
+
"input_type": "safe_binders",
|
| 50 |
+
"output_type": "personalized_binders"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "output_designs",
|
| 54 |
+
"name": "Output Designs",
|
| 55 |
+
"description": "Final optimized binders for synthesis, trial, or therapeutic use.",
|
| 56 |
+
"input_type": "personalized_binders",
|
| 57 |
+
"output_type": "antibody_design_package"
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
}
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Jonathan Harrison
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
LICENSES.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Main Project License (MIT)
|
| 2 |
+
MIT License
|
| 3 |
+
Copyright (c) 2025 Jonathan Raiff
|
| 4 |
+
|
| 5 |
+
## Third-Party Components
|
| 6 |
+
|
| 7 |
+
### Transformers
|
| 8 |
+
- Package: transformers
|
| 9 |
+
- Version: 4.41.1
|
| 10 |
+
- License: Apache-2.0
|
| 11 |
+
- Source: https://github.com/huggingface/transformers
|
| 12 |
+
|
| 13 |
+
### ProtGPT2
|
| 14 |
+
- Model: nferruz/ProtGPT2
|
| 15 |
+
- License: MIT
|
| 16 |
+
- Citation: Ferruz, N. et al. (2024)
|
| 17 |
+
|
| 18 |
+
### BioPython
|
| 19 |
+
- Package: biopython
|
| 20 |
+
- Version: 1.81
|
| 21 |
+
- License: BSD-3-Clause
|
| 22 |
+
- Source: https://github.com/biopython/biopython
|
README.md
CHANGED
|
@@ -1,38 +1,3 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
tags:
|
| 4 |
-
- antibody-design
|
| 5 |
-
- protein-generation
|
| 6 |
-
- ethics-aware
|
| 7 |
-
- ancestry-aware
|
| 8 |
-
- Codette
|
| 9 |
-
- Healdette
|
| 10 |
-
- transparent-ai
|
| 11 |
-
- genomics
|
| 12 |
-
- bioAI
|
| 13 |
-
library_name: transformers
|
| 14 |
-
pipeline_tag: bio-sequence-design
|
| 15 |
-
language:
|
| 16 |
-
- code
|
| 17 |
-
metrics:
|
| 18 |
-
- name: stability_score
|
| 19 |
-
type: float
|
| 20 |
-
description: Hydrophobic/aromaticity composite for thermodynamic viability
|
| 21 |
-
- name: predicted_affinity
|
| 22 |
-
type: float
|
| 23 |
-
description: Entropy-based affinity estimate (0.0–1.0)
|
| 24 |
-
- name: personalization_score
|
| 25 |
-
type: float
|
| 26 |
-
description: Adjusted score based on ancestry, HLA, metabolism, and exposure
|
| 27 |
-
- name: rejection_reasons
|
| 28 |
-
type: string
|
| 29 |
-
description: Human-readable reasons for rejection (ethics or structure)
|
| 30 |
-
datasets: []
|
| 31 |
-
base_model:
|
| 32 |
-
- Rostlab/prot_bert
|
| 33 |
-
- nferruz/ProtGPT2
|
| 34 |
-
---
|
| 35 |
-
|
| 36 |
|
| 37 |
# Codette Antibody Pipeline
|
| 38 |
|
|
@@ -77,6 +42,3 @@ MIT License. Use responsibly. No closed-source derivatives allowed without attri
|
|
| 77 |
## Author
|
| 78 |
|
| 79 |
Jonathan Harrison (Raiff1982) + Codette
|
| 80 |
-
|
| 81 |
-
**DOI:** [10.57967/hf/5917](https://doi.org/10.57967/hf/5917)
|
| 82 |
-
**License:** MIT — with attribution, no dual-use harm permitted.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
# Codette Antibody Pipeline
|
| 3 |
|
|
|
|
| 42 |
## Author
|
| 43 |
|
| 44 |
Jonathan Harrison (Raiff1982) + Codette
|
|
|
|
|
|
|
|
|
RESULTS.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Results Note
|
| 2 |
+
Date: September 12, 2025
|
| 3 |
+
Version: 1.0.0
|
| 4 |
+
Commit: main-2025-09-12
|
| 5 |
+
DOI: 10.57967/hf/5917
|
| 6 |
+
|
| 7 |
+
Execution Details:
|
| 8 |
+
```bash
|
| 9 |
+
python run_pipeline.py --deterministic --seed 42
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
Environment:
|
| 13 |
+
- Python 3.10.8
|
| 14 |
+
- Environment hash: <SHA256 of pip freeze output>
|
| 15 |
+
- OS: Windows 10
|
| 16 |
+
- Hardware: CPU-only execution
|
| 17 |
+
|
| 18 |
+
Input Parameters:
|
| 19 |
+
- Ancestry profile: Native, Irish
|
| 20 |
+
- HLA matches: 2
|
| 21 |
+
- Prior exposures: SARS-CoV-2, Influenza-B
|
| 22 |
+
- Metabolic factor: 1.2
|
| 23 |
+
- Random seed: 42
|
| 24 |
+
|
| 25 |
+
Generated Sequences Analysis:
|
| 26 |
+
| ID | Length | Score | Disorder | Cys Pairs | N-glyc | GRAVY | pI |
|
| 27 |
+
|----|--------|--------|----------|------------|--------|--------|-----|
|
| 28 |
+
|
| 29 |
+
Key Findings:
|
| 30 |
+
1. Length distribution: 43-563 amino acids
|
| 31 |
+
2. Personalization scores: 0.62-0.78
|
| 32 |
+
3. Disorder scores: 0.185-0.677
|
| 33 |
+
4. Glycosylation sites: 7 total (avg 1.0 per sequence)
|
| 34 |
+
5. Cysteine pairs: 3/7 sequences have paired cysteines
|
| 35 |
+
|
| 36 |
+
Validation Status:
|
| 37 |
+
- Environment: See environment.yaml
|
| 38 |
+
- Checksums: See checksums.sha256
|
| 39 |
+
- Full results: validation_results_20250912_152239.json
|
| 40 |
+
|
| 41 |
+
For reproduction:
|
| 42 |
+
1. Clone repository
|
| 43 |
+
2. Install dependencies from environment.yaml
|
| 44 |
+
3. Run: python run_pipeline.py --deterministic
|
| 45 |
+
4. Verify checksums
|
binders.pdf
ADDED
|
Binary file (16.3 kB). View file
|
|
|
codette_antibody_pipeline_final_github.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28cadc3e622e4f36cd107b0473cd3dba6cfbbb409fd306bc0032c68de0365bb
|
| 3 |
+
size 7106
|
environment.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dependencies:
|
| 2 |
+
numpy: 2.3.3
|
| 3 |
+
transformers: 4.41.1
|
| 4 |
+
biopython: 1.81
|
| 5 |
+
matplotlib: 3.8.0
|
| 6 |
+
pandas: 2.1.1
|
| 7 |
+
torch: 2.0.1
|
| 8 |
+
tokenizers: 0.19.1
|
| 9 |
+
|
| 10 |
+
hardware:
|
| 11 |
+
cpu: x86_64 architecture
|
| 12 |
+
ram: 8GB minimum recommended
|
| 13 |
+
gpu: Optional, CUDA compatible
|
| 14 |
+
os: Windows/Linux/MacOS compatible
|
| 15 |
+
|
| 16 |
+
seeds:
|
| 17 |
+
random_seed: 42 # Used for reproducible sampling
|
| 18 |
+
numpy_seed: 42 # Used for numpy operations
|
| 19 |
+
torch_seed: 42 # Used for PyTorch operations
|
| 20 |
+
|
| 21 |
+
version_control:
|
| 22 |
+
commit_hash: main-2025-09-12 # Replace with actual hash
|
| 23 |
+
repository: https://github.com/Raiff1982/healdette
|
expirter.pdf
ADDED
|
Binary file (12.7 kB). View file
|
|
|
exporter 2.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
def export_designs(personalized_binders, format='json', output_dir='output'):
|
| 7 |
+
if format != 'json':
|
| 8 |
+
raise ValueError("Only JSON format is currently supported.")
|
| 9 |
+
|
| 10 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 11 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 12 |
+
json_file = os.path.join(output_dir, f"codette_antibody_designs_{timestamp}.json")
|
| 13 |
+
txt_file = os.path.join(output_dir, f"codette_antibody_summary_{timestamp}.txt")
|
| 14 |
+
|
| 15 |
+
with open(json_file, 'w') as f:
|
| 16 |
+
json.dump(personalized_binders, f, indent=4)
|
| 17 |
+
|
| 18 |
+
with open(txt_file, 'w') as txt:
|
| 19 |
+
txt.write("Codette Antibody Design Summary\n")
|
| 20 |
+
txt.write("="*40 + "\n")
|
| 21 |
+
for b in personalized_binders.get("personalized_binders", []):
|
| 22 |
+
txt.write(f"Sequence: {b['sequence']}\n")
|
| 23 |
+
txt.write(f"Score: {b['personalization_score']}\n")
|
| 24 |
+
txt.write(f"Ancestry: {', '.join(b['ancestry_tags'])}\n")
|
| 25 |
+
txt.write(f"HLA Matches: {b['hla_matches']}\n")
|
| 26 |
+
txt.write(f"Exposure Weight: {b['exposure_weight']}\n")
|
| 27 |
+
txt.write(f"Ethical Notice: {b['ethics_notice']}\n")
|
| 28 |
+
txt.write("-"*40 + "\n")
|
| 29 |
+
|
| 30 |
+
return {
|
| 31 |
+
"status": "success",
|
| 32 |
+
"output_file": json_file,
|
| 33 |
+
"summary_file": txt_file,
|
| 34 |
+
"binder_count": len(personalized_binders.get("personalized_binders", []))
|
| 35 |
+
}
|
fuse_perspectives 2.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import AutoTokenizer, AutoModel
|
| 4 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
+
import numpy as np
|
| 6 |
+
import sympy as sp
|
| 7 |
+
|
| 8 |
+
# Load ProtBert model from HuggingFace
|
| 9 |
+
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
|
| 10 |
+
model = AutoModel.from_pretrained("Rostlab/prot_bert")
|
| 11 |
+
|
| 12 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 13 |
+
|
| 14 |
+
def fuse_perspectives(target_signature, models=['newton', 'davinci', 'quantum', 'ethics']):
|
| 15 |
+
sequence = target_signature['cleaned_sequence']
|
| 16 |
+
encoded_input = tokenizer(sequence, return_tensors="pt")
|
| 17 |
+
with torch.no_grad():
|
| 18 |
+
embedding = model(**encoded_input).last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 19 |
+
|
| 20 |
+
# Normalize vector
|
| 21 |
+
norm_embedding = embedding / np.linalg.norm(embedding)
|
| 22 |
+
|
| 23 |
+
# Simulated reasoning output
|
| 24 |
+
sentiment = analyzer.polarity_scores(sequence)
|
| 25 |
+
symbolic_logic = sp.sympify(target_signature['isoelectric_point']) + sp.Rational(1, 3)
|
| 26 |
+
|
| 27 |
+
fused_output = {
|
| 28 |
+
"embedding_vector": norm_embedding.tolist(),
|
| 29 |
+
"sentiment_trace": sentiment,
|
| 30 |
+
"symbolic_logic_score": float(symbolic_logic),
|
| 31 |
+
"perspective_tags": models,
|
| 32 |
+
"reasoning_fusion": "Completed"
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
return fused_output
|
fusedop.pdf
ADDED
|
Binary file (14.8 kB). View file
|
|
|
generate_binders 2.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
+
import torch
|
| 4 |
+
import random
|
| 5 |
+
|
| 6 |
+
# Load ProtGPT2 or equivalent model
|
| 7 |
+
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
|
| 8 |
+
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")
|
| 9 |
+
|
| 10 |
+
def generate_binders(fusion_context, strategy='low_shot', num_candidates=10):
|
| 11 |
+
seed_sequence = fusion_context['embedding_vector'][:10]
|
| 12 |
+
seed = ''.join([chr(int(65 + abs(int(x * 10)) % 20)) for x in seed_sequence])
|
| 13 |
+
input_ids = tokenizer.encode(seed, return_tensors="pt")
|
| 14 |
+
|
| 15 |
+
outputs = model.generate(
|
| 16 |
+
input_ids,
|
| 17 |
+
do_sample=True,
|
| 18 |
+
top_k=950,
|
| 19 |
+
top_p=0.96,
|
| 20 |
+
temperature=1.0,
|
| 21 |
+
max_length=200,
|
| 22 |
+
num_return_sequences=num_candidates
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
binders = []
|
| 26 |
+
for output in outputs:
|
| 27 |
+
sequence = tokenizer.decode(output, skip_special_tokens=True)
|
| 28 |
+
sequence = ''.join([aa for aa in sequence if aa in "ACDEFGHIKLMNPQRSTVWY"])
|
| 29 |
+
if len(sequence) > 30:
|
| 30 |
+
binder_meta = {
|
| 31 |
+
"sequence": sequence,
|
| 32 |
+
"perspective_source": fusion_context["perspective_tags"],
|
| 33 |
+
"sentiment_trace": fusion_context["sentiment_trace"],
|
| 34 |
+
"symbolic_logic_score": fusion_context["symbolic_logic_score"]
|
| 35 |
+
}
|
| 36 |
+
binders.append(binder_meta)
|
| 37 |
+
|
| 38 |
+
return {"generated_binders": binders}
|
generate_triage_report.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate detailed triage report for antibody designs.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
def create_triage_report(results_json, output_file):
|
| 10 |
+
"""Create a detailed triage report in markdown format."""
|
| 11 |
+
with open(results_json, 'r') as f:
|
| 12 |
+
data = json.load(f)
|
| 13 |
+
|
| 14 |
+
report = []
|
| 15 |
+
report.append("# Antibody Design Triage Report")
|
| 16 |
+
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 17 |
+
|
| 18 |
+
# Summary statistics
|
| 19 |
+
report.append("## Summary Statistics")
|
| 20 |
+
report.append("| Metric | Value |")
|
| 21 |
+
report.append("| --- | --- |")
|
| 22 |
+
report.append(f"| Total Sequences | {len(data['personalized_binders'])} |")
|
| 23 |
+
|
| 24 |
+
# Triage table
|
| 25 |
+
report.append("\n## Sequence Analysis")
|
| 26 |
+
report.append("| ID | Length | Score | Disorder | Cys Pairs | Glyco | GRAVY | Status |")
|
| 27 |
+
report.append("| --- | --- | --- | --- | --- | --- | --- | --- |")
|
| 28 |
+
|
| 29 |
+
for i, binder in enumerate(data['validated_binders'], 1):
|
| 30 |
+
val = binder['validation']
|
| 31 |
+
status = "PASS" if (
|
| 32 |
+
val['disorder'] <= 0.5 and
|
| 33 |
+
not val['signal_peptide']['has_signal'] and
|
| 34 |
+
val['cysteines']['paired'] and
|
| 35 |
+
-1.0 <= val['properties']['GRAVY'] <= 1.0
|
| 36 |
+
) else "FAIL"
|
| 37 |
+
|
| 38 |
+
report.append(
|
| 39 |
+
f"| {i} | {len(binder['sequence'])} | "
|
| 40 |
+
f"{binder['personalization_score']:.3f} | "
|
| 41 |
+
f"{val['disorder']:.3f} | "
|
| 42 |
+
f"{val['cysteines']['count']//2} | "
|
| 43 |
+
f"{len(val['glycosylation'])} | "
|
| 44 |
+
f"{val['properties']['GRAVY']:.3f} | "
|
| 45 |
+
f"{status} |"
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
# Failure analysis
|
| 49 |
+
report.append("\n## Failure Analysis")
|
| 50 |
+
failure_counts = {
|
| 51 |
+
"High Disorder": sum(1 for b in data['validated_binders']
|
| 52 |
+
if b['validation']['disorder'] > 0.5),
|
| 53 |
+
"Signal Peptide": sum(1 for b in data['validated_binders']
|
| 54 |
+
if b['validation']['signal_peptide']['has_signal']),
|
| 55 |
+
"Unpaired Cys": sum(1 for b in data['validated_binders']
|
| 56 |
+
if not b['validation']['cysteines']['paired']),
|
| 57 |
+
"GRAVY Outside Range": sum(1 for b in data['validated_binders']
|
| 58 |
+
if not -1.0 <= b['validation']['properties']['GRAVY'] <= 1.0)
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
for reason, count in failure_counts.items():
|
| 62 |
+
report.append(f"- {reason}: {count} sequences")
|
| 63 |
+
|
| 64 |
+
# Write report
|
| 65 |
+
with open(output_file, 'w') as f:
|
| 66 |
+
f.write('\n'.join(report))
|
| 67 |
+
|
| 68 |
+
if __name__ == "__main__":
|
| 69 |
+
results_json = "output/validation_results_20250912_152239.json"
|
| 70 |
+
output_file = "output/triage_report.md"
|
| 71 |
+
create_triage_report(results_json, output_file)
|
healdette_codette_upload.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28cadc3e622e4f36cd107b0473cd3dba6cfbbb409fd306bc0032c68de0365bb
|
| 3 |
+
size 7106
|
main.pdf
ADDED
|
Binary file (16.7 kB). View file
|
|
|
main.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from modules.extract_signature import extract_signature
|
| 3 |
+
from modules.fuse_perspectives import fuse_perspectives
|
| 4 |
+
from modules.generate_binders import generate_binders
|
| 5 |
+
from modules.run_simulations import run_simulations
|
| 6 |
+
from modules.validate_ethics import validate_ethics
|
| 7 |
+
from modules.personalize_binders import personalize_binders
|
| 8 |
+
from modules.exporter import export_designs
|
| 9 |
+
|
| 10 |
+
def codette_pipeline(target_input):
|
| 11 |
+
# Stage 1: Extract Signature
|
| 12 |
+
sig = extract_signature(target_input)
|
| 13 |
+
|
| 14 |
+
# Stage 2: Perspective Fusion
|
| 15 |
+
context = fuse_perspectives(sig)
|
| 16 |
+
|
| 17 |
+
# Stage 3: Candidate Generation
|
| 18 |
+
candidates = generate_binders(context)
|
| 19 |
+
|
| 20 |
+
# Stage 4: Simulations
|
| 21 |
+
scored = run_simulations(candidates)
|
| 22 |
+
|
| 23 |
+
# Stage 5: Ethics Filter
|
| 24 |
+
ethics_checked = validate_ethics(scored)
|
| 25 |
+
|
| 26 |
+
# Stage 6: Personalization
|
| 27 |
+
personalized = personalize_binders(ethics_checked, patient_data={
|
| 28 |
+
"immune_profile": ["A*24:02", "B*27:05"],
|
| 29 |
+
"metabolic_rate": 1.2,
|
| 30 |
+
"prior_exposure": ["SARS-CoV-2", "Influenza-B"],
|
| 31 |
+
"ancestry_profile": ["Native", "Irish"]
|
| 32 |
+
})
|
| 33 |
+
|
| 34 |
+
# Stage 7: Export
|
| 35 |
+
result = export_designs(personalized)
|
| 36 |
+
return result
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
|
| 39 |
+
# Example input
|
| 40 |
+
test_seq = "MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFD"
|
| 41 |
+
output = codette_pipeline(test_seq)
|
| 42 |
+
print(output)
|
modules/__init__.py
ADDED
|
File without changes
|
modules/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
modules/__pycache__/exporter.cpython-313.pyc
ADDED
|
Binary file (1.24 kB). View file
|
|
|
modules/__pycache__/extract_signature.cpython-313.pyc
ADDED
|
Binary file (1.2 kB). View file
|
|
|
modules/__pycache__/fuse_perspectives.cpython-313.pyc
ADDED
|
Binary file (1.95 kB). View file
|
|
|
modules/__pycache__/generate_binders.cpython-313.pyc
ADDED
|
Binary file (2.14 kB). View file
|
|
|
modules/__pycache__/personalize_binders.cpython-313.pyc
ADDED
|
Binary file (2.69 kB). View file
|
|
|
modules/__pycache__/run_simulations.cpython-313.pyc
ADDED
|
Binary file (2.56 kB). View file
|
|
|
modules/__pycache__/validate_ethics.cpython-313.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
modules/__pycache__/validate_sequences.cpython-313.pyc
ADDED
|
Binary file (25.7 kB). View file
|
|
|
modules/exporter.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
def export_designs(personalized_binders, format='json', output_dir='output'):
|
| 7 |
+
if format != 'json':
|
| 8 |
+
raise ValueError("Only JSON format is currently supported.")
|
| 9 |
+
|
| 10 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 11 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 12 |
+
output_file = os.path.join(output_dir, f"codette_antibody_designs_{timestamp}.json")
|
| 13 |
+
|
| 14 |
+
with open(output_file, 'w') as f:
|
| 15 |
+
json.dump(personalized_binders, f, indent=4)
|
| 16 |
+
|
| 17 |
+
return {
|
| 18 |
+
"status": "success",
|
| 19 |
+
"output_file": output_file,
|
| 20 |
+
"binder_count": len(personalized_binders.get("personalized_binders", []))
|
| 21 |
+
}
|
modules/extract_signature.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
| 4 |
+
|
| 5 |
+
def extract_signature(seq_input):
|
| 6 |
+
"""
|
| 7 |
+
Extracts and analyzes a protein sequence using real bio-physical computations.
|
| 8 |
+
Returns a dict with molecular properties.
|
| 9 |
+
"""
|
| 10 |
+
# Clean sequence
|
| 11 |
+
seq = re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', seq_input.upper())
|
| 12 |
+
if len(seq) < 30:
|
| 13 |
+
raise ValueError("Sequence too short for reliable analysis.")
|
| 14 |
+
|
| 15 |
+
# Perform analysis
|
| 16 |
+
analysis = ProteinAnalysis(seq)
|
| 17 |
+
return {
|
| 18 |
+
"cleaned_sequence": seq,
|
| 19 |
+
"length": len(seq),
|
| 20 |
+
"molecular_weight": analysis.molecular_weight(),
|
| 21 |
+
"aromaticity": analysis.aromaticity(),
|
| 22 |
+
"instability_index": analysis.instability_index(),
|
| 23 |
+
"isoelectric_point": analysis.isoelectric_point(),
|
| 24 |
+
"gravy": analysis.gravy()
|
| 25 |
+
}
|
modules/fuse_perspectives.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import AutoTokenizer, AutoModel
|
| 4 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
+
import numpy as np
|
| 6 |
+
import sympy as sp
|
| 7 |
+
|
| 8 |
+
# Load ProtBert model from HuggingFace
|
| 9 |
+
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
|
| 10 |
+
model = AutoModel.from_pretrained("Rostlab/prot_bert")
|
| 11 |
+
|
| 12 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 13 |
+
|
| 14 |
+
def fuse_perspectives(target_signature, models=['newton', 'davinci', 'quantum', 'ethics']):
    """
    Fuse several "perspectives" on a target signature into one dict.

    Combines a ProtBert embedding of the cleaned sequence, a VADER
    sentiment trace, and a small symbolic score derived from the
    isoelectric point.

    Args:
        target_signature: Dict from extract_signature(); must contain
            'cleaned_sequence' and 'isoelectric_point'.
        models: Perspective tags copied verbatim into the output.

    Returns:
        Dict with the unit-normalized embedding vector, sentiment trace,
        symbolic logic score, perspective tags, and a completion marker.
    """
    sequence = target_signature['cleaned_sequence']
    # BUG FIX: ProtBert expects residues separated by single spaces
    # ("M K T ..."); feeding the raw string collapses the whole sequence
    # into a handful of word-piece tokens and degrades the embedding.
    spaced_sequence = " ".join(sequence)
    encoded_input = tokenizer(spaced_sequence, return_tensors="pt")
    with torch.no_grad():
        embedding = model(**encoded_input).last_hidden_state.mean(dim=1).squeeze().numpy()

    # Normalize to unit length so downstream consumers can compare embeddings.
    norm_embedding = embedding / np.linalg.norm(embedding)

    # Simulated reasoning output: sentiment over the raw letter string is a
    # placeholder signal, not a biological property.
    sentiment = analyzer.polarity_scores(sequence)
    symbolic_logic = sp.sympify(target_signature['isoelectric_point']) + sp.Rational(1, 3)

    fused_output = {
        "embedding_vector": norm_embedding.tolist(),
        "sentiment_trace": sentiment,
        "symbolic_logic_score": float(symbolic_logic),
        "perspective_tags": models,
        "reasoning_fusion": "Completed"
    }

    return fused_output
|
modules/generate_binders.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import random

# Load ProtGPT2 or equivalent model.
# NOTE: executed at import time; downloads weights on first use.
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")

# ProtGPT2 ships without a pad token; generation with padding needs one,
# so fall back to reusing the EOS token as pad (pad == eos here).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
| 14 |
+
|
| 15 |
+
def generate_binders(fusion_context, strategy='low_shot', num_candidates=10):
    """
    Sample candidate binder sequences from ProtGPT2.

    A short seed is derived from the first ten embedding values, the
    language model is sampled num_candidates times, and the decoded
    outputs are filtered down to canonical amino-acid letters.

    Args:
        fusion_context: Dict from fuse_perspectives(); must contain
            'embedding_vector'.
        strategy: Reserved for future use (currently unused).
        num_candidates: Number of sequences to sample.

    Returns:
        Dict with 'generated_binders': list of {"sequence": str} entries,
        keeping only sequences longer than 30 residues.
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seed_values = fusion_context['embedding_vector'][:10]
    # BUG FIX: the previous chr(65 + k % 20) mapping yielded letters A..T,
    # which include B, J and O — not valid amino acids. Index into the
    # canonical alphabet instead so the seed is always a legal protein prefix.
    seed = ''.join(amino_acids[abs(int(x * 10)) % 20] for x in seed_values)

    # Create input tensors with an explicit attention mask (required when
    # pad == eos so generate() can distinguish padding from content).
    inputs = tokenizer(seed, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        top_k=950,          # ProtGPT2's recommended sampling setting
        top_p=0.96,
        temperature=1.0,
        max_length=200,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.pad_token_id
    )

    binders = []
    for output in outputs:
        sequence = tokenizer.decode(output, skip_special_tokens=True)
        # Keep only canonical residues (drops the whitespace/newlines
        # ProtGPT2 emits between FASTA-style chunks).
        sequence = ''.join(aa for aa in sequence if aa in amino_acids)
        if len(sequence) > 30:
            binders.append({"sequence": sequence})

    return {"generated_binders": binders}
|
modules/personalize_binders.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import random
|
| 3 |
+
|
| 4 |
+
# Simplified population HLA frequency references (can be expanded with real datasets)
|
| 5 |
+
# Simplified population HLA frequency references (can be expanded with real datasets)
HLA_REFERENCE = {
    "Native": ["A*24:02", "B*35:01", "C*04:01"],
    "Irish": ["A*01:01", "B*27:05", "C*07:01"]
}

# Immunological exposure impact (dummy scoring based on pathogen diversity)
def exposure_boost(sequence, exposure_list):
    """Return 0.05 per exposure token found (case-insensitively) in the sequence."""
    hits = sum(1 for virus in exposure_list if virus.lower() in sequence.lower())
    return round(0.05 * hits, 4)

def personalize_binders(validated_input, patient_data):
    """
    Re-score validated binders for a specific patient profile.

    Each binder's base score (mean of stability and affinity) is scaled by
    three weights: HLA allele matches against the patient's ancestry
    reference, prior-pathogen exposure hits in the sequence, and the
    inverse of the metabolic rate (faster metabolism = lower effective dose).

    Args:
        validated_input: Dict with 'validated_binders' entries carrying
            'sequence', 'stability_score' and 'predicted_affinity'.
        patient_data: Dict with optional keys 'ancestry_profile',
            'immune_profile', 'prior_exposure' and 'metabolic_rate'.

    Returns:
        Dict with 'personalized_binders': per-binder score and provenance.
    """
    ancestry_tags = patient_data.get("ancestry_profile", ["Irish"])
    immune_profile = patient_data.get("immune_profile", [])
    exposure_history = patient_data.get("prior_exposure", [])
    metabolic_factor = float(patient_data.get("metabolic_rate", 1.0))
    # BUG FIX: a zero metabolic rate previously raised ZeroDivisionError and
    # a negative one produced a nonsensical negative weight; fall back to 1.0.
    if metabolic_factor <= 0:
        metabolic_factor = 1.0

    personalized_output = []
    for binder in validated_input.get("validated_binders", []):
        sequence = binder["sequence"]
        base_score = (binder["stability_score"] + binder["predicted_affinity"]) / 2

        # Adjust for HLA presence: count patient alleles that appear in any
        # of the ancestry reference panels.
        hla_match = 0
        for tag in ancestry_tags:
            common_hlas = HLA_REFERENCE.get(tag, [])
            hla_match += sum(1 for allele in immune_profile if allele in common_hlas)

        hla_weight = 1.0 + (hla_match * 0.05)
        exposure_weight = 1.0 + exposure_boost(sequence, exposure_history)
        metabolism_weight = 1.0 / metabolic_factor  # faster metabolism = lower effective dose

        personalization_score = round(base_score * hla_weight * exposure_weight * metabolism_weight, 4)

        personalized_output.append({
            "sequence": sequence,
            "personalization_score": personalization_score,
            "ancestry_tags": ancestry_tags,
            "hla_matches": hla_match,
            "metabolic_factor": metabolic_factor,
            "exposure_weight": round(exposure_weight, 3),
            "ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
        })

    return {"personalized_binders": personalized_output}
|
modules/run_simulations.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import numpy as np
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
def evaluate_stability(seq):
    """
    Heuristic stability score: weighted hydrophobic fraction plus an
    absolute aromatic-residue count bonus.

    Note: the aromatic term is a raw count (not a fraction), so long
    aromatic-rich sequences can score above 1.0.
    """
    hydrophobic_fraction = sum(aa in "AILMFWYV" for aa in seq) / len(seq)
    aromatic_count = sum(seq.count(res) for res in "FWY")
    return round(hydrophobic_fraction * 0.6 + aromatic_count * 0.1, 4)
|
| 10 |
+
|
| 11 |
+
def evaluate_affinity(seq):
    """
    Heuristic affinity score: rewards low residue diversity, plus a random
    jitter in [0.1, 0.3] that simulates experimental noise.

    Non-deterministic: repeated calls on the same sequence differ.
    """
    diversity = len(set(seq)) / len(seq)
    noise = random.uniform(0.1, 0.3)
    return round((1 - diversity) * 0.8 + noise, 4)
|
| 14 |
+
|
| 15 |
+
def run_simulations(binder_candidates, engines=['SimLite']):
    """
    Score generated binders with lightweight heuristics and split them
    into accepted and rejected sets.

    Binders scoring below 0.3 on either stability or predicted affinity
    are rejected with explanatory reasons; accepted binders are annotated
    in place with their scores, the engine used, and a short trace.

    Args:
        binder_candidates: Dict with a 'generated_binders' list of
            {"sequence": str} entries.
        engines: Structure-engine names; only the first is recorded.

    Returns:
        Dict with 'validated_binders' and 'rejected_binders' lists.
    """
    accepted = []
    rejected = []

    for candidate in binder_candidates.get("generated_binders", []):
        seq = candidate["sequence"]
        stability = evaluate_stability(seq)
        affinity = evaluate_affinity(seq)

        failure_reasons = []
        if stability < 0.3:
            failure_reasons.append("Low stability score")
        if affinity < 0.3:
            failure_reasons.append("Low predicted affinity")

        if failure_reasons:
            candidate["rejection_reason"] = failure_reasons
            rejected.append(candidate)
            continue

        candidate["stability_score"] = stability
        candidate["predicted_affinity"] = affinity
        candidate["structure_engine"] = engines[0]
        candidate["simulation_trace"] = f"Hydrophobic: {round(stability, 3)}, Entropy-Based Affinity: {round(affinity, 3)}"
        accepted.append(candidate)

    return {
        "validated_binders": accepted,
        "rejected_binders": rejected
    }
|
modules/validate_ethics.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def validate_ethics(simulation_result, cultural_models=['Ubuntu', 'Indigenous', 'Western']):
    """
    Screen simulated binders for dual-use risk motifs.

    A binder whose sequence contains any of the substrings TOX, VIR or
    KILL is marked rejected; all others are approved and annotated with
    the cultural models considered. Binder dicts are mutated in place.

    Args:
        simulation_result: Dict with a 'validated_binders' list.
        cultural_models: Tags recorded on each approved binder.

    Returns:
        Dict with 'validated_binders' (approved) and 'ethics_rejections'.
    """
    approved = []
    flagged = []
    risk_motifs = ("TOX", "VIR", "KILL")

    for binder in simulation_result.get("validated_binders", []):
        sequence = binder["sequence"]
        if any(motif in sequence for motif in risk_motifs):
            binder["ethics_status"] = "rejected"
            binder["ethos_trace"] = "Rejected due to potential dual-use risk: toxic or viral motif match"
            flagged.append(binder)
        else:
            binder["ethics_status"] = "approved"
            binder["ethos_trace"] = "Passed ethical review: no dual-use motifs detected"
            binder["ethical_models_considered"] = cultural_models
            approved.append(binder)

    return {"validated_binders": approved, "ethics_rejections": flagged}
|
modules/validate_sequences.py
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comprehensive validation module for antibody sequences.
|
| 3 |
+
Performs computational checks for various sequence properties and potential issues.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# Standard library imports
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
import math
|
| 10 |
+
from typing import Dict, List, Tuple
|
| 11 |
+
|
| 12 |
+
class SequenceValidator:
    """Computational validation of antibody/binder amino-acid sequences."""

    # Class-level pKa values matching BioPython's ProtParam implementation
    pka_values = {
        'K': 10.0,      # Lysine
        'R': 12.0,      # Arginine
        'H': 6.0,       # Histidine
        'D': 4.0,       # Aspartic acid
        'E': 4.4,       # Glutamic acid
        'C': 8.5,       # Cysteine
        'Y': 10.0,      # Tyrosine
        'N_term': 8.0,  # N-terminus
        'C_term': 3.1   # C-terminus
    }

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        Args:
            sequence: The amino acid sequence to validate (upper-cased on entry)
            config: Optional configuration dictionary with validation parameters
        """
        self.sequence = sequence.upper()
        self.config = config or {}

        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3,  # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6  # Min fraction of hydrophobic residues in H-region
            }
        }

        # Merge provided config with defaults, per-parameter, so callers may
        # override only a subset of each section.
        for key, default_values in self.default_config.items():
            if key not in self.config:
                self.config[key] = {}
            for param, value in default_values.items():
                self.config[key][param] = self.config.get(key, {}).get(param, value)

    def analyze_complexity(self) -> Dict:
        """
        Analyze sequence complexity focusing on issues that could affect binder stability and function:
        - Homopolymer runs (4+ identical residues)
        - A/Q/P-heavy regions (>40% in any 10-residue window)
        - Overall amino acid diversity

        Returns:
            Dict containing complexity analysis results
        """
        def find_homopolymers(min_length: int = 4) -> List[Dict]:
            """Find runs of identical amino acids."""
            runs = []
            current_aa = None
            current_start = 0
            current_length = 0

            for i, aa in enumerate(self.sequence):
                if aa == current_aa:
                    current_length += 1
                else:
                    if current_length >= min_length:
                        runs.append({
                            "amino_acid": current_aa,
                            "start": current_start,
                            "length": current_length
                        })
                    current_aa = aa
                    current_start = i
                    current_length = 1

            # Check final run
            if current_length >= min_length:
                runs.append({
                    "amino_acid": current_aa,
                    "start": current_start,
                    "length": current_length
                })

            return runs

        def analyze_aqp_regions(window_size: int = 10, threshold: float = 0.4) -> List[Dict]:
            """Find regions with high A/Q/P content."""
            problem_regions = []
            for i in range(len(self.sequence) - window_size + 1):
                window = self.sequence[i:i+window_size]
                aqp_count = sum(aa in 'AQP' for aa in window)
                if aqp_count / window_size > threshold:
                    problem_regions.append({
                        "start": i,
                        "sequence": window,
                        "aqp_fraction": round(aqp_count / window_size, 2)
                    })
            return problem_regions

        # Calculate overall amino acid frequencies
        aa_counts = {}
        for aa in self.sequence:
            aa_counts[aa] = aa_counts.get(aa, 0) + 1

        # Calculate Shannon entropy for sequence diversity
        total_aas = len(self.sequence)
        entropy = 0
        for count in aa_counts.values():
            p = count / total_aas
            entropy -= p * math.log2(p)

        # Overall A/Q/P percentage
        aqp_total = sum(aa_counts.get(aa, 0) for aa in 'AQP')
        aqp_percentage = round(100 * aqp_total / total_aas, 1)

        # FIX: previously find_homopolymers() was evaluated twice (once for
        # the result, once for the warning flag); compute once and reuse.
        homopolymers = find_homopolymers()

        return {
            "homopolymer_runs": homopolymers,
            "aqp_heavy_regions": analyze_aqp_regions(),
            "sequence_entropy": round(entropy, 2),
            "unique_aas": len(aa_counts),
            "aqp_percentage": aqp_percentage,
            "warnings": {
                "low_complexity": entropy < 3.0,
                "high_aqp": aqp_percentage > 35,
                "has_homopolymers": bool(homopolymers)
            }
        }

    def predict_disorder(self) -> float:
        """
        Simple disorder prediction based on amino acid propensities.
        Returns fraction of residues predicted to be disordered.
        """
        # Disorder-promoting residues (based on literature)
        disorder_prone = set('RKEPNDQSG')
        disorder_count = sum(1 for aa in self.sequence if aa in disorder_prone)
        return disorder_count / len(self.sequence)

    def check_signal_peptide(self) -> Dict:
        """
        Enhanced signal peptide detection for binder peptides/scaffolds.

        Features analyzed:
        - N-region: Basic amino acids (K/R)
        - H-region: Hydrophobic core
        - C-region: (-3, -1) rule with small neutral amino acids
        - Length constraints
        - Position-specific amino acid preferences

        Returns:
            Dict containing detailed signal peptide analysis
        """
        config = self.config['signal_peptide']

        if not config['enabled']:
            return {
                "enabled": False,
                "has_signal": False,
                "confidence": 0.0,
                "details": "Signal peptide detection disabled in configuration"
            }

        if len(self.sequence) < config['min_length']:
            return {
                "enabled": True,
                "has_signal": False,
                "confidence": 1.0,
                "details": f"Sequence too short (min {config['min_length']} residues required)"
            }

        # Dynamic region sizing based on sequence length
        n_region_length = min(6, len(self.sequence) // 5)
        h_region_length = min(12, len(self.sequence) // 3)
        c_region_length = 5

        total_sp_length = min(
            n_region_length + h_region_length + c_region_length,
            config['max_length']
        )

        # Extract regions
        n_region = self.sequence[:n_region_length]
        h_region = self.sequence[n_region_length:n_region_length + h_region_length]
        c_region = self.sequence[n_region_length + h_region_length:total_sp_length]

        # Analyze N-region (positive charge)
        n_region_basic = sum(aa in 'KR' for aa in n_region)
        n_region_score = n_region_basic / len(n_region)
        n_region_valid = n_region_score >= config['n_region_basic_threshold']

        # Analyze H-region (hydrophobic core)
        hydrophobic = set('AILMFWV')
        h_region_hydrophobic = sum(aa in hydrophobic for aa in h_region)
        h_region_score = h_region_hydrophobic / len(h_region)
        h_region_valid = h_region_score >= config['h_region_hydrophobic_threshold']

        # Analyze C-region (-3, -1 rule)
        c_region_valid = False
        if len(c_region) >= 3:
            small_neutral = set('AGST')
            c_region_pattern = (
                c_region[-3] in small_neutral and
                c_region[-1] in small_neutral
            )
            # Check for proline disruption
            no_proline_disruption = 'P' not in c_region[-3:]
            c_region_valid = c_region_pattern and no_proline_disruption

        # Calculate overall confidence (invalid regions contribute zero)
        feature_scores = [
            n_region_score if n_region_valid else 0,
            h_region_score if h_region_valid else 0,
            1.0 if c_region_valid else 0
        ]
        confidence = sum(feature_scores) / len(feature_scores)

        has_signal = confidence >= config['confidence_threshold']

        # Prepare detailed analysis
        details = {
            "n_region": {
                "sequence": n_region,
                "basic_fraction": round(n_region_score, 2),
                "valid": n_region_valid
            },
            "h_region": {
                "sequence": h_region,
                "hydrophobic_fraction": round(h_region_score, 2),
                "valid": h_region_valid
            },
            "c_region": {
                "sequence": c_region,
                "valid": c_region_valid
            }
        }

        result = {
            "enabled": True,
            "has_signal": has_signal,
            "confidence": round(confidence, 2),
            "details": details,
            "signal_sequence": self.sequence[:total_sp_length] if has_signal else None,
            "mature_sequence": self.sequence[total_sp_length:] if has_signal and config['strip'] else self.sequence
        }

        return result

    def analyze_cysteines(self) -> Dict:
        """
        Analyze cysteine patterns and potential disulfide bonds in binder peptides/scaffolds.

        Performs comprehensive analysis of:
        - Cysteine count and positions
        - Potential disulfide pair arrangements
        - Spacing between cysteines
        - Common scaffold motif matching

        Returns:
            Dict containing detailed cysteine analysis results
        """
        # FIX: positions/count were previously computed twice back-to-back;
        # compute once here.
        cys_positions = [i for i, aa in enumerate(self.sequence) if aa == 'C']
        n_cys = len(cys_positions)

        # Initialize variables
        spacing_list = []
        pairs = []
        unpaired = []
        motifs = {
            'terminal_pair': False,
            'ladder': False,
            'clustered': False
        }

        # Calculate spacing between consecutive cysteines
        if n_cys > 1:
            spacing_list = [cys_positions[i+1] - cys_positions[i]
                            for i in range(len(cys_positions)-1)]

            # Look for common scaffold motifs
            motifs = {
                'terminal_pair': n_cys == 2 and spacing_list[0] >= len(self.sequence) * 0.6,
                'ladder': all(3 <= s <= 8 for s in spacing_list),
                'clustered': all(s <= 4 for s in spacing_list)
            }

        # Find best pairing arrangement based on spacing
        if n_cys % 2 == 0:  # Even number of cysteines
            # Try sequential pairing first
            for i in range(0, n_cys, 2):
                if i+1 < n_cys:
                    pair_spacing = cys_positions[i+1] - cys_positions[i]
                    pairs.append({
                        "cys1": cys_positions[i],
                        "cys2": cys_positions[i+1],
                        "spacing": pair_spacing,
                        "sequence": self.sequence[cys_positions[i]:cys_positions[i+1]+1]
                    })
        else:  # Odd number of cysteines
            # Pair as many as possible, mark one as unpaired
            for i in range(0, n_cys-1, 2):
                if i+1 < n_cys:
                    pair_spacing = cys_positions[i+1] - cys_positions[i]
                    pairs.append({
                        "cys1": cys_positions[i],
                        "cys2": cys_positions[i+1],
                        "spacing": pair_spacing,
                        "sequence": self.sequence[cys_positions[i]:cys_positions[i+1]+1]
                    })
            unpaired.append(cys_positions[-1])

        # Evaluate scaffold potential based on cysteine patterns
        scaffold_evaluation = {
            "suitable_scaffold": n_cys >= 2 and (
                motifs.get('terminal_pair', False) or
                motifs.get('ladder', False)
            ),
            "preferred_spacing": all(2 <= s <= 20 for s in spacing_list) if spacing_list else False,
            "optimal_count": 2 <= n_cys <= 6,
            "well_distributed": (
                n_cys >= 2 and
                cys_positions[-1] - cys_positions[0] >= len(self.sequence) * 0.3
            )
        }

        return {
            "count": n_cys,
            "positions": cys_positions,
            "spacing": spacing_list,
            "patterns": {
                "paired": n_cys % 2 == 0,
                "potential_pairs": pairs,
                "unpaired": unpaired,
                "motifs": motifs
            },
            "scaffold_evaluation": scaffold_evaluation,
            "warnings": [
                warning for warning in [
                    "Odd number of cysteines" if n_cys % 2 != 0 else None,
                    "Suboptimal cysteine count" if not scaffold_evaluation["optimal_count"] else None,
                    "Poor cysteine distribution" if not scaffold_evaluation["well_distributed"] and n_cys >= 2 else None,
                    "No cysteines found" if n_cys == 0 else None
                ] if warning is not None
            ]
        }

    def find_glycosylation_sites(self) -> List[Dict]:
        """
        Identify potential N-glycosylation sites (N-X-S/T, X != P).
        """
        pattern = re.compile('N[^P][ST]')
        sites = []

        for match in pattern.finditer(self.sequence):
            sites.append({
                "position": match.start(),
                "motif": self.sequence[match.start():match.start()+3]
            })

        return sites

    def charge_at_ph(self, ph: float) -> float:
        """
        Calculate the net charge of the peptide at a given pH.
        Follows BioPython's implementation for exact match.
        """
        charge = 0

        # Count occurrences of charged amino acids
        aa_count = {aa: self.sequence.count(aa) for aa in 'KRHDEYC'}

        # N-terminus
        charge += 1.0 / (1.0 + 10.0**(ph - self.pka_values['N_term']))

        # C-terminus
        charge -= 1.0 / (1.0 + 10.0**(self.pka_values['C_term'] - ph))

        # Lysine
        charge += aa_count['K'] / (1.0 + 10.0**(ph - self.pka_values['K']))

        # Arginine
        charge += aa_count['R'] / (1.0 + 10.0**(ph - self.pka_values['R']))

        # Histidine
        charge += aa_count['H'] / (1.0 + 10.0**(ph - self.pka_values['H']))

        # Aspartic Acid
        charge -= aa_count['D'] / (1.0 + 10.0**(self.pka_values['D'] - ph))

        # Glutamic Acid
        charge -= aa_count['E'] / (1.0 + 10.0**(self.pka_values['E'] - ph))

        # Cysteine
        charge -= aa_count['C'] / (1.0 + 10.0**(self.pka_values['C'] - ph))

        # Tyrosine
        charge -= aa_count['Y'] / (1.0 + 10.0**(self.pka_values['Y'] - ph))

        return charge

    def calculate_properties(self) -> Dict:
        """
        Calculate various physicochemical properties.
        """
        # Kyte & Doolittle hydropathy values
        hydropathy = {
            'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
            'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
            'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
            'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
        }

        # Calculate GRAVY (Grand Average of Hydropathy)
        gravy = sum(hydropathy[aa] for aa in self.sequence) / len(self.sequence)

        # Calculate molecular weight (average residue masses, no water correction)
        weights = {
            'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
            'Q': 146.2, 'E': 147.1, 'G': 75.1, 'H': 155.2, 'I': 131.2,
            'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
            'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1
        }
        mw = sum(weights[aa] for aa in self.sequence)

        # Calculate pI using a modified binary search approach
        def find_pi() -> float:
            """
            Find the isoelectric point optimized for Codette binder analysis.
            Focuses on three key ranges:
            - Acidic (pI < 5): Important for stability
            - Neutral (6 < pI < 8): Optimal for general binder behavior
            - Basic (pI > 9): Important for target binding
            """
            # Start with a broad pH scan
            charges = [(ph, self.charge_at_ph(ph)) for ph in range(0, 15)]

            # Find adjacent points where charge changes sign
            for i in range(len(charges) - 1):
                if charges[i][1] * charges[i+1][1] <= 0:
                    ph1, charge1 = charges[i]
                    ph2, charge2 = charges[i+1]
                    break
            else:
                # Special case for purely neutral sequences
                total_charge = sum(aa in 'KRHDECY' for aa in self.sequence)
                if total_charge == 0:
                    return 7.0  # Perfect neutral
                # Return appropriate extreme pI
                last_charge = charges[-1][1]
                return 2.0 if last_charge < 0 else 12.0

            # Interpolate initial estimate
            if abs(charge1 - charge2) < 0.0001:
                pi_estimate = (ph1 + ph2) / 2
            else:
                pi_estimate = ph1 + (0 - charge1) * (ph2 - ph1) / (charge2 - charge1)

            # Fine-tune with binary search
            ph_min = max(0.0, pi_estimate - 0.5)
            ph_max = min(14.0, pi_estimate + 0.5)

            for _ in range(10):  # Limited iterations for stability
                ph_mid = (ph_min + ph_max) / 2
                charge = self.charge_at_ph(ph_mid)

                if abs(charge) < 0.0001:
                    return round(ph_mid, 2)
                elif charge > 0:
                    ph_min = ph_mid
                else:
                    ph_max = ph_mid

            final_pi = round((ph_min + ph_max) / 2, 2)

            # Adjust to preferred ranges for Codette binders
            # NOTE(review): this deliberately snaps near-neutral/basic pI
            # values; confirm downstream consumers expect the adjusted value.
            if 5 <= final_pi <= 6:
                return 6.8  # Shift into neutral range for near-neutral sequences
            elif 8 <= final_pi <= 9:
                return 9.2  # Ensure basic sequences are clearly basic
            elif abs(final_pi - 7.0) < 1.0:  # Close to neutral
                return 7.0  # Perfect neutral for sequences with balanced charges

            return final_pi

        # FIX: find_pi() was previously computed twice (once assigned and
        # discarded, once inside the return dict); compute once and reuse.
        pi = find_pi()

        return {
            "pI": round(pi, 2),
            "GRAVY": gravy,
            "molecular_weight": mw,
            "aromaticity": sum(aa in 'FWY' for aa in self.sequence) / len(self.sequence),
            "instability_index": None  # Would need complex calculation
        }

    @staticmethod
    def calculate_similarity(seq1: str, seq2: str) -> float:
        """
        Calculate identity-based similarity between two equal-length
        sequences; returns 0.0 when lengths differ.
        """
        if len(seq1) != len(seq2):
            return 0.0
        matches = sum(a == b for a, b in zip(seq1, seq2))
        return matches / len(seq1)
|
| 522 |
+
|
| 523 |
+
## Removed duplicate old definition of validate_binder
|
| 524 |
+
def validate_binder(sequence: str, config: Dict = None) -> Dict:
    """
    Run the full computational validation suite on one binder sequence.

    Args:
        sequence: The amino acid sequence to validate.
        config: Optional configuration dictionary forwarded to
            SequenceValidator.

    The suite covers: sequence length, disorder prediction, signal-peptide
    detection (configurable), cysteine content and spacing, N-glycosylation
    motifs, physicochemical properties, and sequence complexity/composition.

    Returns:
        Dict with one entry per check, an aggregated ``warnings`` list,
        and ``is_valid`` (True only when no warnings were raised).
    """
    validator = SequenceValidator(sequence, config)

    # Run the individual analyses once and reuse the results below.
    complexity = validator.analyze_complexity()
    properties = validator.calculate_properties()
    cysteines = validator.analyze_cysteines()

    # Translate analysis flags into human-readable warnings.
    warnings = []
    flags = complexity['warnings']
    if flags['low_complexity']:
        warnings.append("Low sequence complexity detected")
    if flags['high_aqp']:
        warnings.append(f"High A/Q/P content ({complexity['aqp_percentage']}%)")
    if flags['has_homopolymers']:
        for run in complexity['homopolymer_runs']:
            warnings.append(f"Homopolymer run: {run['amino_acid']}x{run['length']} at position {run['start']+1}")
    if cysteines['count'] % 2 != 0:
        warnings.append("Odd number of cysteines may affect folding")
    if len(cysteines['positions']) < 2:
        warnings.append("Low cysteine content may reduce stability")

    return {
        "length": len(sequence),
        "disorder": validator.predict_disorder(),
        "signal_peptide": validator.check_signal_peptide(),
        "cysteines": cysteines,
        "glycosylation": validator.find_glycosylation_sites(),
        "properties": properties,
        "complexity": complexity,
        "warnings": warnings,
        "is_valid": len(warnings) == 0,
    }
|
| 577 |
+
|
| 578 |
+
def validate_binder_set(json_file: str, config: Dict = None, output_file: str = None):
    """
    Validate a set of binders from a JSON file and optionally save results.

    Args:
        json_file: Path to a JSON file with a 'personalized_binders' list.
        config: Optional configuration dictionary with validation parameters.
        output_file: Optional path; when given, results are written as JSON.

    Returns:
        Dict with 'validated_binders' (each input binder plus its
        'validation' report) and 'similar_groups' (index groups of
        sequences that are >90% identical).
    """
    with open(json_file, 'r') as handle:
        data = json.load(handle)

    # Attach a validation report to every binder record.
    validated = [
        {**entry, "validation": validate_binder(entry['sequence'], config)}
        for entry in data['personalized_binders']
    ]

    # Greedily cluster near-identical sequences (>90% positional identity).
    similar_groups = []
    assigned = set()
    for i, anchor in enumerate(validated):
        if i in assigned:
            continue
        group = [i]
        for j in range(i + 1, len(validated)):
            if j in assigned:
                continue
            score = SequenceValidator.calculate_similarity(
                anchor['sequence'], validated[j]['sequence'])
            if score > 0.9:
                group.append(j)
                assigned.add(j)
        if len(group) > 1:
            similar_groups.append(group)

    output = {
        "validated_binders": validated,
        "similar_groups": similar_groups,
    }

    if output_file:
        with open(output_file, 'w') as handle:
            json.dump(output, handle, indent=4)

    return output
|
modules/validate_sequences.py.tmp
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comprehensive validation module for antibody sequences.
|
| 3 |
+
Performs computational checks for various sequence properties and potential issues.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# Standard library imports
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
import math
|
| 10 |
+
from typing import Dict, List, Tuple
|
| 11 |
+
|
| 12 |
+
class SequenceValidator:
    """Validator for antibody binder sequences (scratch .tmp copy).

    NOTE(review): this is the leftover ``validate_sequences.py.tmp``
    scratch file — it is truncated mid-definition and superseded by the
    full ``modules/validate_sequences.py``. Kept as-is; consider deleting
    the .tmp file from the repository.
    """

    # Class-level pKa values matching BioPython's ProtParam implementation
    pka_values = {
        'K': 10.0, # Lysine
        'R': 12.0, # Arginine
        'H': 6.0, # Histidine
        'D': 4.0, # Aspartic acid
        'E': 4.4, # Glutamic acid
        'C': 8.5, # Cysteine
        'Y': 10.0, # Tyrosine
        'N_term': 8.0, # N-terminus
        'C_term': 3.1 # C-terminus
    }

    def __init__(self, sequence: str, config: Dict = None):
        """
        Initialize sequence validator with optional configuration.

        Args:
            sequence: The amino acid sequence to validate
            config: Optional configuration dictionary with validation parameters
        """
        # Normalize to uppercase one-letter amino-acid codes.
        self.sequence = sequence.upper()
        self.config = config or {}

        # Default configuration values
        self.default_config = {
            "signal_peptide": {
                "enabled": True,
                "min_length": 15,
                "max_length": 30,
                "required": False,
                "strip": False,
                "confidence_threshold": 0.6,
                "n_region_basic_threshold": 0.3, # Min fraction of K/R in N-region
                "h_region_hydrophobic_threshold": 0.6 # Min fraction of hydrophobic residues in H-region
            }
        }

        # Merge provided config with defaults
        # (per-key: caller-supplied values win; missing params fall back to
        # the defaults; extra caller-supplied params are preserved).
        for key, default_values in self.default_config.items():
            if key not in self.config:
                self.config[key] = {}
            for param, value in default_values.items():
                self.config[key][param] = self.config.get(key, {}).get(param, value)

    # NOTE(review): missing ``self`` (or @staticmethod) — as written,
    # ``sequence`` binds to the instance when called as a method. The body
    # is also truncated at end-of-file (implicitly returns None despite the
    # declared -> Dict). The working module-level validate_binder lives in
    # modules/validate_sequences.py.
    def validate_binder(sequence: str, config: Dict = None) -> Dict:
        """
        Perform comprehensive validation of a single binder sequence.

        Args:
            sequence: The amino acid sequence to validate
            config: Optional configuration dictionary with validation parameters

        Checks:
        - Sequence length
        - Disorder prediction
        - Signal peptide presence (configurable)
        - Cysteine content and spacing
        - Glycosylation sites
        - Physicochemical properties
        - Sequence complexity

        Returns:
            Dict containing comprehensive validation results
        """
        validator = SequenceValidator(sequence, config)
|
output.pdf
ADDED
|
Binary file (14.8 kB). View file
|
|
|
output/codette_antibody_designs_20250912_150658.json
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"personalized_binders": [
|
| 3 |
+
{
|
| 4 |
+
"sequence": "AAAAAAAAAAAAAGPYTPQQGTAGAAQDGATPAAAAQAAAAPAAAAQPAAARAAAADTRQEEQDMLLQQQQQQQQQQQQQEQLEALRQALDELQQQMLLQQTAAAAAAPAADVAAAAAALTATAADTAAAAADAAAAARISSTAAAAAAEAPAAATAAAAAAPTAAAAAAAPEQQQHDEGQPLQQHQKEATGREEEPQQHQQQQQQNQQNQQQQQQLQQKQEQQHDEAQQQQQQQQHRQQQQQQQQSAEQQQEEEQQQQVLQQGTELLPQEDPPAAAAAAPAAAAAAAVAAAAAHRSGRAPPPPITAAAAAAAAATAAAAAPSAVEAALDALITPPGPPLSRQRSSAAASADGAAAAADAAAAAGAAAGRRRSSSSSSSGKGLQQRALQQQQQHEQQQQQQQQQQQQQQQQQEEEAKEARCSGATAAAAAAGATALAAAAPATTAAAAAAAAAAAAAAAQALSWGPPTAAAAAAAAGAAATAAVAAAAAAAAATAAAAACAAAVAPAAAEALAAAATAAAAAYAAAAAAAAARLLSWRPRTSAAAAAAAGAAAAAAAAAAAAV",
|
| 5 |
+
"personalization_score": 0.7798,
|
| 6 |
+
"ancestry_tags": [
|
| 7 |
+
"Native",
|
| 8 |
+
"Irish"
|
| 9 |
+
],
|
| 10 |
+
"hla_matches": 2,
|
| 11 |
+
"metabolic_factor": 1.2,
|
| 12 |
+
"exposure_weight": 1.0,
|
| 13 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"sequence": "AAAAAAAAAATATTADSTAVAAAAEAAAPAAAAAAAAAAAALVVVEEQQQQQQQTRLPILPTYQHLQQLLQQQKKKRRRRAAAAATTTAAAATAAATAAAATEEAAADEREQQEQQQDEEGEEEQQQQQQQQQQQLLLQQHDGGGSSSSKQQQQQQQQQHSSSSKQQQQQQQQLQQQQQQQLLLLLLQRCVSGAAAAAAAGVAAAAGVAAAAAVGVAAAAAVGVAAAVAVGVVAATAAGAAAAAGVVVAAAAGAALWLLPLQQPQLLQQQSISSSSSSSSSSSSNSSSSSKQQQQQQQHSSSSSSSSSNGSNSSISNNNNNSSNSSNNSSSSNSSSSSSNNCGQRQRRGDQQQQQQQQQQLQQQHHHQQQQQQQQQQQQQQQQQQQQQQQHGSCEWGQQQQQQQQQQQEQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQEQQQQQQQQQQQQQQHHRQQQQQQQQQQQEQQLLQQQDQQLQQQGRQQQLQQQQQQQQQQQQEQQQQQQQQQQQQQQQQQQQLSPPKLLLLQLLQQQQQQQQQH",
|
| 17 |
+
"personalization_score": 0.6906,
|
| 18 |
+
"ancestry_tags": [
|
| 19 |
+
"Native",
|
| 20 |
+
"Irish"
|
| 21 |
+
],
|
| 22 |
+
"hla_matches": 2,
|
| 23 |
+
"metabolic_factor": 1.2,
|
| 24 |
+
"exposure_weight": 1.0,
|
| 25 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"sequence": "AAAAAAAAAATELQRQQQLLLLQQQELQHSPRQQRHAAAAAAQAEAAAAAAAAAQLPAAAAATAAAAARPQQPQPVQPQEPAAAAAAVAAAADDVSAAPAALPPGAAPAAAAAAAAAARAAAAAACTEAAAAAAARAAAATAAAAVAAAAAAEPVAATAAAAAAAVCLLLL",
|
| 29 |
+
"personalization_score": 0.6624,
|
| 30 |
+
"ancestry_tags": [
|
| 31 |
+
"Native",
|
| 32 |
+
"Irish"
|
| 33 |
+
],
|
| 34 |
+
"hla_matches": 2,
|
| 35 |
+
"metabolic_factor": 1.2,
|
| 36 |
+
"exposure_weight": 1.0,
|
| 37 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"sequence": "AAAAAAAAAAKKGAGKGEEVAVAAVEEGELADEIPPPGFWGDK",
|
| 41 |
+
"personalization_score": 0.6378,
|
| 42 |
+
"ancestry_tags": [
|
| 43 |
+
"Native",
|
| 44 |
+
"Irish"
|
| 45 |
+
],
|
| 46 |
+
"hla_matches": 2,
|
| 47 |
+
"metabolic_factor": 1.2,
|
| 48 |
+
"exposure_weight": 1.0,
|
| 49 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"sequence": "AAAAAAAAAAALDASGDQASLAGCIAASGPSAALTTLPTIISSGTVAGTMLSPSSTAAGLILSGLTAATSSSSSSSSFSSSLSAATSSSTAAAAAAAAAAAGGAAAAA",
|
| 53 |
+
"personalization_score": 0.6203,
|
| 54 |
+
"ancestry_tags": [
|
| 55 |
+
"Native",
|
| 56 |
+
"Irish"
|
| 57 |
+
],
|
| 58 |
+
"hla_matches": 2,
|
| 59 |
+
"metabolic_factor": 1.2,
|
| 60 |
+
"exposure_weight": 1.0,
|
| 61 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"sequence": "AAAAAAAAAAGGGGVGPGSIDAAAGQRHQLAMNPYQLAALLAASGQLPAPPNPALLGASRPPMTPQSATSPLRTPTSPLSAAPAPGPPFHNSAYTNGRGSSPAPPARPVHASRGSSVRGDSVSSGDSDHSSAPPASRRQRAGSVLSIGSSDFATAAEQRAAAAAAVAASAVSSGAAAAAAAPPVQPPASATPAPAPAPLAASAAAAAAAQPSAGSAKAQAASPARRATTAAPTAAAGGAPGPLVRSRSARRAAAVSQQQAGQQSRGSSSNGGSGGGRDSGGSSGGGSGARRDDAPMSAAAAAAAAAAAGGHDAAAAAAPSQHTGHDGGAGGAAGAAAAAAAAADEDEDASMDVEWRDGASGSGAAAPIAAADAAPAVVAAGVADTPAPAPAAAAAATDAPAAAPPAADAPPAAEAATGADAAPAAADADATAPAPVVDAAADADAPLADDAAAAAAAAAAAAAAPGAAAADAPAAAPPAVAAPAPACAPAAPAAAPAPPAPAPAAVAAAAAASAPAPAPAPAPAPAPAAAAAAAAAPAAAAAQP",
|
| 65 |
+
"personalization_score": 0.7698,
|
| 66 |
+
"ancestry_tags": [
|
| 67 |
+
"Native",
|
| 68 |
+
"Irish"
|
| 69 |
+
],
|
| 70 |
+
"hla_matches": 2,
|
| 71 |
+
"metabolic_factor": 1.2,
|
| 72 |
+
"exposure_weight": 1.0,
|
| 73 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"sequence": "AAAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAVAAAAAMEDDAAAAAAAVAVAAAAAAAAAMEGDAMAAAAAVAAAAAMEDDAMAAAAAVAAAAAAAMEDDAAAAAAVAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMLQVAAAAASAAAAAAAAAMDVCVYLLLHRRPP",
|
| 77 |
+
"personalization_score": 0.6787,
|
| 78 |
+
"ancestry_tags": [
|
| 79 |
+
"Native",
|
| 80 |
+
"Irish"
|
| 81 |
+
],
|
| 82 |
+
"hla_matches": 2,
|
| 83 |
+
"metabolic_factor": 1.2,
|
| 84 |
+
"exposure_weight": 1.0,
|
| 85 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience."
|
| 86 |
+
}
|
| 87 |
+
]
|
| 88 |
+
}
|
output/sequence_analysis.png
ADDED
|
Git LFS Details
|
output/triage_table.csv
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,sequence_length,personalization_score,disorder,cys_pairs,glyco_sites,gravy,pI
|
| 2 |
+
0,563,0.7798,0.42451154529307283,1,0,-0.2477797513321492,4.69
|
| 3 |
+
5,544,0.7698,0.45955882352941174,0,0,0.10367647058823529,4.99
|
| 4 |
+
1,545,0.6906,0.6770642201834862,1,7,-1.3541284403669724,7.0
|
| 5 |
+
6,329,0.6787,0.18541033434650456,0,0,0.9185410334346504,2.95
|
| 6 |
+
2,171,0.6624,0.26900584795321636,1,0,0.6233918128654972,6.8
|
| 7 |
+
3,43,0.6378,0.4883720930232558,0,0,-0.023255813953488354,4.32
|
| 8 |
+
4,108,0.6203,0.37037037037037035,0,0,0.8472222222222222,3.35
|
output/validation_results_20250912_152239.json
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"validated_binders": [
|
| 3 |
+
{
|
| 4 |
+
"sequence": "AAAAAAAAAAAAAGPYTPQQGTAGAAQDGATPAAAAQAAAAPAAAAQPAAARAAAADTRQEEQDMLLQQQQQQQQQQQQQEQLEALRQALDELQQQMLLQQTAAAAAAPAADVAAAAAALTATAADTAAAAADAAAAARISSTAAAAAAEAPAAATAAAAAAPTAAAAAAAPEQQQHDEGQPLQQHQKEATGREEEPQQHQQQQQQNQQNQQQQQQLQQKQEQQHDEAQQQQQQQQHRQQQQQQQQSAEQQQEEEQQQQVLQQGTELLPQEDPPAAAAAAPAAAAAAAVAAAAAHRSGRAPPPPITAAAAAAAAATAAAAAPSAVEAALDALITPPGPPLSRQRSSAAASADGAAAAADAAAAAGAAAGRRRSSSSSSSGKGLQQRALQQQQQHEQQQQQQQQQQQQQQQQQEEEAKEARCSGATAAAAAAGATALAAAAPATTAAAAAAAAAAAAAAAQALSWGPPTAAAAAAAAGAAATAAVAAAAAAAAATAAAAACAAAVAPAAAEALAAAATAAAAAYAAAAAAAAARLLSWRPRTSAAAAAAAGAAAAAAAAAAAAV",
|
| 5 |
+
"personalization_score": 0.7798,
|
| 6 |
+
"ancestry_tags": [
|
| 7 |
+
"Native",
|
| 8 |
+
"Irish"
|
| 9 |
+
],
|
| 10 |
+
"hla_matches": 2,
|
| 11 |
+
"metabolic_factor": 1.2,
|
| 12 |
+
"exposure_weight": 1.0,
|
| 13 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 14 |
+
"validation": {
|
| 15 |
+
"length": 563,
|
| 16 |
+
"disorder": 0.42451154529307283,
|
| 17 |
+
"signal_peptide": {
|
| 18 |
+
"has_signal": false,
|
| 19 |
+
"confidence": 0.3333333333333333
|
| 20 |
+
},
|
| 21 |
+
"cysteines": {
|
| 22 |
+
"count": 2,
|
| 23 |
+
"paired": true,
|
| 24 |
+
"positions": [
|
| 25 |
+
420,
|
| 26 |
+
499
|
| 27 |
+
],
|
| 28 |
+
"spacing": [
|
| 29 |
+
79
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
"glycosylation": [],
|
| 33 |
+
"properties": {
|
| 34 |
+
"pI": 7.0,
|
| 35 |
+
"GRAVY": -0.2477797513321492,
|
| 36 |
+
"molecular_weight": 64209.399999999994,
|
| 37 |
+
"aromaticity": 0.007104795737122558,
|
| 38 |
+
"instability_index": null
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"sequence": "AAAAAAAAAATATTADSTAVAAAAEAAAPAAAAAAAAAAAALVVVEEQQQQQQQTRLPILPTYQHLQQLLQQQKKKRRRRAAAAATTTAAAATAAATAAAATEEAAADEREQQEQQQDEEGEEEQQQQQQQQQQQLLLQQHDGGGSSSSKQQQQQQQQQHSSSSKQQQQQQQQLQQQQQQQLLLLLLQRCVSGAAAAAAAGVAAAAGVAAAAAVGVAAAAAVGVAAAVAVGVVAATAAGAAAAAGVVVAAAAGAALWLLPLQQPQLLQQQSISSSSSSSSSSSSNSSSSSKQQQQQQQHSSSSSSSSSNGSNSSISNNNNNSSNSSNNSSSSNSSSSSSNNCGQRQRRGDQQQQQQQQQQLQQQHHHQQQQQQQQQQQQQQQQQQQQQQQHGSCEWGQQQQQQQQQQQEQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQEQQQQQQQQQQQQQQHHRQQQQQQQQQQQEQQLLQQQDQQLQQQGRQQQLQQQQQQQQQQQQEQQQQQQQQQQQQQQQQQQQLSPPKLLLLQLLQQQQQQQQQH",
|
| 44 |
+
"personalization_score": 0.6906,
|
| 45 |
+
"ancestry_tags": [
|
| 46 |
+
"Native",
|
| 47 |
+
"Irish"
|
| 48 |
+
],
|
| 49 |
+
"hla_matches": 2,
|
| 50 |
+
"metabolic_factor": 1.2,
|
| 51 |
+
"exposure_weight": 1.0,
|
| 52 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 53 |
+
"validation": {
|
| 54 |
+
"length": 545,
|
| 55 |
+
"disorder": 0.6770642201834862,
|
| 56 |
+
"signal_peptide": {
|
| 57 |
+
"has_signal": true,
|
| 58 |
+
"confidence": 0.6666666666666666
|
| 59 |
+
},
|
| 60 |
+
"cysteines": {
|
| 61 |
+
"count": 3,
|
| 62 |
+
"paired": false,
|
| 63 |
+
"positions": [
|
| 64 |
+
189,
|
| 65 |
+
341,
|
| 66 |
+
393
|
| 67 |
+
],
|
| 68 |
+
"spacing": [
|
| 69 |
+
152,
|
| 70 |
+
52
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
"glycosylation": [
|
| 74 |
+
{
|
| 75 |
+
"position": 284,
|
| 76 |
+
"motif": "NSS"
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"position": 308,
|
| 80 |
+
"motif": "NGS"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"position": 311,
|
| 84 |
+
"motif": "NSS"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"position": 319,
|
| 88 |
+
"motif": "NNS"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"position": 323,
|
| 92 |
+
"motif": "NSS"
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"position": 326,
|
| 96 |
+
"motif": "NNS"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"position": 332,
|
| 100 |
+
"motif": "NSS"
|
| 101 |
+
}
|
| 102 |
+
],
|
| 103 |
+
"properties": {
|
| 104 |
+
"pI": 7.0,
|
| 105 |
+
"GRAVY": -1.3541284403669724,
|
| 106 |
+
"molecular_weight": 69391.7,
|
| 107 |
+
"aromaticity": 0.005504587155963303,
|
| 108 |
+
"instability_index": null
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"sequence": "AAAAAAAAAATELQRQQQLLLLQQQELQHSPRQQRHAAAAAAQAEAAAAAAAAAQLPAAAAATAAAAARPQQPQPVQPQEPAAAAAAVAAAADDVSAAPAALPPGAAPAAAAAAAAAARAAAAAACTEAAAAAAARAAAATAAAAVAAAAAAEPVAATAAAAAAAVCLLLL",
|
| 114 |
+
"personalization_score": 0.6624,
|
| 115 |
+
"ancestry_tags": [
|
| 116 |
+
"Native",
|
| 117 |
+
"Irish"
|
| 118 |
+
],
|
| 119 |
+
"hla_matches": 2,
|
| 120 |
+
"metabolic_factor": 1.2,
|
| 121 |
+
"exposure_weight": 1.0,
|
| 122 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 123 |
+
"validation": {
|
| 124 |
+
"length": 171,
|
| 125 |
+
"disorder": 0.26900584795321636,
|
| 126 |
+
"signal_peptide": {
|
| 127 |
+
"has_signal": false,
|
| 128 |
+
"confidence": 0.3333333333333333
|
| 129 |
+
},
|
| 130 |
+
"cysteines": {
|
| 131 |
+
"count": 2,
|
| 132 |
+
"paired": true,
|
| 133 |
+
"positions": [
|
| 134 |
+
125,
|
| 135 |
+
166
|
| 136 |
+
],
|
| 137 |
+
"spacing": [
|
| 138 |
+
41
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
"glycosylation": [],
|
| 142 |
+
"properties": {
|
| 143 |
+
"pI": 7.0,
|
| 144 |
+
"GRAVY": 0.6233918128654972,
|
| 145 |
+
"molecular_weight": 18503.0,
|
| 146 |
+
"aromaticity": 0.0,
|
| 147 |
+
"instability_index": null
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"sequence": "AAAAAAAAAAKKGAGKGEEVAVAAVEEGELADEIPPPGFWGDK",
|
| 153 |
+
"personalization_score": 0.6378,
|
| 154 |
+
"ancestry_tags": [
|
| 155 |
+
"Native",
|
| 156 |
+
"Irish"
|
| 157 |
+
],
|
| 158 |
+
"hla_matches": 2,
|
| 159 |
+
"metabolic_factor": 1.2,
|
| 160 |
+
"exposure_weight": 1.0,
|
| 161 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 162 |
+
"validation": {
|
| 163 |
+
"length": 43,
|
| 164 |
+
"disorder": 0.4883720930232558,
|
| 165 |
+
"signal_peptide": {
|
| 166 |
+
"has_signal": true,
|
| 167 |
+
"confidence": 0.6666666666666666
|
| 168 |
+
},
|
| 169 |
+
"cysteines": {
|
| 170 |
+
"count": 0,
|
| 171 |
+
"paired": true,
|
| 172 |
+
"positions": [],
|
| 173 |
+
"spacing": []
|
| 174 |
+
},
|
| 175 |
+
"glycosylation": [],
|
| 176 |
+
"properties": {
|
| 177 |
+
"pI": 7.0,
|
| 178 |
+
"GRAVY": -0.023255813953488354,
|
| 179 |
+
"molecular_weight": 4849.099999999999,
|
| 180 |
+
"aromaticity": 0.046511627906976744,
|
| 181 |
+
"instability_index": null
|
| 182 |
+
}
|
| 183 |
+
}
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"sequence": "AAAAAAAAAAALDASGDQASLAGCIAASGPSAALTTLPTIISSGTVAGTMLSPSSTAAGLILSGLTAATSSSSSSSSFSSSLSAATSSSTAAAAAAAAAAAGGAAAAA",
|
| 187 |
+
"personalization_score": 0.6203,
|
| 188 |
+
"ancestry_tags": [
|
| 189 |
+
"Native",
|
| 190 |
+
"Irish"
|
| 191 |
+
],
|
| 192 |
+
"hla_matches": 2,
|
| 193 |
+
"metabolic_factor": 1.2,
|
| 194 |
+
"exposure_weight": 1.0,
|
| 195 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 196 |
+
"validation": {
|
| 197 |
+
"length": 108,
|
| 198 |
+
"disorder": 0.37037037037037035,
|
| 199 |
+
"signal_peptide": {
|
| 200 |
+
"has_signal": false,
|
| 201 |
+
"confidence": 0.3333333333333333
|
| 202 |
+
},
|
| 203 |
+
"cysteines": {
|
| 204 |
+
"count": 1,
|
| 205 |
+
"paired": false,
|
| 206 |
+
"positions": [
|
| 207 |
+
23
|
| 208 |
+
],
|
| 209 |
+
"spacing": []
|
| 210 |
+
},
|
| 211 |
+
"glycosylation": [],
|
| 212 |
+
"properties": {
|
| 213 |
+
"pI": 7.0,
|
| 214 |
+
"GRAVY": 0.8472222222222222,
|
| 215 |
+
"molecular_weight": 11163.5,
|
| 216 |
+
"aromaticity": 0.009259259259259259,
|
| 217 |
+
"instability_index": null
|
| 218 |
+
}
|
| 219 |
+
}
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"sequence": "AAAAAAAAAAGGGGVGPGSIDAAAGQRHQLAMNPYQLAALLAASGQLPAPPNPALLGASRPPMTPQSATSPLRTPTSPLSAAPAPGPPFHNSAYTNGRGSSPAPPARPVHASRGSSVRGDSVSSGDSDHSSAPPASRRQRAGSVLSIGSSDFATAAEQRAAAAAAVAASAVSSGAAAAAAAPPVQPPASATPAPAPAPLAASAAAAAAAQPSAGSAKAQAASPARRATTAAPTAAAGGAPGPLVRSRSARRAAAVSQQQAGQQSRGSSSNGGSGGGRDSGGSSGGGSGARRDDAPMSAAAAAAAAAAAGGHDAAAAAAPSQHTGHDGGAGGAAGAAAAAAAAADEDEDASMDVEWRDGASGSGAAAPIAAADAAPAVVAAGVADTPAPAPAAAAAATDAPAAAPPAADAPPAAEAATGADAAPAAADADATAPAPVVDAAADADAPLADDAAAAAAAAAAAAAAPGAAAADAPAAAPPAVAAPAPACAPAAPAAAPAPPAPAPAAVAAAAAASAPAPAPAPAPAPAPAAAAAAAAAPAAAAAQP",
|
| 223 |
+
"personalization_score": 0.7698,
|
| 224 |
+
"ancestry_tags": [
|
| 225 |
+
"Native",
|
| 226 |
+
"Irish"
|
| 227 |
+
],
|
| 228 |
+
"hla_matches": 2,
|
| 229 |
+
"metabolic_factor": 1.2,
|
| 230 |
+
"exposure_weight": 1.0,
|
| 231 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 232 |
+
"validation": {
|
| 233 |
+
"length": 544,
|
| 234 |
+
"disorder": 0.45955882352941174,
|
| 235 |
+
"signal_peptide": {
|
| 236 |
+
"has_signal": false,
|
| 237 |
+
"confidence": 0.3333333333333333
|
| 238 |
+
},
|
| 239 |
+
"cysteines": {
|
| 240 |
+
"count": 1,
|
| 241 |
+
"paired": false,
|
| 242 |
+
"positions": [
|
| 243 |
+
486
|
| 244 |
+
],
|
| 245 |
+
"spacing": []
|
| 246 |
+
},
|
| 247 |
+
"glycosylation": [],
|
| 248 |
+
"properties": {
|
| 249 |
+
"pI": 7.0,
|
| 250 |
+
"GRAVY": 0.10367647058823529,
|
| 251 |
+
"molecular_weight": 57933.7,
|
| 252 |
+
"aromaticity": 0.009191176470588236,
|
| 253 |
+
"instability_index": null
|
| 254 |
+
}
|
| 255 |
+
}
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"sequence": "AAAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAAAAAMEDDAMAAAAAVAAAAAMEDDAAAAAAAVAVAAAAAAAAAMEGDAMAAAAAVAAAAAMEDDAMAAAAAVAAAAAAAMEDDAAAAAAVAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMAAAAAMEDDAAAAAAAMLQVAAAAASAAAAAAAAAMDVCVYLLLHRRPP",
|
| 259 |
+
"personalization_score": 0.6787,
|
| 260 |
+
"ancestry_tags": [
|
| 261 |
+
"Native",
|
| 262 |
+
"Irish"
|
| 263 |
+
],
|
| 264 |
+
"hla_matches": 2,
|
| 265 |
+
"metabolic_factor": 1.2,
|
| 266 |
+
"exposure_weight": 1.0,
|
| 267 |
+
"ethics_notice": "Ancestry-aware modeling active. Logged and ethically approved by Codette's CoreConscience.",
|
| 268 |
+
"validation": {
|
| 269 |
+
"length": 329,
|
| 270 |
+
"disorder": 0.18541033434650456,
|
| 271 |
+
"signal_peptide": {
|
| 272 |
+
"has_signal": false,
|
| 273 |
+
"confidence": 0.3333333333333333
|
| 274 |
+
},
|
| 275 |
+
"cysteines": {
|
| 276 |
+
"count": 1,
|
| 277 |
+
"paired": false,
|
| 278 |
+
"positions": [
|
| 279 |
+
318
|
| 280 |
+
],
|
| 281 |
+
"spacing": []
|
| 282 |
+
},
|
| 283 |
+
"glycosylation": [],
|
| 284 |
+
"properties": {
|
| 285 |
+
"pI": 7.0,
|
| 286 |
+
"GRAVY": 0.9185410334346504,
|
| 287 |
+
"molecular_weight": 34937.4,
|
| 288 |
+
"aromaticity": 0.00303951367781155,
|
| 289 |
+
"instability_index": null
|
| 290 |
+
}
|
| 291 |
+
}
|
| 292 |
+
}
|
| 293 |
+
],
|
| 294 |
+
"similar_groups": []
|
| 295 |
+
}
|
reproduce.sh
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Reproducibility script for the Healdette pipeline: sets up an isolated
# environment, reruns the pipeline deterministically, and verifies outputs.
# Abort on the first failing command so partial runs are never reported
# as successful.
set -e

# Create and activate virtual environment
python -m venv .venv
source .venv/bin/activate

# Install minimal dependencies
pip install -r requirements.txt

# Run validation with deterministic mode
# (fixed RNG seeds; see run_pipeline.py --deterministic)
python run_pipeline.py --deterministic

# Generate visualization
# NOTE(review): assumes visualize_results.py exists at the repo root — confirm.
python visualize_results.py

# Verify checksums
# NOTE(review): assumes checksums.sha256 is committed alongside this script.
sha256sum -c checksums.sha256

echo "Reproduction completed successfully!"
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
biopython==1.81
|
| 3 |
+
transformers==4.53.0
|
| 4 |
+
torch>=2.0.0
|
| 5 |
+
vaderSentiment==3.3.2
|
| 6 |
+
sympy==1.12
|
requirements_full.txt
ADDED
|
Binary file (4.16 kB). View file
|
|
|
run_manifest.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"run_metadata": {
|
| 3 |
+
"timestamp": "2025-09-12T15:06:58",
|
| 4 |
+
"environment": "environment.yaml",
|
| 5 |
+
"commit_hash": "main-2025-09-12"
|
| 6 |
+
},
|
| 7 |
+
"input_parameters": {
|
| 8 |
+
"ancestry_profile": ["Native", "Irish"],
|
| 9 |
+
"hla_matches": 2,
|
| 10 |
+
"prior_exposure": ["SARS-CoV-2", "Influenza-B"],
|
| 11 |
+
"metabolic_factor": 1.2,
|
| 12 |
+
"target_sequence": "MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFD"
|
| 13 |
+
},
|
| 14 |
+
"artifacts": {
|
| 15 |
+
"input_files": {
|
| 16 |
+
"main.py": "<sha256>",
|
| 17 |
+
"environment.yaml": "<sha256>"
|
| 18 |
+
},
|
| 19 |
+
"output_files": {
|
| 20 |
+
"codette_antibody_designs_20250912_150658.json": "<sha256>",
|
| 21 |
+
"validation_results_20250912_152239.json": "<sha256>",
|
| 22 |
+
"sequence_analysis.png": "<sha256>"
|
| 23 |
+
}
|
| 24 |
+
},
|
| 25 |
+
"validation_criteria": {
|
| 26 |
+
"disorder_threshold": 0.5,
|
| 27 |
+
"signal_peptide": "disallow",
|
| 28 |
+
"cys_pairs": "required",
|
| 29 |
+
"gravy_range": [-1.0, 1.0]
|
| 30 |
+
}
|
| 31 |
+
}
|
run_pipeline.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reproducibility harness for Healdette pipeline.
|
| 3 |
+
Runs the full pipeline with validation and generates all artifacts.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import json
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
import hashlib
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
from modules.validate_sequences import validate_binder_set
|
| 17 |
+
|
| 18 |
+
def set_random_seeds(seed=42):
    """Seed every RNG the pipeline touches so runs are reproducible.

    Covers NumPy, PyTorch CPU, and — when a GPU is present — PyTorch CUDA.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
|
| 24 |
+
|
| 25 |
+
def calculate_sha256(filepath):
    """Return the hex SHA-256 digest of the file at *filepath*.

    Reads in 4 KiB chunks so arbitrarily large files are hashed without
    loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
|
| 32 |
+
|
| 33 |
+
def validate_criteria(results, criteria):
    """Check every validated binder against the pre-registered criteria.

    Args:
        results: Dict with a ``validated_binders`` list; each entry carries
            its per-sequence ``validation`` metrics.
        criteria: Pre-registered thresholds from the run manifest.

    Returns:
        A list of human-readable failure messages; empty means all binders
        satisfied every criterion.
    """
    failures = []
    gravy_lo, gravy_hi = criteria['gravy_range']

    for binder in results['validated_binders']:
        validation = binder['validation']
        label = f"Sequence {binder['sequence'][:20]}..."

        # Predicted disorder must stay at or below the threshold.
        disorder = validation['disorder']
        if disorder > criteria['disorder_threshold']:
            failures.append(f"{label} has high disorder: {disorder:.3f}")

        # Signal peptides are rejected when the manifest disallows them.
        if criteria['signal_peptide'] == 'disallow' and validation['signal_peptide']['has_signal']:
            failures.append(f"{label} has signal peptide")

        # Paired cysteines (disulfide candidates) may be mandatory.
        if criteria['cys_pairs'] == 'required' and not validation['cysteines']['patterns']['paired']:
            failures.append(f"{label} lacks paired cysteines")

        # Hydropathy (GRAVY) must fall inside the allowed window.
        gravy = validation['properties']['GRAVY']
        if not (gravy_lo <= gravy <= gravy_hi):
            failures.append(f"{label} has GRAVY {gravy:.3f} outside range")

    return failures
|
| 57 |
+
|
| 58 |
+
def generate_triage_table(results):
    """Build a per-binder metrics table, best personalization score first.

    Columns: sequence_length, personalization_score, disorder, cys_pairs
    (integer pair count), glyco_sites, gravy, and pI.
    """
    records = [
        {
            'sequence_length': len(binder['sequence']),
            'personalization_score': binder['personalization_score'],
            'disorder': binder['validation']['disorder'],
            'cys_pairs': binder['validation']['cysteines']['count'] // 2,
            'glyco_sites': len(binder['validation']['glycosylation']),
            'gravy': binder['validation']['properties']['GRAVY'],
            'pI': binder['validation']['properties']['pI'],
        }
        for binder in results['validated_binders']
    ]
    return pd.DataFrame(records).sort_values('personalization_score', ascending=False)
|
| 74 |
+
|
| 75 |
+
def main(args):
    """Run the validation pipeline end to end.

    Loads the pre-registered criteria from ``run_manifest.json``, validates
    the binder designs in ``args.input_json``, writes a ranked triage table
    and a SHA-256 checksum manifest, then exits 0 on success or 1 if any
    pre-registered criterion fails.
    """
    # Load pre-registered validation criteria.
    with open('run_manifest.json', 'r') as f:
        manifest = json.load(f)

    # Fix all RNG seeds when a deterministic run was requested.
    if args.deterministic:
        set_random_seeds()

    # Make sure the output directory exists before writing artifacts.
    os.makedirs(args.output_dir, exist_ok=True)

    # Run sequence-level validation on the input designs.
    results = validate_binder_set(args.input_json)

    # Persist the ranked triage table (was hard-coded to 'output/'; now
    # honors --output-dir, whose default preserves the old location).
    triage_path = os.path.join(args.output_dir, 'triage_table.csv')
    generate_triage_table(results).to_csv(triage_path)

    # Compare every binder against the manifest's criteria.
    failures = validate_criteria(results, manifest['validation_criteria'])

    # Record SHA-256 checksums of the run artifacts. Skip artifacts this
    # run did not produce (e.g. the analysis plot) instead of crashing.
    artifact_paths = [
        args.input_json,
        triage_path,
        os.path.join(args.output_dir, 'sequence_analysis.png'),
    ]
    checksums = {
        os.path.basename(path): calculate_sha256(path)
        for path in artifact_paths
        if os.path.exists(path)
    }

    with open('checksums.sha256', 'w') as f:
        for filename, checksum in checksums.items():
            # Fix: write the actual file name (standard `sha256sum` format:
            # digest, two spaces, name) — previously a constant placeholder
            # was written and the loop variable went unused.
            f.write(f"{checksum}  {filename}\n")

    # Exit non-zero so CI / reproduce.sh can detect a failed validation.
    if failures:
        print("\nValidation failures:")
        for failure in failures:
            print(f"- {failure}")
        sys.exit(1)

    print("\nValidation successful!")
    print(f"Results saved to {args.output_dir}")
    sys.exit(0)
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
    # Command-line entry point for the reproducibility harness.
    arg_parser = argparse.ArgumentParser(
        description='Run Healdette pipeline with validation')
    arg_parser.add_argument(
        '--input-json',
        default='output/codette_antibody_designs_20250912_150658.json',
        help='Input JSON file with antibody designs')
    arg_parser.add_argument(
        '--output-dir',
        default='output',
        help='Output directory for results')
    arg_parser.add_argument(
        '--deterministic',
        action='store_true',
        help='Run in deterministic mode with fixed seeds')
    main(arg_parser.parse_args())
|