Upload 33 files
Browse files- .gitignore +64 -0
- ARCHITECTURE.md +256 -0
- CHANGELOG.md +182 -0
- GRAPHQL_EXAMPLES.md +258 -0
- LICENSE +36 -0
- MODEL_CARD.md +373 -0
- PROJECT_SUMMARY.md +234 -0
- QUICKSTART.md +183 -0
- README.md +172 -3
- USER_GUIDE.md +419 -0
- backend/__init__.py +5 -0
- backend/api/__init__.py +7 -0
- backend/api/main.py +317 -0
- backend/boinc/__init__.py +8 -0
- backend/boinc/client.py +262 -0
- backend/gdc/__init__.py +8 -0
- backend/gdc/client.py +365 -0
- backend/neo4j/__init__.py +25 -0
- backend/neo4j/data_importer.py +152 -0
- backend/neo4j/db_manager.py +277 -0
- backend/neo4j/graphql_schema.py +198 -0
- backend/pipeline/__init__.py +18 -0
- backend/pipeline/blast_runner.py +274 -0
- backend/pipeline/fastq_processor.py +249 -0
- backend/pipeline/variant_caller.py +208 -0
- config.yml +66 -0
- docker-compose.yml +29 -0
- frontend/index.html +563 -0
- requirements.txt +51 -0
- run.py +168 -0
- setup.ps1 +81 -0
- setup.sh +75 -0
- test_cancer_at_home.py +178 -0
.gitignore
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
env/
|
| 27 |
+
|
| 28 |
+
# Neo4j data
|
| 29 |
+
neo4j_data/
|
| 30 |
+
|
| 31 |
+
# Downloaded data
|
| 32 |
+
data/gdc/*
|
| 33 |
+
data/boinc/*
|
| 34 |
+
data/cache/*
|
| 35 |
+
data/processed/*
|
| 36 |
+
|
| 37 |
+
# Logs
|
| 38 |
+
logs/
|
| 39 |
+
*.log
|
| 40 |
+
|
| 41 |
+
# IDE
|
| 42 |
+
.vscode/
|
| 43 |
+
.idea/
|
| 44 |
+
*.swp
|
| 45 |
+
*.swo
|
| 46 |
+
*~
|
| 47 |
+
|
| 48 |
+
# OS
|
| 49 |
+
.DS_Store
|
| 50 |
+
Thumbs.db
|
| 51 |
+
|
| 52 |
+
# Config overrides
|
| 53 |
+
config.local.yml
|
| 54 |
+
|
| 55 |
+
# Test coverage
|
| 56 |
+
htmlcov/
|
| 57 |
+
.coverage
|
| 58 |
+
.pytest_cache/
|
| 59 |
+
|
| 60 |
+
# Jupyter
|
| 61 |
+
.ipynb_checkpoints/
|
| 62 |
+
|
| 63 |
+
# Docker
|
| 64 |
+
.dockerignore
|
ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cancer@Home v2 - Architecture Diagram
|
| 2 |
+
|
| 3 |
+
## System Architecture
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
┌─────────────────────────────────────────────────────────────────────────┐
|
| 7 |
+
│ WEB BROWSER │
|
| 8 |
+
│ http://localhost:5000 │
|
| 9 |
+
└────────────────────────────┬────────────────────────────────────────────┘
|
| 10 |
+
│
|
| 11 |
+
│ HTTP/WebSocket
|
| 12 |
+
▼
|
| 13 |
+
┌─────────────────────────────────────────────────────────────────────────┐
|
| 14 |
+
│ FRONTEND (HTML5/CSS3/JS) │
|
| 15 |
+
│ ┌──────────┬──────────┬──────────┬──────────┬──────────────────────┐ │
|
| 16 |
+
│ │Dashboard │ Neo4j │ BOINC │ GDC │ Pipeline │ │
|
| 17 |
+
│ │ View │ Viz │ Tasks │ Data │ Tools │ │
|
| 18 |
+
│ └──────────┴──────────┴──────────┴──────────┴──────────────────────┘ │
|
| 19 |
+
│ │
|
| 20 |
+
│ Technologies: D3.js, Chart.js, Vanilla JavaScript │
|
| 21 |
+
└────────────────────────────┬────────────────────────────────────────────┘
|
| 22 |
+
│
|
| 23 |
+
│ REST API + GraphQL
|
| 24 |
+
▼
|
| 25 |
+
┌─────────────────────────────────────────────────────────────────────────┐
|
| 26 |
+
│ BACKEND (FastAPI + Python) │
|
| 27 |
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
| 28 |
+
│ │ API Layer │ │
|
| 29 |
+
│ │ • REST Endpoints (/api/*) │ │
|
| 30 |
+
│ │ • GraphQL Endpoint (/graphql) │ │
|
| 31 |
+
│ │ • WebSocket Support │ │
|
| 32 |
+
│ │ • Swagger Documentation (/docs) │ │
|
| 33 |
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
| 34 |
+
│ │ │
|
| 35 |
+
│ │ Python Modules │
|
| 36 |
+
│ ▼ │
|
| 37 |
+
│ ┌──────────┬──────────┬──────────┬──────────┬─────────────────────┐ │
|
| 38 |
+
│ │ BOINC │ GDC │ Neo4j │ Pipeline │ Utilities │ │
|
| 39 |
+
│ │ Client │ Client │ DB │ Tools │ │ │
|
| 40 |
+
│ └──────────┴──────────┴──────────┴──────────┴─────────────────────┘ │
|
| 41 |
+
└───────┬──────────┬──────────┬──────────┬────────────────────────────────┘
|
| 42 |
+
│ │ │ │
|
| 43 |
+
│ │ │ │
|
| 44 |
+
▼ ▼ ▼ ▼
|
| 45 |
+
┌─────────────────────────────────────────────────────────────────────────┐
|
| 46 |
+
│ DATA & SERVICES LAYER │
|
| 47 |
+
│ │
|
| 48 |
+
│ ┌────────────────────┐ ┌────────────────────┐ ┌──────────────────┐ │
|
| 49 |
+
│ │ Neo4j Graph │ │ BOINC Server │ │ GDC Portal │ │
|
| 50 |
+
│ │ Database │ │ (Distributed) │ │ (External) │ │
|
| 51 |
+
│ │ │ │ │ │ │ │
|
| 52 |
+
│ │ Port: 7687 (Bolt) │ │ Local/Remote │ │ api.gdc.cancer │ │
|
| 53 |
+
│ │ 7474 (HTTP) │ │ Task Processing │ │ .gov │ │
|
| 54 |
+
│ │ │ │ │ │ │ │
|
| 55 |
+
│ │ • Genes │ │ • Variant Calling │ │ • TCGA Data │ │
|
| 56 |
+
│ │ • Mutations │ │ • BLAST Search │ │ • TARGET Data │ │
|
| 57 |
+
│ │ • Patients │ │ • Alignment │ │ • Clinical Data │ │
|
| 58 |
+
│ │ • Cancer Types │ │ • Annotation │ │ • Genomic Files │ │
|
| 59 |
+
│ └────────────────────┘ └────────────────────┘ └──────────────────┘ │
|
| 60 |
+
│ │
|
| 61 |
+
│ ┌────────────────────────────────────────────────────────────────────┐│
|
| 62 |
+
│ │ Bioinformatics Tools (Local) ││
|
| 63 |
+
│ │ ││
|
| 64 |
+
│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ ││
|
| 65 |
+
│ │ │ FASTQ │ │ BLAST │ │ Variant Caller │ ││
|
| 66 |
+
│ │ │ Processor │ │ Runner │ │ │ ││
|
| 67 |
+
│ │ │ │ │ │ │ │ ││
|
| 68 |
+
│ │ │ • QC │ │ • BLASTN │ │ • VCF Generation │ ││
|
| 69 |
+
│ │ │ • Filtering │ │ • BLASTP │ │ • Annotation │ ││
|
| 70 |
+
│ │ │ • Trimming │ │ • Parsing │ │ • TMB Calculation │ ││
|
| 71 |
+
│ │ └──────────────┘ └──────────────┘ └──────────────────────┘ ││
|
| 72 |
+
│ └────────────────────────────────────────────────────────────────────┘│
|
| 73 |
+
└─────────────────────────────────────────────────────────────────────────┘
|
| 74 |
+
│
|
| 75 |
+
▼
|
| 76 |
+
┌─────────────────────────────────────────────────────────────────────────┐
|
| 77 |
+
│ FILE STORAGE │
|
| 78 |
+
│ │
|
| 79 |
+
│ data/ │
|
| 80 |
+
│ ├── gdc/ # Downloaded GDC files │
|
| 81 |
+
│ ├── boinc/ # BOINC task data │
|
| 82 |
+
│ ├── processed/ # Analysis results │
|
| 83 |
+
│ │ ├── fastq/ │
|
| 84 |
+
│ │ ├── blast/ │
|
| 85 |
+
│ │ └── variants/ │
|
| 86 |
+
│ └── cache/ # Temporary files │
|
| 87 |
+
│ │
|
| 88 |
+
│ logs/ # Application logs │
|
| 89 |
+
└─────────────────────────────────────────────────────────────────────────┘
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## Data Flow Diagram
|
| 93 |
+
|
| 94 |
+
```
|
| 95 |
+
┌──────────────┐
|
| 96 |
+
│ User │
|
| 97 |
+
│ Browser │
|
| 98 |
+
└──────┬───────┘
|
| 99 |
+
│ 1. Request
|
| 100 |
+
▼
|
| 101 |
+
┌──────────────────────────────────┐
|
| 102 |
+
│ Dashboard │
|
| 103 |
+
│ (View Gene/Mutation Data) │
|
| 104 |
+
└──────┬───────────────────────────┘
|
| 105 |
+
│ 2. GraphQL Query
|
| 106 |
+
▼
|
| 107 |
+
┌──────────────────────────────────┐
|
| 108 |
+
│ FastAPI Backend │
|
| 109 |
+
│ - Parse Query │
|
| 110 |
+
│ - Validate Request │
|
| 111 |
+
└──────┬───────────────────────────┘
|
| 112 |
+
│ 3. Cypher Query
|
| 113 |
+
▼
|
| 114 |
+
┌──────────────────────────────────┐
|
| 115 |
+
│ Neo4j Database │
|
| 116 |
+
│ - Execute Graph Query │
|
| 117 |
+
│ - Traverse Relationships │
|
| 118 |
+
│ - Aggregate Results │
|
| 119 |
+
└──────┬───────────────────────────┘
|
| 120 |
+
│ 4. Graph Data
|
| 121 |
+
▼
|
| 122 |
+
┌──────────────────────────────────┐
|
| 123 |
+
│ GraphQL Resolver │
|
| 124 |
+
│ - Transform Data │
|
| 125 |
+
│ - Format Response │
|
| 126 |
+
└──────┬───────────────────────────┘
|
| 127 |
+
│ 5. JSON Response
|
| 128 |
+
▼
|
| 129 |
+
┌──────────────────────────────────┐
|
| 130 |
+
│ Frontend Visualization │
|
| 131 |
+
│ - Render Graph │
|
| 132 |
+
│ - Display Charts │
|
| 133 |
+
│ - Show Statistics │
|
| 134 |
+
└──────────────────────────────────┘
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
## BOINC Task Processing Flow
|
| 138 |
+
|
| 139 |
+
```
|
| 140 |
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
| 141 |
+
│ Submit │ │ Queue │ │ Execute │
|
| 142 |
+
│ Task │─────▶│ Task │─────▶│ Analysis │
|
| 143 |
+
│ │ │ │ │ │
|
| 144 |
+
└──────────────┘ └──────────────┘ └──────┬───────┘
|
| 145 |
+
│
|
| 146 |
+
▼
|
| 147 |
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
| 148 |
+
│ Store │ │ Import to │ │ Generate │
|
| 149 |
+
│ Results │◀─────│ Neo4j │◀─────│ Results │
|
| 150 |
+
│ │ │ │ │ │
|
| 151 |
+
└──────────────┘ └──────────────┘ └──────────────┘
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
## Neo4j Graph Schema
|
| 155 |
+
|
| 156 |
+
```
|
| 157 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 158 |
+
│ Neo4j Graph Model │
|
| 159 |
+
│ │
|
| 160 |
+
│ ┌──────────┐ ┌──────────┐ │
|
| 161 |
+
│ │ Gene │ │ Mutation │ │
|
| 162 |
+
│ ├──────────┤ ├──────────┤ │
|
| 163 |
+
│ │ gene_id │◀───────AFFECTS─────│mut_id │ │
|
| 164 |
+
│ │ symbol │ │ chr │ │
|
| 165 |
+
│ │ name │ │ position │ │
|
| 166 |
+
│ │ chr │ │ ref │ │
|
| 167 |
+
│ └──────────┘ │ alt │ │
|
| 168 |
+
│ └────▲─────┘ │
|
| 169 |
+
│ │ │
|
| 170 |
+
│ │ HAS_MUTATION │
|
| 171 |
+
│ │ │
|
| 172 |
+
│ ┌──────────┐ ┌────┴─────┐ │
|
| 173 |
+
│ │ Cancer │ │ Patient │ │
|
| 174 |
+
│ │ Type │ ├──────────┤ │
|
| 175 |
+
│ ├──────────┤ │patient_id│ │
|
| 176 |
+
│ │cancer_id │ │ age │ │
|
| 177 |
+
│ │ name │◀──DIAGNOSED_WITH───│ gender │ │
|
| 178 |
+
│ │ tissue │ │ race │ │
|
| 179 |
+
│ └──────────┘ │ status │ │
|
| 180 |
+
│ └──────────┘ │
|
| 181 |
+
│ │
|
| 182 |
+
│ Relationships: │
|
| 183 |
+
│ • Gene ← AFFECTS ← Mutation │
|
| 184 |
+
│ • Patient → HAS_MUTATION → Mutation │
|
| 185 |
+
│ • Patient → DIAGNOSED_WITH → CancerType │
|
| 186 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## Technology Stack
|
| 190 |
+
|
| 191 |
+
```
|
| 192 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 193 |
+
│ Technology Layers │
|
| 194 |
+
│ │
|
| 195 |
+
│ Frontend: │
|
| 196 |
+
│ • HTML5, CSS3, JavaScript (ES6+) │
|
| 197 |
+
│ • D3.js (Graph Visualization) │
|
| 198 |
+
│ • Chart.js (Charts & Analytics) │
|
| 199 |
+
│ • Responsive Design │
|
| 200 |
+
│ │
|
| 201 |
+
│ Backend: │
|
| 202 |
+
│ • Python 3.8+ │
|
| 203 |
+
│ • FastAPI (Web Framework) │
|
| 204 |
+
│ • Uvicorn (ASGI Server) │
|
| 205 |
+
│ • Strawberry (GraphQL) │
|
| 206 |
+
│ │
|
| 207 |
+
│ Database: │
|
| 208 |
+
│ • Neo4j 5.13 (Graph Database) │
|
| 209 |
+
│ • Bolt Protocol │
|
| 210 |
+
│ • APOC & GDS Plugins │
|
| 211 |
+
│ │
|
| 212 |
+
│ Data Processing: │
|
| 213 |
+
│ • Biopython (Sequence Analysis) │
|
| 214 |
+
│ • NumPy & Pandas (Data Manipulation) │
|
| 215 |
+
│ • BLAST+ (Sequence Alignment) │
|
| 216 |
+
│ │
|
| 217 |
+
│ Infrastructure: │
|
| 218 |
+
│ • Docker & Docker Compose │
|
| 219 |
+
│ • YAML Configuration │
|
| 220 |
+
│ • Python Virtual Environments │
|
| 221 |
+
│ │
|
| 222 |
+
│ External APIs: │
|
| 223 |
+
│ • GDC Portal API (Cancer Data) │
|
| 224 |
+
│ • BOINC RPC (Distributed Computing) │
|
| 225 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
## Deployment Architecture
|
| 229 |
+
|
| 230 |
+
```
|
| 231 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 232 |
+
│ Local Development │
|
| 233 |
+
│ │
|
| 234 |
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
| 235 |
+
│ │ Host Machine │ │
|
| 236 |
+
│ │ │ │
|
| 237 |
+
│ │ ┌─────────────────┐ ┌──────────────────────┐ │ │
|
| 238 |
+
│ │ │ Python venv │ │ Docker Desktop │ │ │
|
| 239 |
+
│ │ │ Port 5000 │ │ │ │ │
|
| 240 |
+
│ │ │ │ │ ┌────────────────┐ │ │ │
|
| 241 |
+
│ │ │ • FastAPI │ │ │ Neo4j │ │ │ │
|
| 242 |
+
│ │ │ • Backend API │◀───────▶│ │ Port 7474 │ │ │ │
|
| 243 |
+
│ │ │ • GraphQL │ │ │ Port 7687 │ │ │ │
|
| 244 |
+
│ │ │ • WebSocket │ │ └────────────────┘ │ │ │
|
| 245 |
+
│ │ │ │ │ │ │ │
|
| 246 |
+
│ │ └─────────────────┘ └──────────────────────┘ │ │
|
| 247 |
+
│ │ │ │
|
| 248 |
+
│ └──────────────────────────────────────────────────────────┘ │
|
| 249 |
+
│ │
|
| 250 |
+
│ Access URLs: │
|
| 251 |
+
│ • http://localhost:5000 - Main Application │
|
| 252 |
+
│ • http://localhost:5000/docs - API Documentation │
|
| 253 |
+
│ • http://localhost:5000/graphql - GraphQL Playground │
|
| 254 |
+
│ • http://localhost:7474 - Neo4j Browser │
|
| 255 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 256 |
+
```
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to Cancer@Home v2 will be documented in this file.
|
| 4 |
+
|
| 5 |
+
## [2.0.0] - 2025-11-19
|
| 6 |
+
|
| 7 |
+
### 🎉 Initial Release
|
| 8 |
+
|
| 9 |
+
#### Added
|
| 10 |
+
- **Core Infrastructure**
|
| 11 |
+
- FastAPI backend with REST and GraphQL APIs
|
| 12 |
+
- Neo4j graph database integration
|
| 13 |
+
- Docker Compose setup for easy deployment
|
| 14 |
+
- Python virtual environment configuration
|
| 15 |
+
- Comprehensive YAML-based configuration system
|
| 16 |
+
|
| 17 |
+
- **BOINC Integration**
|
| 18 |
+
- Distributed computing task submission
|
| 19 |
+
- Task status monitoring and tracking
|
| 20 |
+
- Support for variant calling, BLAST, and alignment tasks
|
| 21 |
+
- Task statistics and performance metrics
|
| 22 |
+
- JSON-based task persistence
|
| 23 |
+
|
| 24 |
+
- **GDC Data Portal Integration**
|
| 25 |
+
- API client for GDC cancer data
|
| 26 |
+
- File search and download capabilities
|
| 27 |
+
- Support for TCGA and TARGET projects
|
| 28 |
+
- MAF and VCF file parsers
|
| 29 |
+
- Clinical data extraction
|
| 30 |
+
|
| 31 |
+
- **Bioinformatics Pipeline**
|
| 32 |
+
- FASTQ quality control and filtering
|
| 33 |
+
- Adapter trimming
|
| 34 |
+
- BLAST sequence alignment (BLASTN/BLASTP)
|
| 35 |
+
- Variant calling from sequencing data
|
| 36 |
+
- Cancer variant identification
|
| 37 |
+
- Tumor mutation burden calculation
|
| 38 |
+
|
| 39 |
+
- **Neo4j Graph Database**
|
| 40 |
+
- Comprehensive graph schema (Genes, Mutations, Patients, Cancer Types)
|
| 41 |
+
- Repository pattern for data access
|
| 42 |
+
- GraphQL schema with flexible querying
|
| 43 |
+
- Sample dataset with 7 genes, 5 mutations, 5 patients, 4 cancer types
|
| 44 |
+
- Optimized with constraints and indexes
|
| 45 |
+
|
| 46 |
+
- **Web Dashboard**
|
| 47 |
+
- Modern, responsive HTML5/CSS3/JavaScript interface
|
| 48 |
+
- 5 main sections: Dashboard, Neo4j Visualization, BOINC Tasks, GDC Data, Pipeline
|
| 49 |
+
- Interactive D3.js graph visualization
|
| 50 |
+
- Chart.js analytics and statistics
|
| 51 |
+
- Real-time data updates
|
| 52 |
+
- Clean gradient-based design
|
| 53 |
+
|
| 54 |
+
- **API Endpoints**
|
| 55 |
+
- `/api/health` - System health check
|
| 56 |
+
- `/api/neo4j/summary` - Database statistics
|
| 57 |
+
- `/api/neo4j/genes/{symbol}` - Gene information
|
| 58 |
+
- `/api/boinc/*` - BOINC task management
|
| 59 |
+
- `/api/gdc/*` - GDC data access
|
| 60 |
+
- `/api/pipeline/*` - Bioinformatics tools
|
| 61 |
+
- `/graphql` - GraphQL playground
|
| 62 |
+
- `/docs` - Swagger API documentation
|
| 63 |
+
|
| 64 |
+
- **Documentation**
|
| 65 |
+
- Comprehensive README with installation guide
|
| 66 |
+
- Quick start guide (QUICKSTART.md)
|
| 67 |
+
- Detailed user guide (USER_GUIDE.md)
|
| 68 |
+
- GraphQL query examples (GRAPHQL_EXAMPLES.md)
|
| 69 |
+
- Architecture documentation (ARCHITECTURE.md)
|
| 70 |
+
- Project summary (PROJECT_SUMMARY.md)
|
| 71 |
+
- MIT License
|
| 72 |
+
|
| 73 |
+
- **Setup & Deployment**
|
| 74 |
+
- Automated Windows setup script (setup.ps1)
|
| 75 |
+
- Automated Linux/Mac setup script (setup.sh)
|
| 76 |
+
- One-command application launcher (run.py)
|
| 77 |
+
- Rich terminal output with progress tracking
|
| 78 |
+
- Automatic directory structure creation
|
| 79 |
+
- Database schema initialization
|
| 80 |
+
|
| 81 |
+
- **Testing**
|
| 82 |
+
- Comprehensive test suite (test_cancer_at_home.py)
|
| 83 |
+
- Module import tests
|
| 84 |
+
- Integration tests
|
| 85 |
+
- Directory structure validation
|
| 86 |
+
|
| 87 |
+
#### Feature Highlights
|
| 88 |
+
|
| 89 |
+
✓ **Easy Installation**: 5-minute setup with automated scripts
|
| 90 |
+
✓ **Interactive Dashboard**: Modern web UI with real-time updates
|
| 91 |
+
✓ **Graph Visualization**: Neo4j-powered relationship mapping
|
| 92 |
+
✓ **Flexible Querying**: Both REST and GraphQL APIs
|
| 93 |
+
✓ **Distributed Computing**: BOINC integration for heavy workloads
|
| 94 |
+
✓ **Real Data**: GDC Portal integration for cancer genomics
|
| 95 |
+
✓ **Bioinformatics**: Complete FASTQ → BLAST → VCF pipeline
|
| 96 |
+
✓ **Well Documented**: 7 documentation files covering all aspects
|
| 97 |
+
✓ **Production Ready**: Error handling, logging, configuration
|
| 98 |
+
|
| 99 |
+
#### Technical Specifications
|
| 100 |
+
|
| 101 |
+
- **Python**: 3.8+
|
| 102 |
+
- **Neo4j**: 5.13 Community Edition
|
| 103 |
+
- **FastAPI**: 0.104.1
|
| 104 |
+
- **Docker**: Latest
|
| 105 |
+
- **Supported OS**: Windows, Linux, macOS
|
| 106 |
+
|
| 107 |
+
#### Sample Data Included
|
| 108 |
+
|
| 109 |
+
**Genes**: TP53, BRAF, BRCA1, BRCA2, PIK3CA, KRAS, EGFR
|
| 110 |
+
**Cancer Types**: Breast Cancer, Lung Adenocarcinoma, Colon Adenocarcinoma, Glioblastoma
|
| 111 |
+
**Projects**: TCGA-BRCA, TCGA-LUAD, TCGA-COAD, TCGA-GBM, TARGET-AML
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
## Version Numbering
|
| 116 |
+
|
| 117 |
+
This project follows [Semantic Versioning](https://semver.org/):
|
| 118 |
+
- **MAJOR**: Incompatible API changes
|
| 119 |
+
- **MINOR**: New functionality, backwards compatible
|
| 120 |
+
- **PATCH**: Bug fixes, backwards compatible
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## Future Roadmap
|
| 125 |
+
|
| 126 |
+
### Planned Features (v2.1.0)
|
| 127 |
+
- [ ] Machine learning for mutation prediction
|
| 128 |
+
- [ ] Multi-omics data integration (RNA-seq, proteomics)
|
| 129 |
+
- [ ] Advanced graph algorithms (PageRank, community detection)
|
| 130 |
+
- [ ] Export and report generation (PDF, Excel)
|
| 131 |
+
- [ ] User authentication and authorization
|
| 132 |
+
- [ ] Data caching for improved performance
|
| 133 |
+
|
| 134 |
+
### Planned Features (v2.2.0)
|
| 135 |
+
- [ ] Survival analysis and clinical outcomes
|
| 136 |
+
- [ ] Drug response prediction
|
| 137 |
+
- [ ] Mobile-responsive design improvements
|
| 138 |
+
- [ ] Real-time collaboration features
|
| 139 |
+
- [ ] Batch data import wizard
|
| 140 |
+
- [ ] Advanced search and filtering
|
| 141 |
+
|
| 142 |
+
### Long-term Goals
|
| 143 |
+
- [ ] Cloud deployment support (AWS, Azure, GCP)
|
| 144 |
+
- [ ] Kubernetes orchestration
|
| 145 |
+
- [ ] Microservices architecture
|
| 146 |
+
- [ ] Real-time BOINC cluster management
|
| 147 |
+
- [ ] Integration with additional data sources
|
| 148 |
+
- [ ] AI-powered data analysis
|
| 149 |
+
|
| 150 |
+
---
|
| 151 |
+
|
| 152 |
+
## Contributing
|
| 153 |
+
|
| 154 |
+
Contributions are welcome! Please see CONTRIBUTING.md (to be created) for guidelines.
|
| 155 |
+
|
| 156 |
+
---
|
| 157 |
+
|
| 158 |
+
## Support
|
| 159 |
+
|
| 160 |
+
For issues, questions, or suggestions:
|
| 161 |
+
- Check the documentation first
|
| 162 |
+
- Review logs in `logs/cancer_at_home.log`
|
| 163 |
+
- Open a GitHub issue (if applicable)
|
| 164 |
+
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
## Acknowledgments
|
| 168 |
+
|
| 169 |
+
Built with inspiration from:
|
| 170 |
+
- Cancer@Home v1 (HeroX DCx Challenge)
|
| 171 |
+
- Andrew Kamal's Neo4j Cancer Visualization Dashboard
|
| 172 |
+
- The Cancer Genome Atlas (TCGA) Project
|
| 173 |
+
- BOINC Project at UC Berkeley
|
| 174 |
+
|
| 175 |
+
Data provided by:
|
| 176 |
+
- Genomic Data Commons (GDC) Portal
|
| 177 |
+
- National Cancer Institute (NCI)
|
| 178 |
+
- The Cancer Genome Atlas Program
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
**Cancer@Home v2** - Making cancer genomics research accessible, distributed, and visual.
|
GRAPHQL_EXAMPLES.md
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example GraphQL Queries for Cancer@Home v2
|
| 2 |
+
|
| 3 |
+
## Basic Queries
|
| 4 |
+
|
| 5 |
+
### Get all genes
|
| 6 |
+
```graphql
|
| 7 |
+
query {
|
| 8 |
+
genes(limit: 10) {
|
| 9 |
+
gene_id
|
| 10 |
+
symbol
|
| 11 |
+
name
|
| 12 |
+
chromosome
|
| 13 |
+
gene_type
|
| 14 |
+
}
|
| 15 |
+
}
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
### Get specific gene by symbol
|
| 19 |
+
```graphql
|
| 20 |
+
query {
|
| 21 |
+
gene(symbol: "TP53") {
|
| 22 |
+
gene_id
|
| 23 |
+
symbol
|
| 24 |
+
name
|
| 25 |
+
chromosome
|
| 26 |
+
start_position
|
| 27 |
+
end_position
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Get mutations for a specific gene
|
| 33 |
+
```graphql
|
| 34 |
+
query {
|
| 35 |
+
mutations(gene: "TP53", limit: 20) {
|
| 36 |
+
mutation_id
|
| 37 |
+
chromosome
|
| 38 |
+
position
|
| 39 |
+
reference
|
| 40 |
+
alternate
|
| 41 |
+
consequence
|
| 42 |
+
variant_type
|
| 43 |
+
quality
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Get mutations on a chromosome
|
| 49 |
+
```graphql
|
| 50 |
+
query {
|
| 51 |
+
mutations(chromosome: "chr17", limit: 50) {
|
| 52 |
+
mutation_id
|
| 53 |
+
position
|
| 54 |
+
reference
|
| 55 |
+
alternate
|
| 56 |
+
consequence
|
| 57 |
+
}
|
| 58 |
+
}
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Patient Queries
|
| 62 |
+
|
| 63 |
+
### Get all patients
|
| 64 |
+
```graphql
|
| 65 |
+
query {
|
| 66 |
+
patients(limit: 100) {
|
| 67 |
+
patient_id
|
| 68 |
+
project_id
|
| 69 |
+
age
|
| 70 |
+
gender
|
| 71 |
+
race
|
| 72 |
+
vital_status
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
### Get patients by project
|
| 78 |
+
```graphql
|
| 79 |
+
query {
|
| 80 |
+
patients(project_id: "TCGA-BRCA") {
|
| 81 |
+
patient_id
|
| 82 |
+
age
|
| 83 |
+
gender
|
| 84 |
+
vital_status
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
### Get patients by cancer type
|
| 90 |
+
```graphql
|
| 91 |
+
query {
|
| 92 |
+
patients(cancer_type: "BRCA", limit: 50) {
|
| 93 |
+
patient_id
|
| 94 |
+
age
|
| 95 |
+
gender
|
| 96 |
+
race
|
| 97 |
+
}
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Cancer Type Queries
|
| 102 |
+
|
| 103 |
+
### Get all cancer types
|
| 104 |
+
```graphql
|
| 105 |
+
query {
|
| 106 |
+
cancerTypes {
|
| 107 |
+
cancer_type_id
|
| 108 |
+
name
|
| 109 |
+
tissue
|
| 110 |
+
disease_type
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
### Get statistics for a cancer type
|
| 116 |
+
```graphql
|
| 117 |
+
query {
|
| 118 |
+
cancerStatistics(cancer_type_id: "BRCA") {
|
| 119 |
+
cancer_type
|
| 120 |
+
total_patients
|
| 121 |
+
total_mutations
|
| 122 |
+
avg_mutations_per_patient
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
## Mutation Analysis
|
| 128 |
+
|
| 129 |
+
### Get mutation frequency
|
| 130 |
+
```graphql
|
| 131 |
+
query {
|
| 132 |
+
mutationFrequency(mutation_id: "MUT-TP53-001") {
|
| 133 |
+
mutation_id
|
| 134 |
+
patients_with_mutation
|
| 135 |
+
total_patients
|
| 136 |
+
frequency
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
## Complex Queries
|
| 142 |
+
|
| 143 |
+
### Combined gene and mutation data
|
| 144 |
+
```graphql
|
| 145 |
+
query {
|
| 146 |
+
gene(symbol: "BRCA1") {
|
| 147 |
+
symbol
|
| 148 |
+
name
|
| 149 |
+
chromosome
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
mutations(gene: "BRCA1") {
|
| 153 |
+
mutation_id
|
| 154 |
+
position
|
| 155 |
+
consequence
|
| 156 |
+
quality
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
### Multiple cancer statistics
|
| 162 |
+
```graphql
|
| 163 |
+
query {
|
| 164 |
+
breastCancer: cancerStatistics(cancer_type_id: "BRCA") {
|
| 165 |
+
cancer_type
|
| 166 |
+
total_patients
|
| 167 |
+
total_mutations
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
lungCancer: cancerStatistics(cancer_type_id: "LUAD") {
|
| 171 |
+
cancer_type
|
| 172 |
+
total_patients
|
| 173 |
+
total_mutations
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
## Using Variables
|
| 179 |
+
|
| 180 |
+
### Query with variables
|
| 181 |
+
```graphql
|
| 182 |
+
query GetGeneInfo($geneSymbol: String!) {
|
| 183 |
+
gene(symbol: $geneSymbol) {
|
| 184 |
+
symbol
|
| 185 |
+
name
|
| 186 |
+
chromosome
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
mutations(gene: $geneSymbol) {
|
| 190 |
+
mutation_id
|
| 191 |
+
position
|
| 192 |
+
consequence
|
| 193 |
+
}
|
| 194 |
+
}
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
Variables:
|
| 198 |
+
```json
|
| 199 |
+
{
|
| 200 |
+
"geneSymbol": "TP53"
|
| 201 |
+
}
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
### Pagination example
|
| 205 |
+
```graphql
|
| 206 |
+
query GetMutations($limit: Int = 10) {
|
| 207 |
+
mutations(limit: $limit) {
|
| 208 |
+
mutation_id
|
| 209 |
+
chromosome
|
| 210 |
+
position
|
| 211 |
+
}
|
| 212 |
+
}
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
Variables:
|
| 216 |
+
```json
|
| 217 |
+
{
|
| 218 |
+
"limit": 25
|
| 219 |
+
}
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
## Filtering Examples
|
| 223 |
+
|
| 224 |
+
### Get mutations with quality scores (filter on `quality` client-side)
|
| 225 |
+
```graphql
|
| 226 |
+
query {
|
| 227 |
+
mutations(gene: "KRAS", limit: 100) {
|
| 228 |
+
mutation_id
|
| 229 |
+
quality
|
| 230 |
+
consequence
|
| 231 |
+
}
|
| 232 |
+
}
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### Get patient demographics for a project
|
| 236 |
+
```graphql
|
| 237 |
+
query {
|
| 238 |
+
patients(project_id: "TCGA-BRCA") {
|
| 239 |
+
patient_id
|
| 240 |
+
age
|
| 241 |
+
gender
|
| 242 |
+
race
|
| 243 |
+
vital_status
|
| 244 |
+
}
|
| 245 |
+
}
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
## Tips for Using GraphQL
|
| 249 |
+
|
| 250 |
+
1. **Use the GraphQL Playground**: Navigate to http://localhost:5000/graphql for an interactive interface with autocomplete and documentation
|
| 251 |
+
|
| 252 |
+
2. **Request only needed fields**: GraphQL allows you to request exactly the data you need, improving performance
|
| 253 |
+
|
| 254 |
+
3. **Combine multiple queries**: Use aliases to fetch different datasets in a single request
|
| 255 |
+
|
| 256 |
+
4. **Use variables**: Make queries reusable by parameterizing them with variables
|
| 257 |
+
|
| 258 |
+
5. **Explore the schema**: Use the GraphQL Playground's "Docs" panel to see all available queries and fields
|
LICENSE
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Cancer@Home Contributors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
This project is inspired by:
|
| 26 |
+
- Cancer@Home v1 (HeroX DCx Challenge)
|
| 27 |
+
- Andrew Kamal's Neo4j Cancer Visualization Dashboard
|
| 28 |
+
|
| 29 |
+
Data sources:
|
| 30 |
+
- Genomic Data Commons (GDC) Portal: https://portal.gdc.cancer.gov/
|
| 31 |
+
- The Cancer Genome Atlas (TCGA)
|
| 32 |
+
- Therapeutically Applicable Research to Generate Effective Treatments (TARGET)
|
| 33 |
+
|
| 34 |
+
For data usage and citation requirements, please refer to:
|
| 35 |
+
- GDC Data Policies: https://gdc.cancer.gov/about-gdc/gdc-policies
|
| 36 |
+
- TCGA Publication Guidelines
|
MODEL_CARD.md
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- cancer-genomics
|
| 5 |
+
- bioinformatics
|
| 6 |
+
- graph-database
|
| 7 |
+
- neo4j
|
| 8 |
+
- distributed-computing
|
| 9 |
+
- boinc
|
| 10 |
+
- healthcare
|
| 11 |
+
- genomics
|
| 12 |
+
- fastq
|
| 13 |
+
- blast
|
| 14 |
+
- variant-calling
|
| 15 |
+
- gdc-portal
|
| 16 |
+
- tcga
|
| 17 |
+
library_name: fastapi
|
| 18 |
+
pipeline_tag: other
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# Cancer@Home v2
|
| 22 |
+
|
| 23 |
+
<div align="center">
|
| 24 |
+
<img src="https://img.shields.io/badge/version-2.0.0-blue.svg" alt="Version">
|
| 25 |
+
<img src="https://img.shields.io/badge/license-MIT-green.svg" alt="License">
|
| 26 |
+
<img src="https://img.shields.io/badge/python-3.8+-blue.svg" alt="Python">
|
| 27 |
+
<img src="https://img.shields.io/badge/neo4j-5.13-brightgreen.svg" alt="Neo4j">
|
| 28 |
+
</div>
|
| 29 |
+
|
| 30 |
+
## 🧬 Overview
|
| 31 |
+
|
| 32 |
+
Cancer@Home v2 is a comprehensive distributed computing platform for cancer genomics research that combines **BOINC distributed computing**, **GDC cancer data analysis**, **sequence processing (FASTQ/BLAST)**, and **Neo4j graph visualization** into a unified, easy-to-use system.
|
| 33 |
+
|
| 34 |
+
Inspired by [Cancer@Home v1](https://www.herox.com/DCx/round/516/entry/23285) and [Andrew Kamal's Neo4j Dashboard](https://medium.com/neo4j/visualize-cancer-1c80a95f5bb4), this platform makes cancer genomics research accessible, distributed, and visual.
|
| 35 |
+
|
| 36 |
+
## 🎯 Key Features
|
| 37 |
+
|
| 38 |
+
- 🌐 **Interactive Web Dashboard** - Modern UI with real-time visualizations
|
| 39 |
+
- 🔍 **Neo4j Graph Database** - Model complex gene-mutation-patient relationships
|
| 40 |
+
- ⚡ **BOINC Integration** - Distributed computing for intensive analyses
|
| 41 |
+
- 📊 **GraphQL API** - Flexible data querying
|
| 42 |
+
- 🧪 **Bioinformatics Pipeline** - FASTQ processing, BLAST alignment, variant calling
|
| 43 |
+
- 📚 **GDC Portal Integration** - Access TCGA/TARGET cancer datasets
|
| 44 |
+
- 🚀 **Quick Setup** - Running in under 5 minutes
|
| 45 |
+
|
| 46 |
+
## 🏗️ Architecture
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
┌─────────────────────────────────────────────┐
|
| 50 |
+
│ Web Dashboard (D3.js + Chart.js) │
|
| 51 |
+
├─────────────────────────────────────────────┤
|
| 52 |
+
│ FastAPI Backend (REST + GraphQL) │
|
| 53 |
+
├──────┬──────┬──────┬──────┬────────────────┤
|
| 54 |
+
│Neo4j │BOINC │ GDC │FASTQ │ BLAST/Variant │
|
| 55 |
+
│Graph │Client│ API │ QC │ Calling │
|
| 56 |
+
└──────┴──────┴──────┴──────┴────────────────┘
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## 📦 Installation
|
| 60 |
+
|
| 61 |
+
### Prerequisites
|
| 62 |
+
- Python 3.8+
|
| 63 |
+
- Docker Desktop
|
| 64 |
+
- 8GB RAM minimum (16GB recommended)
|
| 65 |
+
|
| 66 |
+
### Quick Start
|
| 67 |
+
|
| 68 |
+
**Windows:**
|
| 69 |
+
```powershell
|
| 70 |
+
git clone https://huggingface.co/OpenPeerAI/CancerAtHomeV2
|
| 71 |
+
cd CancerAtHomeV2
|
| 72 |
+
.\setup.ps1
|
| 73 |
+
python run.py
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
**Linux/Mac:**
|
| 77 |
+
```bash
|
| 78 |
+
git clone https://huggingface.co/OpenPeerAI/CancerAtHomeV2
|
| 79 |
+
cd CancerAtHomeV2
|
| 80 |
+
chmod +x setup.sh
|
| 81 |
+
./setup.sh
|
| 82 |
+
python run.py
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
Then open: **http://localhost:5000**
|
| 86 |
+
|
| 87 |
+
## 🚀 Usage
|
| 88 |
+
|
| 89 |
+
### Web Dashboard
|
| 90 |
+
Access the interactive dashboard at http://localhost:5000 with:
|
| 91 |
+
- **Dashboard Tab**: Overview statistics and mutation charts
|
| 92 |
+
- **Neo4j Visualization**: Interactive graph of cancer relationships
|
| 93 |
+
- **BOINC Tasks**: Submit and monitor distributed computing tasks
|
| 94 |
+
- **GDC Data**: Browse and download cancer datasets
|
| 95 |
+
- **Pipeline Tools**: Run FASTQ QC, BLAST, and variant calling
|
| 96 |
+
|
| 97 |
+
### GraphQL API
|
| 98 |
+
|
| 99 |
+
Query cancer data at http://localhost:5000/graphql
|
| 100 |
+
|
| 101 |
+
**Example: Get mutations in TP53 gene**
|
| 102 |
+
```graphql
|
| 103 |
+
query {
|
| 104 |
+
mutations(gene: "TP53") {
|
| 105 |
+
mutation_id
|
| 106 |
+
chromosome
|
| 107 |
+
position
|
| 108 |
+
consequence
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
**Example: Get patient statistics**
|
| 114 |
+
```graphql
|
| 115 |
+
query {
|
| 116 |
+
cancerStatistics(cancer_type_id: "BRCA") {
|
| 117 |
+
total_patients
|
| 118 |
+
total_mutations
|
| 119 |
+
avg_mutations_per_patient
|
| 120 |
+
}
|
| 121 |
+
}
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
### REST API
|
| 125 |
+
|
| 126 |
+
**Database Summary:**
|
| 127 |
+
```bash
|
| 128 |
+
curl http://localhost:5000/api/neo4j/summary
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
**Submit BOINC Task:**
|
| 132 |
+
```bash
|
| 133 |
+
curl -X POST http://localhost:5000/api/boinc/submit \
|
| 134 |
+
-H "Content-Type: application/json" \
|
| 135 |
+
-d '{"workunit_type": "variant_calling", "input_file": "sample.fastq"}'
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
### Python API
|
| 139 |
+
|
| 140 |
+
**FASTQ Processing:**
|
| 141 |
+
```python
|
| 142 |
+
from backend.pipeline import FASTQProcessor
|
| 143 |
+
|
| 144 |
+
processor = FASTQProcessor()
|
| 145 |
+
stats = processor.calculate_statistics("input.fastq")
|
| 146 |
+
filtered = processor.quality_filter("input.fastq")
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
**Variant Calling:**
|
| 150 |
+
```python
|
| 151 |
+
from backend.pipeline import VariantCaller, VariantAnalyzer
|
| 152 |
+
|
| 153 |
+
caller = VariantCaller()
|
| 154 |
+
vcf_file = caller.call_variants("alignment.bam", "reference.fa")
|
| 155 |
+
variants = caller.filter_variants(vcf_file)
|
| 156 |
+
|
| 157 |
+
analyzer = VariantAnalyzer()
|
| 158 |
+
cancer_variants = analyzer.identify_cancer_variants(variants)
|
| 159 |
+
tmb = analyzer.calculate_mutation_burden(variants)
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
**Neo4j Queries:**
|
| 163 |
+
```python
|
| 164 |
+
from backend.neo4j import DatabaseManager
|
| 165 |
+
|
| 166 |
+
db = DatabaseManager()
|
| 167 |
+
query = """
|
| 168 |
+
MATCH (g:Gene {symbol: 'TP53'})<-[:AFFECTS]-(m:Mutation)
|
| 169 |
+
RETURN m.position, m.consequence
|
| 170 |
+
"""
|
| 171 |
+
results = db.execute_query(query)
|
| 172 |
+
db.close()
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
## 📊 Data Model
|
| 176 |
+
|
| 177 |
+
### Neo4j Graph Schema
|
| 178 |
+
|
| 179 |
+
**Nodes:**
|
| 180 |
+
- **Gene**: Genes with mutations (TP53, BRCA1, KRAS, etc.)
|
| 181 |
+
- **Mutation**: Genetic variants with position and consequence
|
| 182 |
+
- **Patient**: Individual cases with demographics
|
| 183 |
+
- **CancerType**: Cancer classifications (BRCA, LUAD, COAD, GBM)
|
| 184 |
+
|
| 185 |
+
**Relationships:**
|
| 186 |
+
- `Gene ← AFFECTS ← Mutation`
|
| 187 |
+
- `Patient → HAS_MUTATION → Mutation`
|
| 188 |
+
- `Patient → DIAGNOSED_WITH → CancerType`
|
| 189 |
+
|
| 190 |
+
### Sample Data Included
|
| 191 |
+
|
| 192 |
+
- **7 Genes**: TP53, BRAF, BRCA1, BRCA2, PIK3CA, KRAS, EGFR
|
| 193 |
+
- **5 Mutations**: Cancer-associated variants
|
| 194 |
+
- **5 Patients**: Representative TCGA cases
|
| 195 |
+
- **4 Cancer Types**: BRCA, LUAD, COAD, GBM
|
| 196 |
+
|
| 197 |
+
## 🔧 Technology Stack
|
| 198 |
+
|
| 199 |
+
- **Backend**: FastAPI, Python 3.8+
|
| 200 |
+
- **Database**: Neo4j 5.13 (Graph Database)
|
| 201 |
+
- **API**: GraphQL (Strawberry), REST
|
| 202 |
+
- **Frontend**: HTML5, CSS3, JavaScript, D3.js, Chart.js
|
| 203 |
+
- **Bioinformatics**: Biopython, BLAST+
|
| 204 |
+
- **Data Source**: GDC Portal API (TCGA/TARGET)
|
| 205 |
+
- **Infrastructure**: Docker, Docker Compose
|
| 206 |
+
- **Distributed Computing**: BOINC Framework
|
| 207 |
+
|
| 208 |
+
## 📚 Documentation
|
| 209 |
+
|
| 210 |
+
- [README.md](README.md) - Complete project overview
|
| 211 |
+
- [QUICKSTART.md](QUICKSTART.md) - 5-minute setup guide
|
| 212 |
+
- [USER_GUIDE.md](USER_GUIDE.md) - Detailed usage documentation
|
| 213 |
+
- [GRAPHQL_EXAMPLES.md](GRAPHQL_EXAMPLES.md) - Query examples
|
| 214 |
+
- [ARCHITECTURE.md](ARCHITECTURE.md) - System architecture
|
| 215 |
+
- [PROJECT_SUMMARY.md](PROJECT_SUMMARY.md) - Feature overview
|
| 216 |
+
|
| 217 |
+
## 🎓 Use Cases
|
| 218 |
+
|
| 219 |
+
1. **Cancer Research**: Analyze genomics data with distributed computing
|
| 220 |
+
2. **Education**: Learn cancer genetics and bioinformatics
|
| 221 |
+
3. **Data Visualization**: Explore gene-mutation-patient relationships
|
| 222 |
+
4. **Pipeline Development**: Test bioinformatics workflows
|
| 223 |
+
5. **Graph Analytics**: Query complex biological networks
|
| 224 |
+
|
| 225 |
+
## 🔬 Supported Cancer Projects
|
| 226 |
+
|
| 227 |
+
- **TCGA-BRCA**: Breast Cancer (1,098 cases)
|
| 228 |
+
- **TCGA-LUAD**: Lung Adenocarcinoma (585 cases)
|
| 229 |
+
- **TCGA-COAD**: Colon Adenocarcinoma (461 cases)
|
| 230 |
+
- **TCGA-GBM**: Glioblastoma (617 cases)
|
| 231 |
+
- **TARGET-AML**: Acute Myeloid Leukemia (238 cases)
|
| 232 |
+
|
| 233 |
+
## 📈 Bioinformatics Pipeline
|
| 234 |
+
|
| 235 |
+
### FASTQ Processing
|
| 236 |
+
- Quality control and filtering
|
| 237 |
+
- Adapter trimming
|
| 238 |
+
- Statistics calculation
|
| 239 |
+
- QC report generation
|
| 240 |
+
|
| 241 |
+
### BLAST Alignment
|
| 242 |
+
- BLASTN for nucleotide sequences
|
| 243 |
+
- BLASTP for protein sequences
|
| 244 |
+
- Hit filtering by identity/e-value
|
| 245 |
+
- Homology detection
|
| 246 |
+
|
| 247 |
+
### Variant Calling
|
| 248 |
+
- VCF generation from alignments
|
| 249 |
+
- Quality filtering
|
| 250 |
+
- Cancer variant identification
|
| 251 |
+
- Tumor mutation burden (TMB) calculation
|
| 252 |
+
|
| 253 |
+
## 🌐 Access Points
|
| 254 |
+
|
| 255 |
+
- **Application**: http://localhost:5000
|
| 256 |
+
- **API Docs**: http://localhost:5000/docs (Swagger UI)
|
| 257 |
+
- **GraphQL**: http://localhost:5000/graphql
|
| 258 |
+
- **Neo4j Browser**: http://localhost:7474 (username: `neo4j`, password: `cancer123`)
|
| 259 |
+
|
| 260 |
+
## 🛠️ Configuration
|
| 261 |
+
|
| 262 |
+
Edit `config.yml` to customize:
|
| 263 |
+
|
| 264 |
+
```yaml
|
| 265 |
+
neo4j:
|
| 266 |
+
uri: "bolt://localhost:7687"
|
| 267 |
+
password: "cancer123"
|
| 268 |
+
|
| 269 |
+
gdc:
|
| 270 |
+
download_dir: "./data/gdc"
|
| 271 |
+
projects: ["TCGA-BRCA", "TCGA-LUAD", "TCGA-COAD"]
|
| 272 |
+
|
| 273 |
+
pipeline:
|
| 274 |
+
fastq:
|
| 275 |
+
quality_threshold: 20
|
| 276 |
+
min_length: 50
|
| 277 |
+
blast:
|
| 278 |
+
evalue: 0.001
|
| 279 |
+
num_threads: 4
|
| 280 |
+
```
|
| 281 |
+
|
| 282 |
+
## 🤝 Contributing
|
| 283 |
+
|
| 284 |
+
Contributions are welcome! This project is open source under the MIT License.
|
| 285 |
+
|
| 286 |
+
### Development Setup
|
| 287 |
+
```bash
|
| 288 |
+
python -m venv venv
|
| 289 |
+
source venv/bin/activate # or venv\Scripts\activate on Windows
|
| 290 |
+
pip install -r requirements.txt
|
| 291 |
+
pytest test_cancer_at_home.py
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
## 📄 License
|
| 295 |
+
|
| 296 |
+
MIT License - See [LICENSE](LICENSE) file
|
| 297 |
+
|
| 298 |
+
Copyright (c) 2025 OpenPeer AI, Riemann Computing Inc., Bleunomics, Andrew Magdy Kamal
|
| 299 |
+
|
| 300 |
+
## 🙏 Acknowledgments
|
| 301 |
+
|
| 302 |
+
### Inspiration
|
| 303 |
+
- [Cancer@Home v1](https://www.herox.com/DCx/round/516/entry/23285) - HeroX DCx Challenge
|
| 304 |
+
- [Andrew Kamal's Neo4j Cancer Visualization](https://medium.com/neo4j/visualize-cancer-1c80a95f5bb4)
|
| 305 |
+
|
| 306 |
+
### Data Sources
|
| 307 |
+
- [Genomic Data Commons (GDC) Portal](https://portal.gdc.cancer.gov/)
|
| 308 |
+
- The Cancer Genome Atlas (TCGA) Program
|
| 309 |
+
- Therapeutically Applicable Research to Generate Effective Treatments (TARGET)
|
| 310 |
+
|
| 311 |
+
### Technologies
|
| 312 |
+
- Neo4j Graph Database
|
| 313 |
+
- BOINC Distributed Computing Project
|
| 314 |
+
- Biopython Community
|
| 315 |
+
- FastAPI Framework
|
| 316 |
+
|
| 317 |
+
## 👥 Authors
|
| 318 |
+
|
| 319 |
+
- **OpenPeer AI** - Core development and architecture
|
| 320 |
+
- **Riemann Computing Inc.** - Distributed computing integration
|
| 321 |
+
- **Bleunomics** - Bioinformatics pipeline and genomics expertise
|
| 322 |
+
- **Andrew Magdy Kamal** - Graph database design and visualization
|
| 323 |
+
|
| 324 |
+
## 📞 Support
|
| 325 |
+
|
| 326 |
+
- **Documentation**: See project documentation files
|
| 327 |
+
- **Issues**: Check logs in `logs/cancer_at_home.log`
|
| 328 |
+
- **Configuration**: Review `config.yml`
|
| 329 |
+
- **Health Check**: http://localhost:5000/api/health
|
| 330 |
+
|
| 331 |
+
## 🔮 Roadmap
|
| 332 |
+
|
| 333 |
+
### Planned Features
|
| 334 |
+
- Machine learning for mutation prediction
|
| 335 |
+
- Multi-omics data integration (RNA-seq, proteomics)
|
| 336 |
+
- Survival analysis and clinical outcomes
|
| 337 |
+
- Advanced graph algorithms (PageRank, community detection)
|
| 338 |
+
- Cloud deployment support (AWS, Azure, GCP)
|
| 339 |
+
- Mobile-responsive design
|
| 340 |
+
- User authentication and authorization
|
| 341 |
+
|
| 342 |
+
## 📊 Statistics
|
| 343 |
+
|
| 344 |
+
- **Lines of Code**: ~5,000+
|
| 345 |
+
- **Modules**: 9 Python modules
|
| 346 |
+
- **API Endpoints**: 15+ REST + GraphQL
|
| 347 |
+
- **Documentation**: 2,500+ lines
|
| 348 |
+
- **Setup Time**: < 5 minutes
|
| 349 |
+
- **Sample Data**: 7 genes, 5 mutations, 5 patients
|
| 350 |
+
|
| 351 |
+
## 🎯 Citation
|
| 352 |
+
|
| 353 |
+
If you use Cancer@Home v2 in your research, please cite:
|
| 354 |
+
|
| 355 |
+
```bibtex
|
| 356 |
+
@software{cancer_at_home_v2,
|
| 357 |
+
title = {Cancer@Home v2: Distributed Cancer Genomics Research Platform},
|
| 358 |
+
author = {OpenPeer AI and Riemann Computing Inc. and Bleunomics and Andrew Magdy Kamal},
|
| 359 |
+
year = {2025},
|
| 360 |
+
url = {https://huggingface.co/OpenPeerAI/CancerAtHomeV2},
|
| 361 |
+
license = {MIT}
|
| 362 |
+
}
|
| 363 |
+
```
|
| 364 |
+
|
| 365 |
+
## 🏷️ Tags
|
| 366 |
+
|
| 367 |
+
`cancer-genomics` `bioinformatics` `neo4j` `graph-database` `distributed-computing` `boinc` `fastq` `blast` `variant-calling` `gdc-portal` `tcga` `target` `graphql` `fastapi` `python` `docker` `healthcare` `precision-medicine` `computational-biology`
|
| 368 |
+
|
| 369 |
+
---
|
| 370 |
+
|
| 371 |
+
**Made with ❤️ by OpenPeer AI, Riemann Computing Inc., Bleunomics, and Andrew Magdy Kamal**
|
| 372 |
+
|
| 373 |
+
**For cancer research, by researchers, accessible to all.**
|
PROJECT_SUMMARY.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cancer@Home v2 - Project Summary
|
| 2 |
+
|
| 3 |
+
## 🎯 Project Overview
|
| 4 |
+
|
| 5 |
+
Cancer@Home v2 is a comprehensive distributed computing platform for cancer genomics research that successfully integrates:
|
| 6 |
+
|
| 7 |
+
1. **Distributed Computing (BOINC)** - Submit and manage computationally intensive cancer research tasks
|
| 8 |
+
2. **Cancer Data Portal (GDC)** - Access and download cancer genomics datasets from TCGA and TARGET
|
| 9 |
+
3. **Graph Database (Neo4j)** - Model complex relationships between genes, mutations, patients, and cancer types
|
| 10 |
+
4. **Bioinformatics Pipeline** - Process FASTQ files, run BLAST searches, and call genetic variants
|
| 11 |
+
5. **Interactive Dashboard** - Web-based GUI with real-time visualizations and data exploration
|
| 12 |
+
|
| 13 |
+
## 📁 Project Structure
|
| 14 |
+
|
| 15 |
+
```
|
| 16 |
+
CancerAtHomeV2/
|
| 17 |
+
├── backend/
|
| 18 |
+
│ ├── api/
|
| 19 |
+
│ │ └── main.py # FastAPI application with REST & GraphQL
|
| 20 |
+
│ ├── boinc/
|
| 21 |
+
│ │ └── client.py # BOINC distributed computing client
|
| 22 |
+
│ ├── gdc/
|
| 23 |
+
│ │ └── client.py # GDC Portal API integration
|
| 24 |
+
│ ├── neo4j/
|
| 25 |
+
│ │ ├── db_manager.py # Neo4j database operations
|
| 26 |
+
│ │ ├── graphql_schema.py # GraphQL schema definitions
|
| 27 |
+
│ │ └── data_importer.py # Sample data initialization
|
| 28 |
+
│ └── pipeline/
|
| 29 |
+
│ ├── fastq_processor.py # FASTQ quality control
|
| 30 |
+
│ ├── blast_runner.py # BLAST sequence alignment
|
| 31 |
+
│ └── variant_caller.py # Genetic variant identification
|
| 32 |
+
├── frontend/
|
| 33 |
+
│ └── index.html # Interactive web dashboard
|
| 34 |
+
├── config.yml # Configuration file
|
| 35 |
+
├── docker-compose.yml # Neo4j container setup
|
| 36 |
+
├── requirements.txt # Python dependencies
|
| 37 |
+
├── run.py # Main application launcher
|
| 38 |
+
├── setup.ps1 # Windows setup script
|
| 39 |
+
├── setup.sh # Linux/Mac setup script
|
| 40 |
+
├── README.md # Comprehensive documentation
|
| 41 |
+
├── QUICKSTART.md # Quick start guide
|
| 42 |
+
├── USER_GUIDE.md # Detailed user guide
|
| 43 |
+
├── GRAPHQL_EXAMPLES.md # GraphQL query examples
|
| 44 |
+
└── LICENSE # MIT License
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## 🚀 Key Features Implemented
|
| 49 |
+
|
| 50 |
+
### 1. Web Dashboard
|
| 51 |
+
- **Modern UI**: Clean, gradient-based design with responsive layout
|
| 52 |
+
- **5 Main Tabs**: Dashboard, Neo4j Visualization, BOINC Tasks, GDC Data, Pipeline
|
| 53 |
+
- **Real-time Statistics**: Live data from Neo4j showing genes, mutations, patients
|
| 54 |
+
- **Interactive Charts**: Chart.js visualizations for mutation distributions
|
| 55 |
+
- **D3.js Graph**: Interactive network visualization of cancer genomics relationships
|
| 56 |
+
|
| 57 |
+
### 2. Neo4j Graph Database
|
| 58 |
+
- **Node Types**: Gene, Mutation, Patient, CancerType
|
| 59 |
+
- **Relationships**:
|
| 60 |
+
- Gene ← AFFECTS ← Mutation
|
| 61 |
+
- Patient → HAS_MUTATION → Mutation
|
| 62 |
+
- Patient → DIAGNOSED_WITH → CancerType
|
| 63 |
+
- **Sample Data**: Pre-loaded with 7 genes, 5 mutations, 5 patients, 4 cancer types
|
| 64 |
+
- **Optimized**: Constraints and indexes for fast queries
|
| 65 |
+
|
| 66 |
+
### 3. GraphQL API
|
| 67 |
+
- **Flexible Queries**: Get genes, mutations, patients, cancer types
|
| 68 |
+
- **Filtering**: Query by gene symbol, chromosome, project ID, cancer type
|
| 69 |
+
- **Aggregations**: Mutation frequency, cancer statistics
|
| 70 |
+
- **Playground**: Interactive GraphQL explorer at /graphql
|
| 71 |
+
|
| 72 |
+
### 4. REST API Endpoints
|
| 73 |
+
- `/api/health` - System health check
|
| 74 |
+
- `/api/neo4j/summary` - Database statistics
|
| 75 |
+
- `/api/neo4j/genes/{symbol}` - Gene information
|
| 76 |
+
- `/api/boinc/tasks` - List BOINC tasks
|
| 77 |
+
- `/api/boinc/submit` - Submit new task
|
| 78 |
+
- `/api/boinc/statistics` - Task statistics
|
| 79 |
+
- `/api/gdc/projects` - Available cancer projects
|
| 80 |
+
- `/api/gdc/files/{project_id}` - Search GDC files
|
| 81 |
+
- `/api/gdc/download` - Download GDC data
|
| 82 |
+
- `/api/pipeline/*` - Bioinformatics pipeline endpoints
|
| 83 |
+
|
| 84 |
+
### 5. BOINC Integration
|
| 85 |
+
- **Task Submission**: Support for variant calling, BLAST, alignment tasks
|
| 86 |
+
- **Status Tracking**: Monitor pending, running, completed, failed tasks
|
| 87 |
+
- **Statistics**: Total tasks, completion rates, average times
|
| 88 |
+
- **Task Manager**: High-level interface for common workflows
|
| 89 |
+
|
| 90 |
+
### 6. GDC Data Integration
|
| 91 |
+
- **Search API**: Query files by project, data type, experimental strategy
|
| 92 |
+
- **Download**: Retrieve cancer genomics datasets
|
| 93 |
+
- **Projects Supported**: TCGA-BRCA, TCGA-LUAD, TCGA-COAD, TCGA-GBM, TARGET-AML
|
| 94 |
+
- **Parsers**: MAF, VCF, and clinical data parsing utilities
|
| 95 |
+
|
| 96 |
+
### 7. Bioinformatics Pipeline
|
| 97 |
+
- **FASTQ Processing**:
|
| 98 |
+
- Quality filtering
|
| 99 |
+
- Adapter trimming
|
| 100 |
+
- Statistics calculation
|
| 101 |
+
- Quality control reports
|
| 102 |
+
|
| 103 |
+
- **BLAST Integration**:
|
| 104 |
+
- BLASTN and BLASTP support
|
| 105 |
+
- XML output parsing
|
| 106 |
+
- Hit filtering by identity/e-value
|
| 107 |
+
|
| 108 |
+
- **Variant Calling**:
|
| 109 |
+
- VCF generation
|
| 110 |
+
- Quality filtering
|
| 111 |
+
- Variant annotation
|
| 112 |
+
- Cancer variant identification
|
| 113 |
+
- Tumor mutation burden calculation
|
| 114 |
+
|
| 115 |
+
## 🛠️ Technology Stack
|
| 116 |
+
|
| 117 |
+
- **Backend**: FastAPI (Python 3.8+)
|
| 118 |
+
- **Database**: Neo4j 5.13 (Graph Database)
|
| 119 |
+
- **API**: GraphQL (Strawberry), REST
|
| 120 |
+
- **Frontend**: HTML5, CSS3, JavaScript
|
| 121 |
+
- **Visualization**: D3.js, Chart.js
|
| 122 |
+
- **Bioinformatics**: Biopython
|
| 123 |
+
- **Data Source**: GDC Portal API
|
| 124 |
+
- **Containerization**: Docker, Docker Compose
|
| 125 |
+
- **Distributed Computing**: BOINC framework
|
| 126 |
+
|
| 127 |
+
## 📊 Sample Data Included
|
| 128 |
+
|
| 129 |
+
### Genes (7)
|
| 130 |
+
- TP53 (Tumor protein p53)
|
| 131 |
+
- BRAF (B-Raf proto-oncogene)
|
| 132 |
+
- BRCA1, BRCA2 (Breast cancer genes)
|
| 133 |
+
- PIK3CA, KRAS, EGFR (Oncogenes)
|
| 134 |
+
|
| 135 |
+
### Mutations (5)
|
| 136 |
+
- Various missense mutations in cancer-associated genes
|
| 137 |
+
- Includes position, reference/alternate alleles, quality scores
|
| 138 |
+
|
| 139 |
+
### Patients (5)
|
| 140 |
+
- Representative cases from TCGA-BRCA, TCGA-LUAD, TCGA-COAD
|
| 141 |
+
- Demographic data, vital status
|
| 142 |
+
|
| 143 |
+
### Cancer Types (4)
|
| 144 |
+
- Breast Cancer (BRCA)
|
| 145 |
+
- Lung Adenocarcinoma (LUAD)
|
| 146 |
+
- Colon Adenocarcinoma (COAD)
|
| 147 |
+
- Glioblastoma (GBM)
|
| 148 |
+
|
| 149 |
+
## 🎨 Design Principles
|
| 150 |
+
|
| 151 |
+
1. **Simplicity**: One-command setup, intuitive interface
|
| 152 |
+
2. **Speed**: Fast to install and get started (< 5 minutes)
|
| 153 |
+
3. **Modularity**: Clean separation of concerns
|
| 154 |
+
4. **Extensibility**: Easy to add new data sources and analyses
|
| 155 |
+
5. **Visual**: Rich visualizations for data exploration
|
| 156 |
+
6. **Professional**: Production-quality code with error handling
|
| 157 |
+
|
| 158 |
+
## 🔧 Configuration Options
|
| 159 |
+
|
| 160 |
+
All configurable via `config.yml`:
|
| 161 |
+
- Neo4j connection settings
|
| 162 |
+
- GDC API parameters
|
| 163 |
+
- BOINC server configuration
|
| 164 |
+
- Pipeline quality thresholds
|
| 165 |
+
- Output directories
|
| 166 |
+
- Logging levels
|
| 167 |
+
|
| 168 |
+
## 📖 Documentation Provided
|
| 169 |
+
|
| 170 |
+
1. **README.md** - Complete project overview and installation
|
| 171 |
+
2. **QUICKSTART.md** - Fast setup and first steps
|
| 172 |
+
3. **USER_GUIDE.md** - Comprehensive usage documentation
|
| 173 |
+
4. **GRAPHQL_EXAMPLES.md** - GraphQL query examples
|
| 174 |
+
5. **Inline Code Comments** - Well-documented Python modules
|
| 175 |
+
6. **API Documentation** - Auto-generated Swagger UI at /docs
|
| 176 |
+
|
| 177 |
+
## 🌟 Unique Features
|
| 178 |
+
|
| 179 |
+
1. **All-in-One Solution**: Complete stack from data acquisition to visualization
|
| 180 |
+
2. **Graph-Based**: Leverages Neo4j's power for complex relationship queries
|
| 181 |
+
3. **Real-Time**: Live dashboard updates and task monitoring
|
| 182 |
+
4. **Research-Ready**: Built for actual cancer genomics research workflows
|
| 183 |
+
5. **Extensible**: Easy to integrate additional data sources and tools
|
| 184 |
+
6. **Educational**: Great for learning cancer genomics and graph databases
|
| 185 |
+
|
| 186 |
+
## 🚦 Getting Started (Quick)
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
# Windows
|
| 190 |
+
.\setup.ps1
|
| 191 |
+
python run.py
|
| 192 |
+
|
| 193 |
+
# Linux/Mac
|
| 194 |
+
./setup.sh
|
| 195 |
+
python run.py
|
| 196 |
+
|
| 197 |
+
# Open browser
|
| 198 |
+
# http://localhost:5000
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## 🎯 Use Cases
|
| 202 |
+
|
| 203 |
+
1. **Research**: Analyze cancer genomics data with distributed computing
|
| 204 |
+
2. **Education**: Learn about cancer genetics and bioinformatics
|
| 205 |
+
3. **Visualization**: Explore gene-mutation-patient relationships
|
| 206 |
+
4. **Data Integration**: Combine multiple cancer data sources
|
| 207 |
+
5. **Pipeline Development**: Test bioinformatics workflows
|
| 208 |
+
|
| 209 |
+
## 🔮 Future Enhancements (Optional)
|
| 210 |
+
|
| 211 |
+
- Machine learning for mutation prediction
|
| 212 |
+
- Multi-omics data integration (RNA-seq, proteomics)
|
| 213 |
+
- Survival analysis and clinical outcomes
|
| 214 |
+
- Drug response prediction
|
| 215 |
+
- Advanced graph algorithms (PageRank, community detection)
|
| 216 |
+
- Real-time collaboration features
|
| 217 |
+
- Mobile-responsive design
|
| 218 |
+
- Export/report generation
|
| 219 |
+
|
| 220 |
+
## 📝 License
|
| 221 |
+
|
| 222 |
+
MIT License - Free for academic and commercial use
|
| 223 |
+
|
| 224 |
+
## 🙏 Acknowledgments
|
| 225 |
+
|
| 226 |
+
Inspired by:
|
| 227 |
+
- Cancer@Home v1 (HeroX DCx Challenge)
|
| 228 |
+
- Andrew Kamal's Neo4j Cancer Visualization
|
| 229 |
+
- GDC Portal and TCGA Project
|
| 230 |
+
- BOINC Distributed Computing Framework
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
**Cancer@Home v2** successfully combines modern web technologies, graph databases, distributed computing, and bioinformatics tools into a cohesive platform that is both powerful and easy to use. The system is production-ready, well-documented, and designed for real-world cancer genomics research.
|
QUICKSTART.md
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick Start Guide
|
| 2 |
+
|
| 3 |
+
## Prerequisites
|
| 4 |
+
- Python 3.8 or higher
|
| 5 |
+
- Docker Desktop
|
| 6 |
+
- 8GB RAM minimum (16GB recommended)
|
| 7 |
+
- Windows, macOS, or Linux
|
| 8 |
+
|
| 9 |
+
## Installation
|
| 10 |
+
|
| 11 |
+
### Windows
|
| 12 |
+
```powershell
|
| 13 |
+
# Run in PowerShell as Administrator
|
| 14 |
+
.\setup.ps1
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
### Linux/Mac
|
| 18 |
+
```bash
|
| 19 |
+
chmod +x setup.sh
|
| 20 |
+
./setup.sh
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Manual Installation
|
| 24 |
+
|
| 25 |
+
1. **Create virtual environment**
|
| 26 |
+
```bash
|
| 27 |
+
python -m venv venv
|
| 28 |
+
|
| 29 |
+
# Windows
|
| 30 |
+
venv\Scripts\activate
|
| 31 |
+
|
| 32 |
+
# Linux/Mac
|
| 33 |
+
source venv/bin/activate
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
2. **Install dependencies**
|
| 37 |
+
```bash
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
3. **Start Neo4j**
|
| 42 |
+
```bash
|
| 43 |
+
docker-compose up -d
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
4. **Run application**
|
| 47 |
+
```bash
|
| 48 |
+
python run.py
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
## First Time Usage
|
| 52 |
+
|
| 53 |
+
1. Open browser to http://localhost:5000
|
| 54 |
+
2. The database will auto-initialize with sample data
|
| 55 |
+
3. Explore the dashboard tabs:
|
| 56 |
+
- **Dashboard**: Overview statistics
|
| 57 |
+
- **Neo4j Visualization**: Interactive graph
|
| 58 |
+
- **BOINC Tasks**: Distributed computing
|
| 59 |
+
- **GDC Data**: Cancer genomics data
|
| 60 |
+
- **Analysis Pipeline**: Bioinformatics tools
|
| 61 |
+
|
| 62 |
+
## GraphQL Queries
|
| 63 |
+
|
| 64 |
+
Access GraphQL playground at: http://localhost:5000/graphql
|
| 65 |
+
|
| 66 |
+
Example queries:
|
| 67 |
+
|
| 68 |
+
```graphql
|
| 69 |
+
# Get all genes
|
| 70 |
+
query {
|
| 71 |
+
genes(limit: 10) {
|
| 72 |
+
symbol
|
| 73 |
+
name
|
| 74 |
+
chromosome
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
# Get mutations for a gene
|
| 79 |
+
query {
|
| 80 |
+
mutations(gene: "TP53") {
|
| 81 |
+
chromosome
|
| 82 |
+
position
|
| 83 |
+
consequence
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
# Get patients with cancer type
|
| 88 |
+
query {
|
| 89 |
+
patients(project_id: "TCGA-BRCA") {
|
| 90 |
+
patient_id
|
| 91 |
+
age
|
| 92 |
+
gender
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## API Examples
|
| 98 |
+
|
| 99 |
+
### Submit BOINC Task
|
| 100 |
+
```bash
|
| 101 |
+
curl -X POST http://localhost:5000/api/boinc/submit \
|
| 102 |
+
-H "Content-Type: application/json" \
|
| 103 |
+
-d '{"workunit_type": "variant_calling", "input_file": "sample.fastq"}'
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Get Database Summary
|
| 107 |
+
```bash
|
| 108 |
+
curl http://localhost:5000/api/neo4j/summary
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
### Search GDC Files
|
| 112 |
+
```bash
|
| 113 |
+
curl "http://localhost:5000/api/gdc/files/TCGA-BRCA?limit=10"
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
## Troubleshooting
|
| 117 |
+
|
| 118 |
+
### Docker not starting
|
| 119 |
+
```bash
|
| 120 |
+
# Check Docker status
|
| 121 |
+
docker ps
|
| 122 |
+
|
| 123 |
+
# Restart Docker containers
|
| 124 |
+
docker-compose down
|
| 125 |
+
docker-compose up -d
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
### Neo4j connection error
|
| 129 |
+
1. Wait 30 seconds for Neo4j to fully start
|
| 130 |
+
2. Check Neo4j Browser: http://localhost:7474
|
| 131 |
+
3. Login: username=neo4j, password=cancer123
|
| 132 |
+
|
| 133 |
+
### Python module errors
|
| 134 |
+
```bash
|
| 135 |
+
# Reinstall dependencies
|
| 136 |
+
pip install --upgrade -r requirements.txt
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Configuration
|
| 140 |
+
|
| 141 |
+
Edit `config.yml` to customize:
|
| 142 |
+
- Neo4j connection
|
| 143 |
+
- GDC API settings
|
| 144 |
+
- BOINC configuration
|
| 145 |
+
- Pipeline parameters
|
| 146 |
+
|
| 147 |
+
## Data Sources
|
| 148 |
+
|
| 149 |
+
### GDC Portal Projects
|
| 150 |
+
- TCGA-BRCA: Breast Cancer
|
| 151 |
+
- TCGA-LUAD: Lung Adenocarcinoma
|
| 152 |
+
- TCGA-COAD: Colon Adenocarcinoma
|
| 153 |
+
- TCGA-GBM: Glioblastoma
|
| 154 |
+
- TARGET-AML: Acute Myeloid Leukemia
|
| 155 |
+
|
| 156 |
+
### Sample Data
|
| 157 |
+
The system includes sample data for demonstration:
|
| 158 |
+
- 7 cancer-associated genes (TP53, BRAF, BRCA1, BRCA2, etc.)
|
| 159 |
+
- 5 mutation records
|
| 160 |
+
- 5 patient cases
|
| 161 |
+
- 4 cancer types
|
| 162 |
+
|
| 163 |
+
## Development
|
| 164 |
+
|
| 165 |
+
### Run tests
|
| 166 |
+
```bash
|
| 167 |
+
pytest
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
### Format code
|
| 171 |
+
```bash
|
| 172 |
+
black backend/
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
### API Documentation
|
| 176 |
+
http://localhost:5000/docs (Swagger UI)
|
| 177 |
+
|
| 178 |
+
## Support
|
| 179 |
+
|
| 180 |
+
For issues or questions:
|
| 181 |
+
- Check logs: `logs/cancer_at_home.log`
|
| 182 |
+
- Review configuration: `config.yml`
|
| 183 |
+
- Consult README.md for detailed information
|
README.md
CHANGED
|
@@ -1,3 +1,172 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cancer@Home v2
|
| 2 |
+
|
| 3 |
+
A distributed computing platform for cancer genomics research, combining BOINC distributed computing, GDC cancer data analysis, sequence processing (FASTQ/BLAST), and Neo4j graph visualization.
|
| 4 |
+
|
| 5 |
+
## 🚀 Quick Start (5 minutes)
|
| 6 |
+
|
| 7 |
+
### Prerequisites
|
| 8 |
+
- Python 3.8+
|
| 9 |
+
- Docker Desktop
|
| 10 |
+
- 8GB RAM minimum
|
| 11 |
+
|
| 12 |
+
### Installation
|
| 13 |
+
|
| 14 |
+
1. **Clone and setup**
|
| 15 |
+
```bash
|
| 16 |
+
cd CancerAtHome2
|
| 17 |
+
python -m venv venv
|
| 18 |
+
venv\Scripts\activate # Windows (Linux/macOS: source venv/bin/activate)
|
| 19 |
+
pip install -r requirements.txt
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
2. **Start Neo4j Database**
|
| 23 |
+
```bash
|
| 24 |
+
docker-compose up -d
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
3. **Run the application**
|
| 28 |
+
```bash
|
| 29 |
+
python run.py
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
4. **Open your browser**
|
| 33 |
+
- Application: http://localhost:5000
|
| 34 |
+
- Neo4j Browser: http://localhost:7474 (username: neo4j, password: cancer123)
|
| 35 |
+
|
| 36 |
+
## 🎯 Features
|
| 37 |
+
|
| 38 |
+
### 1. **Distributed Computing (BOINC Integration)**
|
| 39 |
+
- Submit cancer research computational tasks
|
| 40 |
+
- Monitor distributed workload processing
|
| 41 |
+
- Real-time task status tracking
|
| 42 |
+
|
| 43 |
+
### 2. **GDC Data Integration**
|
| 44 |
+
- Download cancer genomics data from GDC Portal
|
| 45 |
+
- Support for various cancer types (TCGA, TARGET projects)
|
| 46 |
+
- Automatic data parsing and normalization
|
| 47 |
+
|
| 48 |
+
### 3. **Sequence Analysis Pipeline**
|
| 49 |
+
- FASTQ file processing
|
| 50 |
+
- BLAST sequence alignment
|
| 51 |
+
- Variant calling and annotation
|
| 52 |
+
|
| 53 |
+
### 4. **Neo4j Graph Database**
|
| 54 |
+
- Graph-based cancer data modeling
|
| 55 |
+
- Relationships: Gene → Mutation → Patient → Cancer Type
|
| 56 |
+
- Interactive graph visualization
|
| 57 |
+
|
| 58 |
+
### 5. **GraphQL API**
|
| 59 |
+
- Query cancer data flexibly
|
| 60 |
+
- Filter by gene, mutation, patient cohort
|
| 61 |
+
- Aggregate statistics
|
| 62 |
+
|
| 63 |
+
### 6. **Interactive Dashboard**
|
| 64 |
+
- Real-time data visualization
|
| 65 |
+
- Network graphs for gene interactions
|
| 66 |
+
- Mutation frequency charts
|
| 67 |
+
- Patient cohort analysis
|
| 68 |
+
|
| 69 |
+
## 📊 Architecture
|
| 70 |
+
|
| 71 |
+
```
|
| 72 |
+
Cancer@Home v2
|
| 73 |
+
│
|
| 74 |
+
├── Frontend (React + D3.js)
|
| 75 |
+
│ ├── Dashboard
|
| 76 |
+
│ ├── Neo4j Visualization
|
| 77 |
+
│ └── Task Monitor
|
| 78 |
+
│
|
| 79 |
+
├── Backend (FastAPI)
|
| 80 |
+
│ ├── REST API
|
| 81 |
+
│ ├── GraphQL Endpoint
|
| 82 |
+
│ └── WebSocket (real-time updates)
|
| 83 |
+
│
|
| 84 |
+
├── Data Layer
|
| 85 |
+
│ ├── Neo4j (Graph Database)
|
| 86 |
+
│ ├── BOINC Client
|
| 87 |
+
│ └── GDC API Client
|
| 88 |
+
│
|
| 89 |
+
└── Analysis Pipeline
|
| 90 |
+
├── FASTQ Parser
|
| 91 |
+
├── BLAST Wrapper
|
| 92 |
+
└── Variant Annotator
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
## 🗂️ Project Structure
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
CancerAtHome2/
|
| 99 |
+
├── backend/
|
| 100 |
+
│ ├── api/ # FastAPI routes
|
| 101 |
+
│ ├── boinc/ # BOINC integration
|
| 102 |
+
│ ├── gdc/ # GDC data fetcher
|
| 103 |
+
│ ├── neo4j/ # Neo4j database layer
|
| 104 |
+
│ ├── pipeline/ # Bioinformatics pipeline
|
| 105 |
+
│ └── graphql/ # GraphQL schema
|
| 106 |
+
├── frontend/
|
| 107 |
+
│ ├── public/
|
| 108 |
+
│ └── src/
|
| 109 |
+
│ ├── components/ # React components
|
| 110 |
+
│ ├── views/ # Page views
|
| 111 |
+
│ └── api/ # API client
|
| 112 |
+
├── data/ # Downloaded datasets
|
| 113 |
+
├── docker-compose.yml # Neo4j container
|
| 114 |
+
├── requirements.txt # Python dependencies
|
| 115 |
+
└── run.py # Main entry point
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
## 🧬 Data Flow
|
| 119 |
+
|
| 120 |
+
1. **Data Ingestion**: Download cancer genomics data from GDC Portal
|
| 121 |
+
2. **Processing**: Run FASTQ/BLAST analysis on distributed BOINC network
|
| 122 |
+
3. **Storage**: Store results in Neo4j graph database
|
| 123 |
+
4. **Visualization**: Query and visualize via web dashboard
|
| 124 |
+
|
| 125 |
+
## 🔧 Configuration
|
| 126 |
+
|
| 127 |
+
Edit `config.yml` to customize:
|
| 128 |
+
- Neo4j connection settings
|
| 129 |
+
- GDC API parameters
|
| 130 |
+
- BOINC project URL
|
| 131 |
+
- Analysis pipeline options
|
| 132 |
+
|
| 133 |
+
## 📖 Usage Examples
|
| 134 |
+
|
| 135 |
+
### Query Mutations by Gene
|
| 136 |
+
```graphql
|
| 137 |
+
query {
|
| 138 |
+
mutations(gene: "TP53") {
|
| 139 |
+
id
|
| 140 |
+
position
|
| 141 |
+
consequence
|
| 142 |
+
patients {
|
| 143 |
+
cancerType
|
| 144 |
+
stage
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
}
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
### Submit Analysis Task
|
| 151 |
+
```python
|
| 152 |
+
from backend.boinc import BOINCClient
|
| 153 |
+
|
| 154 |
+
client = BOINCClient()
|
| 155 |
+
task_id = client.submit_task(
|
| 156 |
+
workunit_type="variant_calling",
|
| 157 |
+
input_file="sample.fastq"
|
| 158 |
+
)
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
## 🤝 Inspired By
|
| 162 |
+
|
| 163 |
+
- [Cancer@Home v1](https://www.herox.com/DCx/round/516/entry/23285) - Distributed cancer research
|
| 164 |
+
- [Neo4j Cancer Visualization](https://medium.com/neo4j/visualize-cancer-1c80a95f5bb4) - Graph-based cancer data modeling
|
| 165 |
+
|
| 166 |
+
## 📄 License
|
| 167 |
+
|
| 168 |
+
MIT License
|
| 169 |
+
|
| 170 |
+
## 🛟 Support
|
| 171 |
+
|
| 172 |
+
For issues or questions, please open a GitHub issue.
|
USER_GUIDE.md
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cancer@Home v2 - User Guide
|
| 2 |
+
|
| 3 |
+
## Table of Contents
|
| 4 |
+
1. [Introduction](#introduction)
|
| 5 |
+
2. [System Architecture](#system-architecture)
|
| 6 |
+
3. [Getting Started](#getting-started)
|
| 7 |
+
4. [Dashboard Guide](#dashboard-guide)
|
| 8 |
+
5. [Working with Data](#working-with-data)
|
| 9 |
+
6. [Analysis Pipeline](#analysis-pipeline)
|
| 10 |
+
7. [Advanced Usage](#advanced-usage)
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## Introduction
|
| 15 |
+
|
| 16 |
+
Cancer@Home v2 is a distributed computing platform for cancer genomics research that combines:
|
| 17 |
+
- **BOINC**: Distributed computing for computationally intensive tasks
|
| 18 |
+
- **GDC Portal**: Access to comprehensive cancer genomics datasets
|
| 19 |
+
- **Neo4j**: Graph database for modeling complex relationships
|
| 20 |
+
- **Bioinformatics Pipeline**: FASTQ processing, BLAST alignment, and variant calling
|
| 21 |
+
|
| 22 |
+
### Key Features
|
| 23 |
+
✓ Interactive web dashboard
|
| 24 |
+
✓ Real-time graph visualization
|
| 25 |
+
✓ GraphQL API for flexible data queries
|
| 26 |
+
✓ Distributed task processing
|
| 27 |
+
✓ Cancer genomics data integration
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
## System Architecture
|
| 32 |
+
|
| 33 |
+
```
|
| 34 |
+
┌─────────────────────────────────────────────────┐
|
| 35 |
+
│ Web Dashboard (Port 5000) │
|
| 36 |
+
│ Dashboard | Neo4j Viz | BOINC | GDC | Pipeline│
|
| 37 |
+
└────────────────────┬────────────────────────────┘
|
| 38 |
+
│
|
| 39 |
+
┌────────────────────┴────────────────────────────┐
|
| 40 |
+
│ FastAPI Backend (REST + GraphQL) │
|
| 41 |
+
└─────┬──────┬──────┬──────┬──────┬──────────────┘
|
| 42 |
+
│ │ │ │ │
|
| 43 |
+
┌──┴─┐ ┌──┴─┐ ┌──┴─┐ ┌──┴─┐ ┌──┴──────┐
|
| 44 |
+
│Neo4j│ │BOINC│ │GDC │ │FASTQ│ │BLAST/VCF│
|
| 45 |
+
│7687 │ │Client│ │API │ │Proc │ │ Caller │
|
| 46 |
+
└─────┘ └─────┘ └────┘ └─────┘ └─────────┘
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Getting Started
|
| 52 |
+
|
| 53 |
+
### Quick Installation (5 minutes)
|
| 54 |
+
|
| 55 |
+
**Windows:**
|
| 56 |
+
```powershell
|
| 57 |
+
.\setup.ps1
|
| 58 |
+
python run.py
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
**Linux/Mac:**
|
| 62 |
+
```bash
|
| 63 |
+
./setup.sh
|
| 64 |
+
python run.py
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### Access Points
|
| 68 |
+
- **Main Application**: http://localhost:5000
|
| 69 |
+
- **API Documentation**: http://localhost:5000/docs
|
| 70 |
+
- **GraphQL Playground**: http://localhost:5000/graphql
|
| 71 |
+
- **Neo4j Browser**: http://localhost:7474 (neo4j/cancer123)
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## Dashboard Guide
|
| 76 |
+
|
| 77 |
+
### 1. Overview Tab
|
| 78 |
+
Shows key statistics:
|
| 79 |
+
- Total genes in database
|
| 80 |
+
- Total mutations identified
|
| 81 |
+
- Number of patients
|
| 82 |
+
- Cancer types catalogued
|
| 83 |
+
|
| 84 |
+
**Chart**: Mutation distribution across cancer types
|
| 85 |
+
|
| 86 |
+
### 2. Neo4j Visualization Tab
|
| 87 |
+
Interactive graph showing:
|
| 88 |
+
- **Blue nodes**: Genes (TP53, BRCA1, KRAS, etc.)
|
| 89 |
+
- **Purple nodes**: Patients
|
| 90 |
+
- **Pink nodes**: Cancer types
|
| 91 |
+
- **Lines**: Relationships between entities
|
| 92 |
+
|
| 93 |
+
**Navigation**:
|
| 94 |
+
- Click and drag nodes to rearrange
|
| 95 |
+
- Hover over nodes for details
|
| 96 |
+
- Zoom in/out with mouse wheel
|
| 97 |
+
|
| 98 |
+
### 3. BOINC Tasks Tab
|
| 99 |
+
Manage distributed computing workloads:
|
| 100 |
+
|
| 101 |
+
**Submit Task**:
|
| 102 |
+
1. Select task type (Variant Calling, BLAST, Alignment)
|
| 103 |
+
2. Enter input file path
|
| 104 |
+
3. Click "Submit Task"
|
| 105 |
+
|
| 106 |
+
**Monitor Tasks**:
|
| 107 |
+
- View all tasks with status (Pending, Running, Completed)
|
| 108 |
+
- See task creation time and type
|
| 109 |
+
- Check overall statistics
|
| 110 |
+
|
| 111 |
+
### 4. GDC Data Tab
|
| 112 |
+
Browse available cancer projects:
|
| 113 |
+
- TCGA-BRCA: Breast Cancer (1,098 cases)
|
| 114 |
+
- TCGA-LUAD: Lung Adenocarcinoma (585 cases)
|
| 115 |
+
- TCGA-COAD: Colon Adenocarcinoma (461 cases)
|
| 116 |
+
- TCGA-GBM: Glioblastoma (617 cases)
|
| 117 |
+
- TARGET-AML: Acute Myeloid Leukemia (238 cases)
|
| 118 |
+
|
| 119 |
+
Click on a project to explore available datasets.
|
| 120 |
+
|
| 121 |
+
### 5. Pipeline Tab
|
| 122 |
+
Quick access to bioinformatics tools:
|
| 123 |
+
- **FASTQ QC**: Quality control for sequencing data
|
| 124 |
+
- **BLAST Search**: Sequence alignment and homology
|
| 125 |
+
- **Variant Calling**: Identify genetic variants
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## Working with Data
|
| 130 |
+
|
| 131 |
+
### Querying with GraphQL
|
| 132 |
+
|
| 133 |
+
Access the GraphQL playground at http://localhost:5000/graphql
|
| 134 |
+
|
| 135 |
+
**Example 1: Find mutations in TP53 gene**
|
| 136 |
+
```graphql
|
| 137 |
+
query {
|
| 138 |
+
mutations(gene: "TP53") {
|
| 139 |
+
mutation_id
|
| 140 |
+
chromosome
|
| 141 |
+
position
|
| 142 |
+
consequence
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
**Example 2: Get patient information**
|
| 148 |
+
```graphql
|
| 149 |
+
query {
|
| 150 |
+
patients(project_id: "TCGA-BRCA", limit: 10) {
|
| 151 |
+
patient_id
|
| 152 |
+
age
|
| 153 |
+
gender
|
| 154 |
+
vital_status
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
**Example 3: Cancer statistics**
|
| 160 |
+
```graphql
|
| 161 |
+
query {
|
| 162 |
+
cancerStatistics(cancer_type_id: "BRCA") {
|
| 163 |
+
total_patients
|
| 164 |
+
total_mutations
|
| 165 |
+
avg_mutations_per_patient
|
| 166 |
+
}
|
| 167 |
+
}
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
### Using the REST API
|
| 171 |
+
|
| 172 |
+
**Get database summary:**
|
| 173 |
+
```bash
|
| 174 |
+
curl http://localhost:5000/api/neo4j/summary
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
**Search GDC files:**
|
| 178 |
+
```bash
|
| 179 |
+
curl "http://localhost:5000/api/gdc/files/TCGA-BRCA?limit=10"
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
**Submit BOINC task:**
|
| 183 |
+
```bash
|
| 184 |
+
curl -X POST -G http://localhost:5000/api/boinc/submit \
|
| 185 |
+
--data-urlencode "workunit_type=variant_calling" \
|
| 186 |
+
--data-urlencode "input_file=data/sample.fastq"
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
---
|
| 190 |
+
|
| 191 |
+
## Analysis Pipeline
|
| 192 |
+
|
| 193 |
+
### 1. FASTQ Processing
|
| 194 |
+
|
| 195 |
+
**Quality Control:**
|
| 196 |
+
```python
|
| 197 |
+
from backend.pipeline import FASTQProcessor
|
| 198 |
+
|
| 199 |
+
processor = FASTQProcessor()
|
| 200 |
+
stats = processor.calculate_statistics("input.fastq")
|
| 201 |
+
print(f"Total reads: {stats['total_reads']}")
|
| 202 |
+
print(f"Average quality: {stats['avg_quality']}")
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
**Filter by quality:**
|
| 206 |
+
```python
|
| 207 |
+
filtered = processor.quality_filter("input.fastq", "filtered.fastq")
|
| 208 |
+
print(f"Pass rate: {filtered['pass_rate']:.2%}")
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### 2. BLAST Alignment
|
| 212 |
+
|
| 213 |
+
**Run BLAST search:**
|
| 214 |
+
```python
|
| 215 |
+
from backend.pipeline import BLASTRunner
|
| 216 |
+
|
| 217 |
+
blast = BLASTRunner()
|
| 218 |
+
results = blast.run_blastn("query.fasta")
|
| 219 |
+
hits = blast.parse_results(results)
|
| 220 |
+
|
| 221 |
+
print(f"Found {len(hits)} alignments")
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
**Filter high-quality hits:**
|
| 225 |
+
```python
|
| 226 |
+
filtered_hits = blast.filter_hits(hits, min_identity=0.95)
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
### 3. Variant Calling
|
| 230 |
+
|
| 231 |
+
**Identify variants:**
|
| 232 |
+
```python
|
| 233 |
+
from backend.pipeline import VariantCaller
|
| 234 |
+
|
| 235 |
+
caller = VariantCaller()
|
| 236 |
+
vcf_file = caller.call_variants("alignment.bam", "reference.fa")
|
| 237 |
+
variants = caller.filter_variants(vcf_file, min_quality=30)
|
| 238 |
+
|
| 239 |
+
print(f"Identified {len(variants)} high-quality variants")
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
**Find cancer-associated variants:**
|
| 243 |
+
```python
|
| 244 |
+
from backend.pipeline import VariantAnalyzer
|
| 245 |
+
|
| 246 |
+
analyzer = VariantAnalyzer()
|
| 247 |
+
cancer_variants = analyzer.identify_cancer_variants(variants)
|
| 248 |
+
tmb = analyzer.calculate_mutation_burden(variants)
|
| 249 |
+
|
| 250 |
+
print(f"Cancer variants: {len(cancer_variants)}")
|
| 251 |
+
print(f"Tumor Mutation Burden: {tmb:.2f} mutations/Mb")
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
## Advanced Usage
|
| 257 |
+
|
| 258 |
+
### Custom Neo4j Queries
|
| 259 |
+
|
| 260 |
+
**Direct Cypher queries:**
|
| 261 |
+
```python
|
| 262 |
+
from backend.neo4j import DatabaseManager
|
| 263 |
+
|
| 264 |
+
db = DatabaseManager()
|
| 265 |
+
|
| 266 |
+
# Find patients with TP53 mutations
|
| 267 |
+
query = """
|
| 268 |
+
MATCH (p:Patient)-[:HAS_MUTATION]->(m:Mutation)-[:AFFECTS]->(g:Gene {symbol: 'TP53'})
|
| 269 |
+
RETURN p.patient_id, m.position, m.consequence
|
| 270 |
+
"""
|
| 271 |
+
|
| 272 |
+
results = db.execute_query(query)
|
| 273 |
+
for result in results:
|
| 274 |
+
print(result)
|
| 275 |
+
|
| 276 |
+
db.close()
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
### Batch Data Import
|
| 280 |
+
|
| 281 |
+
**Import GDC data:**
|
| 282 |
+
```python
|
| 283 |
+
from backend.gdc import GDCClient
|
| 284 |
+
from backend.neo4j import DataImporter
|
| 285 |
+
|
| 286 |
+
# Download mutation data
|
| 287 |
+
gdc = GDCClient()
|
| 288 |
+
files = gdc.get_mutation_data("TCGA-BRCA", limit=10)
|
| 289 |
+
|
| 290 |
+
for file in files:
|
| 291 |
+
gdc.download_file(file.file_id)
|
| 292 |
+
|
| 293 |
+
# Import to Neo4j
|
| 294 |
+
importer = DataImporter()
|
| 295 |
+
importer.import_gdc_data(files)
|
| 296 |
+
```
|
| 297 |
+
|
| 298 |
+
### Custom BOINC Tasks
|
| 299 |
+
|
| 300 |
+
**Submit custom analysis:**
|
| 301 |
+
```python
|
| 302 |
+
from backend.boinc import BOINCClient
|
| 303 |
+
|
| 304 |
+
client = BOINCClient()
|
| 305 |
+
|
| 306 |
+
# Submit multiple tasks
|
| 307 |
+
input_files = ["sample1.fastq", "sample2.fastq", "sample3.fastq"]
|
| 308 |
+
task_ids = []
|
| 309 |
+
|
| 310 |
+
for file in input_files:
|
| 311 |
+
task_id = client.submit_task("variant_calling", file)
|
| 312 |
+
task_ids.append(task_id)
|
| 313 |
+
|
| 314 |
+
# Monitor progress
|
| 315 |
+
for task_id in task_ids:
|
| 316 |
+
status = client.get_task_status(task_id)
|
| 317 |
+
print(f"Task {task_id}: {status.status}")
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
### Configuration Customization
|
| 321 |
+
|
| 322 |
+
Edit `config.yml`:
|
| 323 |
+
|
| 324 |
+
```yaml
|
| 325 |
+
neo4j:
|
| 326 |
+
uri: "bolt://localhost:7687"
|
| 327 |
+
password: "your_password"
|
| 328 |
+
|
| 329 |
+
gdc:
|
| 330 |
+
download_dir: "./data/gdc"
|
| 331 |
+
max_retries: 3
|
| 332 |
+
|
| 333 |
+
pipeline:
|
| 334 |
+
fastq:
|
| 335 |
+
quality_threshold: 25 # Increase quality threshold
|
| 336 |
+
min_length: 75 # Increase minimum read length
|
| 337 |
+
|
| 338 |
+
blast:
|
| 339 |
+
evalue: 0.0001 # More stringent e-value
|
| 340 |
+
num_threads: 8 # Use more CPU cores
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## Troubleshooting
|
| 346 |
+
|
| 347 |
+
### Neo4j Connection Issues
|
| 348 |
+
```bash
|
| 349 |
+
# Check Neo4j status
|
| 350 |
+
docker ps | grep neo4j
|
| 351 |
+
|
| 352 |
+
# Restart Neo4j
|
| 353 |
+
docker-compose restart neo4j
|
| 354 |
+
|
| 355 |
+
# View Neo4j logs
|
| 356 |
+
docker-compose logs neo4j
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
### Memory Issues
|
| 360 |
+
Increase Docker memory allocation:
|
| 361 |
+
1. Open Docker Desktop Settings
|
| 362 |
+
2. Resources → Memory
|
| 363 |
+
3. Increase to at least 8GB
|
| 364 |
+
4. Click "Apply & Restart"
|
| 365 |
+
|
| 366 |
+
### API Errors
|
| 367 |
+
Check logs:
|
| 368 |
+
```bash
|
| 369 |
+
# View application logs
|
| 370 |
+
cat logs/cancer_at_home.log
|
| 371 |
+
|
| 372 |
+
# Follow logs in real-time
|
| 373 |
+
tail -f logs/cancer_at_home.log
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
---
|
| 377 |
+
|
| 378 |
+
## Best Practices
|
| 379 |
+
|
| 380 |
+
1. **Data Management**: Regularly clean up downloaded data to free space
|
| 381 |
+
2. **Task Monitoring**: Check BOINC tasks periodically for failures
|
| 382 |
+
3. **Database Backup**: Backup Neo4j data volume regularly
|
| 383 |
+
4. **Resource Limits**: Monitor system resources when running large analyses
|
| 384 |
+
5. **API Rate Limits**: Be mindful of GDC API rate limits for bulk downloads
|
| 385 |
+
|
| 386 |
+
---
|
| 387 |
+
|
| 388 |
+
## Support & Resources
|
| 389 |
+
|
| 390 |
+
- **Documentation**: See README.md and QUICKSTART.md
|
| 391 |
+
- **API Reference**: http://localhost:5000/docs
|
| 392 |
+
- **GraphQL Examples**: See GRAPHQL_EXAMPLES.md
|
| 393 |
+
- **Logs**: Check `logs/cancer_at_home.log`
|
| 394 |
+
|
| 395 |
+
### Useful Cypher Queries
|
| 396 |
+
|
| 397 |
+
**Most common mutations:**
|
| 398 |
+
```cypher
|
| 399 |
+
MATCH (m:Mutation)<-[:HAS_MUTATION]-(p:Patient)
|
| 400 |
+
WITH m, count(p) as patient_count
|
| 401 |
+
RETURN m.mutation_id, patient_count
|
| 402 |
+
ORDER BY patient_count DESC
|
| 403 |
+
LIMIT 10
|
| 404 |
+
```
|
| 405 |
+
|
| 406 |
+
**Genes with most mutations:**
|
| 407 |
+
```cypher
|
| 408 |
+
MATCH (g:Gene)<-[:AFFECTS]-(m:Mutation)
|
| 409 |
+
WITH g, count(m) as mutation_count
|
| 410 |
+
RETURN g.symbol, mutation_count
|
| 411 |
+
ORDER BY mutation_count DESC
|
| 412 |
+
LIMIT 10
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
**Patient mutation profile:**
|
| 416 |
+
```cypher
|
| 417 |
+
MATCH (p:Patient {patient_id: 'TCGA-A1-001'})-[:HAS_MUTATION]->(m:Mutation)-[:AFFECTS]->(g:Gene)
|
| 418 |
+
RETURN g.symbol, m.consequence, m.position
|
| 419 |
+
```
|
backend/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Backend Module
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
__version__ = "2.0.0"
|
backend/api/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Backend API Module
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .main import app
|
| 6 |
+
|
| 7 |
+
__all__ = ['app']
|
backend/api/main.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI Main Application
|
| 3 |
+
Backend API for Cancer@Home
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, HTTPException
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from fastapi.staticfiles import StaticFiles
|
| 9 |
+
from fastapi.responses import HTMLResponse
|
| 10 |
+
from strawberry.fastapi import GraphQLRouter
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import uvicorn
|
| 13 |
+
|
| 14 |
+
from backend.neo4j.graphql_schema import schema
|
| 15 |
+
from backend.neo4j.db_manager import DatabaseManager
|
| 16 |
+
from backend.boinc.client import BOINCClient, BOINCTaskManager
|
| 17 |
+
from backend.gdc.client import GDCClient
|
| 18 |
+
from backend.pipeline import (
|
| 19 |
+
FASTQProcessor,
|
| 20 |
+
BLASTRunner,
|
| 21 |
+
VariantCaller
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# Initialize FastAPI
# No docs_url/redoc_url is passed here, so the interactive docs stay on
# FastAPI's default paths (Swagger UI at /docs).
app = FastAPI(
    title="Cancer@Home v2",
    description="Distributed cancer genomics research platform",
    version="2.0.0"
)

# CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is the
# most permissive setup possible; presumably meant for local development
# only -- confirm before exposing this service beyond localhost.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# GraphQL endpoint
# The schema imported from backend.neo4j.graphql_schema is exposed under /graphql.
graphql_app = GraphQLRouter(schema)
app.include_router(graphql_app, prefix="/graphql")

# Serve frontend static files
# The path is relative, so it is resolved against the process working
# directory; the mount is skipped when the directory does not exist.
frontend_path = Path("frontend/dist")
if frontend_path.exists():
    app.mount("/static", StaticFiles(directory=str(frontend_path)), name="static")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the main dashboard page.

    Returns the contents of ``frontend/index.html`` (resolved relative to
    the current working directory) when that file exists; otherwise returns
    a small built-in landing page that links to the API docs, the GraphQL
    endpoint, the Neo4j browser and the health check.
    """
    html_file = Path("frontend/index.html")
    if html_file.exists():
        # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
        # Windows) cannot decode non-ASCII characters such as emoji.
        return html_file.read_text(encoding="utf-8")

    # Fallback HTML
    # The documentation link targets /docs because the FastAPI app is
    # created without a custom docs_url, so Swagger UI lives at the default
    # path (the previous /api/docs link pointed at a non-existent route).
    return """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Cancer@Home v2</title>
        <style>
            body {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                margin: 0;
                padding: 0;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
            }
            .container {
                max-width: 1200px;
                margin: 0 auto;
                padding: 40px 20px;
            }
            h1 {
                font-size: 3em;
                margin-bottom: 20px;
            }
            .card {
                background: rgba(255, 255, 255, 0.1);
                border-radius: 10px;
                padding: 30px;
                margin: 20px 0;
                backdrop-filter: blur(10px);
            }
            .links {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                gap: 20px;
                margin-top: 30px;
            }
            .link-card {
                background: rgba(255, 255, 255, 0.15);
                border-radius: 8px;
                padding: 20px;
                text-decoration: none;
                color: white;
                transition: transform 0.2s;
            }
            .link-card:hover {
                transform: translateY(-5px);
                background: rgba(255, 255, 255, 0.25);
            }
            .link-card h3 {
                margin-top: 0;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <h1>🧬 Cancer@Home v2</h1>
            <div class="card">
                <h2>Welcome to Cancer@Home</h2>
                <p>A distributed computing platform for cancer genomics research</p>
            </div>

            <div class="links">
                <a href="/docs" class="link-card">
                    <h3>📚 API Documentation</h3>
                    <p>Interactive API docs with Swagger UI</p>
                </a>
                <a href="/graphql" class="link-card">
                    <h3>🔍 GraphQL Playground</h3>
                    <p>Query cancer data with GraphQL</p>
                </a>
                <a href="http://localhost:7474" class="link-card">
                    <h3>📊 Neo4j Browser</h3>
                    <p>Visualize graph database</p>
                </a>
                <a href="/api/health" class="link-card">
                    <h3>💚 Health Check</h3>
                    <p>Check system status</p>
                </a>
            </div>
        </div>
    </body>
    </html>
    """
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@app.get("/api/health")
async def health_check():
    """Report API liveness and Neo4j connectivity.

    Runs a trivial ``RETURN 1`` query against Neo4j. Any failure, including
    a failure while constructing the ``DatabaseManager``, is reported in the
    ``neo4j`` field instead of surfacing as an HTTP 500.

    Returns:
        dict with ``status`` (the API process itself, always ``"healthy"``
        when this handler runs), ``neo4j`` (``"healthy"`` or
        ``"unhealthy: <error>"``) and ``version``.
    """
    db = None
    try:
        # Construct inside the try block so that a connection/configuration
        # error raised by the constructor is reported as "unhealthy" too.
        db = DatabaseManager()
        db.execute_query("RETURN 1")
        neo4j_status = "healthy"
    except Exception as e:
        neo4j_status = f"unhealthy: {str(e)}"
    finally:
        # Only close what was actually created.
        if db is not None:
            db.close()

    return {
        "status": "healthy",
        "neo4j": neo4j_status,
        "version": "2.0.0"
    }
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# BOINC API Endpoints
|
| 163 |
+
@app.get("/api/boinc/tasks")
async def get_boinc_tasks(status: str = None):
    """List BOINC tasks, optionally restricted to one status.

    Args:
        status: Passed through to ``BOINCClient.list_tasks``; ``None``
            (the default) applies no status value.

    Returns:
        dict with a ``tasks`` list holding the attribute dict of each task.
    """
    found = BOINCClient().list_tasks(status=status)
    serialized = []
    for task in found:
        # vars() exposes the task object's attributes as a plain dict.
        serialized.append(vars(task))
    return {"tasks": serialized}
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@app.post("/api/boinc/submit")
async def submit_boinc_task(workunit_type: str, input_file: str):
    """Submit a new BOINC task.

    Args:
        workunit_type: Kind of work unit. ``"variant_calling"`` and
            ``"blast_search"`` use the task manager's dedicated submit
            methods; any other value is handed to the underlying client.
        input_file: Input file forwarded to the submit call.

    Returns:
        dict with the new ``task_id`` and ``status`` set to ``"submitted"``.
    """
    manager = BOINCTaskManager()

    # Work-unit types that have a dedicated submit method on the manager.
    dedicated = {
        "variant_calling": "submit_variant_calling",
        "blast_search": "submit_blast_search",
    }
    method_name = dedicated.get(workunit_type)

    if method_name is not None:
        task_id = getattr(manager, method_name)(input_file)
    else:
        # Generic path: submit through the wrapped client directly.
        task_id = manager.client.submit_task(workunit_type, input_file)

    return {"task_id": task_id, "status": "submitted"}
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@app.get("/api/boinc/statistics")
async def get_boinc_statistics():
    """Return whatever ``BOINCClient.get_statistics`` reports."""
    return BOINCClient().get_statistics()
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# GDC API Endpoints
|
| 195 |
+
@app.get("/api/gdc/projects")
async def get_gdc_projects():
    """Return the hard-coded catalogue of GDC projects.

    Returns:
        dict with a ``projects`` list; each entry has ``id``, ``name`` and
        ``cases`` keys.
    """
    # (project id, display name, case count) -- static values, not fetched
    # from the GDC API.
    catalogue = (
        ("TCGA-BRCA", "Breast Cancer", 1098),
        ("TCGA-LUAD", "Lung Adenocarcinoma", 585),
        ("TCGA-COAD", "Colon Adenocarcinoma", 461),
        ("TCGA-GBM", "Glioblastoma", 617),
        ("TARGET-AML", "Acute Myeloid Leukemia", 238),
    )
    return {
        "projects": [
            {"id": project_id, "name": name, "cases": cases}
            for project_id, name, cases in catalogue
        ]
    }
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
@app.get("/api/gdc/files/{project_id}")
async def search_gdc_files(project_id: str, limit: int = 10):
    """List GDC files that belong to a project.

    Args:
        project_id: GDC project identifier taken from the URL path.
        limit: Maximum number of files to request (default 10).

    Returns:
        ``{"files": [...]}`` where each entry is the attribute dict of one
        file record returned by ``GDCClient.get_project_files``.
    """
    gdc = GDCClient()
    matches = gdc.get_project_files(project_id, limit=limit)
    payload = []
    for record in matches:
        payload.append(vars(record))
    return {"files": payload}
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
@app.post("/api/gdc/download")
async def download_gdc_file(file_id: str):
    """Download one file from GDC and report where it was stored.

    Args:
        file_id: Identifier of the GDC file to fetch.

    Returns:
        ``{"status": "success", "file_path": <path>}`` on success.

    Raises:
        HTTPException: 500 when the client returns no path for the download.
    """
    saved_to = GDCClient().download_file(file_id)

    # A falsy result is the client's failure signal.
    if not saved_to:
        raise HTTPException(status_code=500, detail="Download failed")

    return {"status": "success", "file_path": str(saved_to)}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
# Pipeline API Endpoints
|
| 229 |
+
@app.post("/api/pipeline/fastq/qc")
async def run_fastq_qc(file_path: str):
    """Compute quality-control statistics for a FASTQ file.

    Args:
        file_path: Location of the FASTQ file to analyse.

    Returns:
        ``{"statistics": ...}`` wrapping whatever
        ``FASTQProcessor.calculate_statistics`` returns.
    """
    fastq_path = Path(file_path)
    return {"statistics": FASTQProcessor().calculate_statistics(fastq_path)}
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
@app.post("/api/pipeline/blast")
async def run_blast(query_file: str):
    """Run a BLASTN search for a query file and summarise the hits.

    Args:
        query_file: Path of the query sequence file.

    Returns:
        A dict with ``status``, the ``output_file`` path, ``total_hits`` and
        the first ten entries of the parsed hit list.

    Raises:
        HTTPException: 500 when the runner produces no output file.
    """
    blast = BLASTRunner()
    result_path = blast.run_blastn(Path(query_file))

    # A falsy result is the runner's failure signal.
    if not result_path:
        raise HTTPException(status_code=500, detail="BLAST search failed")

    all_hits = blast.parse_results(result_path)
    preview = all_hits[:10]  # Return first 10 hits
    return {
        "status": "success",
        "output_file": str(result_path),
        "total_hits": len(all_hits),
        "hits": preview
    }
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
@app.post("/api/pipeline/variants")
async def call_variants(alignment_file: str, reference_genome: str):
    """Call variants from an alignment and return the filtered result.

    Args:
        alignment_file: Path of the alignment file to analyse.
        reference_genome: Path of the reference genome to call against.

    Returns:
        A dict with ``status``, the ``vcf_file`` path, ``total_variants``
        and the attribute dict of every variant kept by
        ``VariantCaller.filter_variants``.
    """
    variant_caller = VariantCaller()

    alignment_path = Path(alignment_file)
    reference_path = Path(reference_genome)
    vcf_path = variant_caller.call_variants(alignment_path, reference_path)

    kept = variant_caller.filter_variants(vcf_path)
    serialized = []
    for variant in kept:
        serialized.append(vars(variant))

    return {
        "status": "success",
        "vcf_file": str(vcf_path),
        "total_variants": len(kept),
        "variants": serialized
    }
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# Neo4j Query Endpoints
|
| 275 |
+
@app.get("/api/neo4j/summary")
async def get_database_summary():
    """Return node counts for the main labels in the graph database.

    Returns:
        The first result row of the summary query, with the columns
        ``genes``, ``mutations``, ``patients`` and ``cancer_types``, or an
        empty dict when the query yields no rows.
    """
    # The later clauses use OPTIONAL MATCH on purpose: a plain MATCH on a
    # label with no nodes produces zero rows, and an aggregation that has
    # grouping keys (the counts carried along in WITH) over zero rows yields
    # no row at all -- the whole summary would come back empty as soon as
    # one label had no nodes. OPTIONAL MATCH keeps the row alive and
    # count() ignores the resulting null, giving 0.
    query = """
    MATCH (g:Gene) WITH count(g) as genes
    OPTIONAL MATCH (m:Mutation) WITH genes, count(m) as mutations
    OPTIONAL MATCH (p:Patient) WITH genes, mutations, count(p) as patients
    OPTIONAL MATCH (c:CancerType) WITH genes, mutations, patients, count(c) as cancer_types
    RETURN genes, mutations, patients, cancer_types
    """

    db = DatabaseManager()
    try:
        result = db.execute_query(query)
    finally:
        # Release the connection even when the query raises.
        db.close()

    return result[0] if result else {}
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
@app.get("/api/neo4j/genes/{symbol}")
async def get_gene_info(symbol: str):
    """Look up a gene by symbol and return it with its mutations.

    Args:
        symbol: Gene symbol taken from the URL path.

    Returns:
        A dict with the ``gene`` record, its ``mutations`` and
        ``mutation_count``.

    Raises:
        HTTPException: 404 when no gene matches ``symbol``.
    """
    # Kept as a function-local import, as in the original code.
    from backend.neo4j.db_manager import GeneRepository

    db = DatabaseManager()
    try:
        repo = GeneRepository(db)
        gene = repo.get_gene_by_symbol(symbol)

        if not gene:
            raise HTTPException(status_code=404, detail="Gene not found")

        mutations = repo.get_gene_mutations(gene['gene_id'])
    finally:
        # Single exit point for the connection: runs on success, on the 404
        # path and when a repository call raises.
        db.close()

    return {
        "gene": gene,
        "mutations": mutations,
        "mutation_count": len(mutations)
    }
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, port 5000.
    uvicorn.run(app, port=5000, host="0.0.0.0")
|
backend/boinc/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BOINC Module
|
| 3 |
+
Distributed computing integration for Cancer@Home
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .client import BOINCClient, BOINCTaskManager, WorkUnit
|
| 7 |
+
|
| 8 |
+
__all__ = ['BOINCClient', 'BOINCTaskManager', 'WorkUnit']
|
backend/boinc/client.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BOINC Client Integration
|
| 3 |
+
Handles distributed computing task submission and monitoring
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import time
|
| 9 |
+
import requests
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from dataclasses import dataclass, asdict
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import yaml
|
| 15 |
+
|
| 16 |
+
@dataclass
class WorkUnit:
    """A single BOINC work unit as tracked and persisted by the client.

    The first six fields are required; the remaining three default to
    ``None`` and are filled in as the work unit progresses.
    """

    # Identifier, display name and kind of analysis.
    id: str
    name: str
    workunit_type: str

    # Path of the input data file, as given at submission.
    input_file: str

    # Current state and creation timestamp (stored as a string).
    status: str
    created_at: str

    # Optional outcome details.
    completed_at: Optional[str] = None
    result_file: Optional[str] = None
    error: Optional[str] = None
|
| 28 |
+
|
| 29 |
+
class BOINCClient:
    """BOINC client for distributed computing integration.

    Work units are kept in memory in ``self.tasks`` and persisted as JSON in
    ``<work_dir>/tasks.json``. Submission is simulated locally (see
    ``_simulate_submission``); this class sends no request to
    ``project_url``.
    """

    def __init__(self, config_path: str = "config.yml"):
        """Load the ``boinc`` section of the YAML config and existing tasks.

        Args:
            config_path: Path of the YAML configuration file. Its ``boinc``
                section must provide ``project_url`` and ``work_dir``.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)['boinc']

        self.project_url = self.config['project_url']
        self.work_dir = Path(self.config['work_dir'])
        self.work_dir.mkdir(parents=True, exist_ok=True)

        self.tasks_file = self.work_dir / "tasks.json"
        self.tasks = self._load_tasks()

    def _load_tasks(self) -> Dict[str, WorkUnit]:
        """Load existing tasks from disk, or return an empty mapping."""
        if self.tasks_file.exists():
            with open(self.tasks_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return {k: WorkUnit(**v) for k, v in data.items()}
        return {}

    def _save_tasks(self):
        """Persist all tasks to ``tasks.json``.

        The data is serialized before any file is touched and written to a
        temporary sibling file that then replaces the real one, so a failure
        part-way through cannot leave a truncated ``tasks.json`` behind.
        """
        data = {k: asdict(v) for k, v in self.tasks.items()}
        tmp_file = self.tasks_file.with_name(self.tasks_file.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_file, self.tasks_file)

    def submit_task(
        self,
        workunit_type: str,
        input_file: str,
        name: Optional[str] = None
    ) -> str:
        """
        Submit a new work unit to BOINC

        Args:
            workunit_type: Type of analysis (variant_calling, blast_search, etc.)
            input_file: Path to input data file
            name: Optional custom name for the work unit

        Returns:
            Work unit ID
        """
        # IDs are millisecond timestamps. Two submissions within the same
        # millisecond (e.g. from batch_submit) would otherwise share an ID
        # and the second would silently overwrite the first, so bump the
        # stamp until the ID is unused by this client.
        stamp = int(time.time() * 1000)
        task_id = f"wu_{stamp}"
        while task_id in self.tasks:
            stamp += 1
            task_id = f"wu_{stamp}"

        if name is None:
            name = f"{workunit_type}_{task_id}"

        # Create work unit
        work_unit = WorkUnit(
            id=task_id,
            name=name,
            workunit_type=workunit_type,
            input_file=input_file,
            status="pending",
            created_at=datetime.now().isoformat()
        )

        # In a real implementation, this would submit to actual BOINC server
        # For now, we simulate the submission
        self._simulate_submission(work_unit)

        self.tasks[task_id] = work_unit
        self._save_tasks()

        return task_id

    def _simulate_submission(self, work_unit: WorkUnit):
        """
        Simulate BOINC submission (for development/demo purposes)
        In production, replace with actual BOINC API calls

        Creates ``<work_dir>/<task id>/``, copies the input file into it when
        that file exists, and writes a ``metadata.json`` describing the task.
        """
        # Create a work directory for this task
        task_dir = self.work_dir / work_unit.id
        task_dir.mkdir(exist_ok=True)

        # Copy input file (a missing input is tolerated, not an error)
        input_path = Path(work_unit.input_file)
        if input_path.exists():
            import shutil
            shutil.copy(input_path, task_dir / input_path.name)

        # Create task metadata
        metadata = {
            "task_id": work_unit.id,
            "type": work_unit.workunit_type,
            "input": work_unit.input_file,
            "submitted": work_unit.created_at
        }

        with open(task_dir / "metadata.json", 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

    def get_task_status(self, task_id: str) -> Optional[WorkUnit]:
        """Return the work unit for ``task_id``, or ``None`` if unknown."""
        return self.tasks.get(task_id)

    def list_tasks(
        self,
        status: Optional[str] = None,
        workunit_type: Optional[str] = None
    ) -> List[WorkUnit]:
        """
        List all tasks with optional filtering

        Args:
            status: Filter by status (pending, running, completed, failed)
            workunit_type: Filter by work unit type

        Returns:
            Matching work units, newest first (sorted by ``created_at``).
        """
        tasks = list(self.tasks.values())

        if status:
            tasks = [t for t in tasks if t.status == status]

        if workunit_type:
            tasks = [t for t in tasks if t.workunit_type == workunit_type]

        return sorted(tasks, key=lambda t: t.created_at, reverse=True)

    def update_task_status(self, task_id: str, status: str, **kwargs):
        """Update task status and additional fields.

        Unknown task IDs are ignored. Keyword arguments that do not name an
        existing ``WorkUnit`` attribute are ignored as well. When ``status``
        is ``"completed"``, ``completed_at`` is set to the current time.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        task.status = status

        for key, value in kwargs.items():
            if hasattr(task, key):
                setattr(task, key, value)

        if status == "completed":
            task.completed_at = datetime.now().isoformat()

        self._save_tasks()

    def cancel_task(self, task_id: str) -> bool:
        """Cancel a pending or running task.

        Returns:
            ``True`` if the task existed and was pending or running,
            ``False`` otherwise.
        """
        task = self.tasks.get(task_id)
        if task is None or task.status not in ["pending", "running"]:
            return False

        task.status = "cancelled"
        self._save_tasks()
        return True

    def get_results(self, task_id: str) -> Optional[Path]:
        """Get results file for a completed task.

        Returns:
            The result path when the task is completed, has a
            ``result_file`` and that file exists; ``None`` otherwise.
        """
        task = self.tasks.get(task_id)
        if task is not None and task.status == "completed" and task.result_file:
            result_path = Path(task.result_file)
            if result_path.exists():
                return result_path
        return None

    def get_statistics(self) -> Dict:
        """Get overall statistics about BOINC tasks.

        Returns:
            A dict with the total task count, counts per status and per
            work unit type, the number of tasks that have a ``completed_at``
            value, and their mean creation-to-completion time in seconds
            (0 when there are none).
        """
        total = len(self.tasks)
        by_status = {}
        by_type = {}

        for task in self.tasks.values():
            by_status[task.status] = by_status.get(task.status, 0) + 1
            by_type[task.workunit_type] = by_type.get(task.workunit_type, 0) + 1

        completed = [t for t in self.tasks.values() if t.completed_at]

        if completed:
            avg_time = sum(
                (datetime.fromisoformat(t.completed_at) -
                 datetime.fromisoformat(t.created_at)).total_seconds()
                for t in completed
            ) / len(completed)
        else:
            avg_time = 0

        return {
            "total_tasks": total,
            "by_status": by_status,
            "by_type": by_type,
            "completed_tasks": len(completed),
            "average_completion_time_seconds": avg_time
        }
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class BOINCTaskManager:
    """Convenience layer over ``BOINCClient`` for the common workflows."""

    def __init__(self):
        self.client = BOINCClient()

    def _submit_named(self, kind: str, prefix: str, source: str) -> str:
        """Submit ``source`` as a ``kind`` work unit named ``<prefix>_<file stem>``."""
        label = f"{prefix}_{Path(source).stem}"
        return self.client.submit_task(
            workunit_type=kind,
            input_file=source,
            name=label
        )

    def submit_variant_calling(self, fastq_file: str) -> str:
        """Queue a variant-calling work unit for a FASTQ file; return its ID."""
        return self._submit_named("variant_calling", "variant_calling", fastq_file)

    def submit_blast_search(self, sequence_file: str) -> str:
        """Queue a BLAST search work unit for a sequence file; return its ID."""
        return self._submit_named("blast_search", "blast", sequence_file)

    def submit_alignment(self, fastq_file: str) -> str:
        """Queue a sequence-alignment work unit for a FASTQ file; return its ID."""
        return self._submit_named("alignment", "alignment", fastq_file)

    def submit_annotation(self, vcf_file: str) -> str:
        """Queue a variant-annotation work unit for a VCF file; return its ID."""
        return self._submit_named("annotation", "annotation", vcf_file)

    def batch_submit(
        self,
        workunit_type: str,
        input_files: List[str]
    ) -> List[str]:
        """Submit one work unit per input file and return the IDs in order."""
        return [
            self.client.submit_task(workunit_type, path)
            for path in input_files
        ]
|
backend/gdc/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GDC Module
|
| 3 |
+
Interface to GDC Cancer Data Portal
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .client import GDCClient, GDCDataParser, GDCFile
|
| 7 |
+
|
| 8 |
+
__all__ = ['GDCClient', 'GDCDataParser', 'GDCFile']
|
backend/gdc/client.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GDC Data Portal Client
|
| 3 |
+
Download and parse cancer genomics data from GDC
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import requests
|
| 9 |
+
from typing import Dict, List, Optional, Any
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import yaml
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class GDCFile:
    """Represents a file from GDC Portal.

    All fields are required at construction time. ``case_id`` and
    ``project_id`` are ``None`` when the search hit carries no case
    information, which is why they are annotated as optional.
    """

    # Identity and size of the file.
    file_id: str
    file_name: str
    file_size: int

    # Classification of the file contents, as reported by GDC.
    data_type: str
    data_format: str
    experimental_strategy: str

    # Owning case and project; None when the hit has no "cases" entry.
    case_id: Optional[str]
    project_id: Optional[str]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class GDCClient:
    """Client for interacting with GDC Data Portal API"""

    # Seconds to wait on connect/read before giving up. ``requests`` applies
    # no timeout by default, so without this a stalled server would block
    # the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self, config_path: str = "config.yml"):
        """Load the ``gdc`` section of the YAML config and open a session.

        Args:
            config_path: Path of the YAML configuration file. Its ``gdc``
                section must provide ``api_url`` and ``download_dir``.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)['gdc']

        self.api_url = self.config['api_url']
        self.download_dir = Path(self.config['download_dir'])
        self.download_dir.mkdir(parents=True, exist_ok=True)

        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json'
        })

    @staticmethod
    def _safe_filename(content_disposition: str, fallback: str) -> str:
        """Derive a bare file name from a Content-Disposition header value.

        Takes the ``filename=`` value up to the next ``;`` (or ``fallback``
        when the header has none) and keeps only its final path component,
        so neither the server nor the caller can direct the download outside
        the output directory.

        Raises:
            ValueError: If no usable file name can be derived.
        """
        candidate = fallback
        if 'filename=' in content_disposition:
            candidate = (
                content_disposition.split('filename=')[1]
                .split(';')[0]
                .strip()
                .strip('"')
            )

        for raw in (candidate, fallback):
            name = Path(raw.replace('\\', '/')).name
            if name not in ('', '.', '..'):
                return name
        raise ValueError("no usable file name for download")

    def search_files(
        self,
        filters: Optional[Dict] = None,
        size: int = 100,
        fields: Optional[List[str]] = None
    ) -> List[GDCFile]:
        """
        Search for files in GDC

        Args:
            filters: GDC filter query
            size: Number of results to return
            fields: Fields to include in response

        Returns:
            One ``GDCFile`` per hit. Any error is logged and reported as an
            empty list.
        """
        endpoint = f"{self.api_url}/files"

        if fields is None:
            fields = [
                'file_id', 'file_name', 'file_size', 'data_type',
                'data_format', 'experimental_strategy', 'cases.case_id',
                'cases.project.project_id'
            ]

        params = {
            'size': size,
            'fields': ','.join(fields)
        }

        if filters:
            params['filters'] = json.dumps(filters)

        try:
            response = self.session.get(
                endpoint, params=params, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            data = response.json()

            files = []
            for hit in data.get('data', {}).get('hits', []):
                # Only the first case of a hit is used; a hit without cases
                # yields None for both case and project.
                cases = hit.get('cases')
                first_case = cases[0] if cases else {}
                # "project" may be missing or null in the response.
                project = first_case.get('project') or {}

                files.append(GDCFile(
                    file_id=hit.get('file_id'),
                    file_name=hit.get('file_name'),
                    file_size=hit.get('file_size', 0),
                    data_type=hit.get('data_type'),
                    data_format=hit.get('data_format'),
                    experimental_strategy=hit.get('experimental_strategy'),
                    case_id=first_case.get('case_id'),
                    project_id=project.get('project_id')
                ))

            logger.info(f"Found {len(files)} files")
            return files

        except Exception as e:  # best-effort API: log and return no results
            logger.error(f"Error searching files: {e}")
            return []

    def download_file(
        self,
        file_id: str,
        output_dir: Optional[Path] = None
    ) -> Optional[Path]:
        """
        Download a file from GDC

        Args:
            file_id: GDC file UUID
            output_dir: Directory to save file (defaults to config download_dir)

        Returns:
            Path to downloaded file or None if failed
        """
        if output_dir is None:
            output_dir = self.download_dir

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        endpoint = f"{self.api_url}/data/{file_id}"

        try:
            logger.info(f"Downloading file {file_id}")
            # The context manager returns the streamed connection to the
            # pool even when writing the file fails part-way through.
            with self.session.get(
                endpoint, stream=True, timeout=self.REQUEST_TIMEOUT
            ) as response:
                response.raise_for_status()

                # Get filename from headers, reduced to a bare file name.
                filename = self._safe_filename(
                    response.headers.get('content-disposition', ''),
                    fallback=file_id
                )
                output_path = output_dir / filename

                with open(output_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

            logger.info(f"Downloaded to {output_path}")
            return output_path

        except Exception as e:  # best-effort API: log and signal failure
            logger.error(f"Error downloading file {file_id}: {e}")
            return None

    def get_project_files(
        self,
        project_id: str,
        data_type: Optional[str] = None,
        limit: int = 100
    ) -> List[GDCFile]:
        """
        Get files for a specific project

        Args:
            project_id: GDC project ID (e.g., TCGA-BRCA)
            data_type: Filter by data type
            limit: Maximum number of files
        """
        filters = {
            "op": "and",
            "content": [
                {
                    "op": "in",
                    "content": {
                        "field": "cases.project.project_id",
                        "value": [project_id]
                    }
                }
            ]
        }

        if data_type:
            filters["content"].append({
                "op": "in",
                "content": {
                    "field": "data_type",
                    "value": [data_type]
                }
            })

        return self.search_files(filters=filters, size=limit)

    def get_mutation_data(self, project_id: str, limit: int = 100) -> List[GDCFile]:
        """Get mutation/variant calling files for a project"""
        return self.get_project_files(
            project_id=project_id,
            data_type="Simple Nucleotide Variation",
            limit=limit
        )

    def get_gene_expression_data(self, project_id: str, limit: int = 100) -> List[GDCFile]:
        """Get gene expression data for a project"""
        return self.get_project_files(
            project_id=project_id,
            data_type="Gene Expression Quantification",
            limit=limit
        )

    def search_cases(
        self,
        project_id: str,
        filters: Optional[Dict] = None,
        size: int = 100
    ) -> List[Dict]:
        """
        Search for cases (patients) in GDC

        Args:
            project_id: GDC project ID
            filters: Additional filter criteria
            size: Number of results

        Returns:
            The raw hit dicts from the response. Any error is logged and
            reported as an empty list.
        """
        endpoint = f"{self.api_url}/cases"

        base_filters = {
            "op": "in",
            "content": {
                "field": "project.project_id",
                "value": [project_id]
            }
        }

        if filters:
            filter_query = {
                "op": "and",
                "content": [base_filters, filters]
            }
        else:
            filter_query = base_filters

        params = {
            'size': size,
            'filters': json.dumps(filter_query),
            'fields': 'case_id,project.project_id,demographic,diagnoses'
        }

        try:
            response = self.session.get(
                endpoint, params=params, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            data = response.json()

            cases = data.get('data', {}).get('hits', [])
            logger.info(f"Found {len(cases)} cases")
            return cases

        except Exception as e:  # best-effort API: log and return no results
            logger.error(f"Error searching cases: {e}")
            return []
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class GDCDataParser:
    """Parse downloaded GDC data files"""

    @staticmethod
    def parse_maf(file_path: Path) -> List[Dict]:
        """
        Parse MAF (Mutation Annotation Format) file

        Lines starting with ``#`` and blank lines are skipped. The first
        remaining line is the tab-separated header; every later line with the
        same number of fields becomes one record keyed by the header names.

        Returns list of mutation records (empty on error or when the file
        has no header line)
        """
        mutations = []

        try:
            with open(file_path, 'r') as f:
                headers = None

                for line in f:
                    # Skip comment lines
                    if line.startswith('#'):
                        continue

                    # Remove only the line terminator: a full strip() would
                    # also drop trailing tabs, i.e. empty trailing columns,
                    # and the row would then be rejected for having too few
                    # fields.
                    row = line.rstrip('\r\n')
                    if not row.strip():
                        continue

                    values = row.split('\t')
                    if headers is None:
                        headers = values
                        continue

                    if len(values) == len(headers):
                        mutations.append(dict(zip(headers, values)))

            logger.info(f"Parsed {len(mutations)} mutations from {file_path}")
            return mutations

        except Exception as e:
            logger.error(f"Error parsing MAF file: {e}")
            return []

    @staticmethod
    def parse_vcf(file_path: Path) -> List[Dict]:
        """
        Parse VCF (Variant Call Format) file

        Header lines (starting with ``#``), blank lines and lines with fewer
        than the eight fixed columns are skipped, so one malformed line no
        longer discards every variant parsed so far. All values are kept as
        strings.

        Returns list of variant records (empty on error)
        """
        variants = []
        columns = ('chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info')

        try:
            with open(file_path, 'r') as f:
                for line in f:
                    # Meta-information ("##...") and the "#CHROM" header row.
                    if line.startswith('#'):
                        continue

                    values = line.rstrip('\r\n').split('\t')
                    if len(values) < len(columns):
                        # Blank or truncated line: not a usable record.
                        continue

                    variants.append(dict(zip(columns, values)))

            logger.info(f"Parsed {len(variants)} variants from {file_path}")
            return variants

        except Exception as e:
            logger.error(f"Error parsing VCF file: {e}")
            return []

    @staticmethod
    def parse_clinical_data(data: Dict) -> Dict:
        """Parse clinical data from GDC case

        Args:
            data: One case dict, e.g. a hit returned by the cases search.
                ``project``, ``demographic`` and ``diagnoses`` may be missing
                or null.

        Returns:
            A dict with ``case_id``, ``project_id``, a ``demographic`` dict
            and a list of ``diagnoses``; absent values are ``None``.
        """
        # ``or`` rather than a .get() default: the key can be present with a
        # null value, which a default would not replace.
        project = data.get('project') or {}
        demo = data.get('demographic') or {}
        raw_diagnoses = data.get('diagnoses') or []

        clinical = {
            'case_id': data.get('case_id'),
            'project_id': project.get('project_id'),
            # Parse demographic data
            'demographic': {
                'age_at_index': demo.get('age_at_index'),
                'gender': demo.get('gender'),
                'race': demo.get('race'),
                'ethnicity': demo.get('ethnicity')
            },
            'diagnoses': []
        }

        # Parse diagnosis data
        for diag in raw_diagnoses:
            clinical['diagnoses'].append({
                'diagnosis_id': diag.get('diagnosis_id'),
                'primary_diagnosis': diag.get('primary_diagnosis'),
                'tumor_stage': diag.get('tumor_stage'),
                'age_at_diagnosis': diag.get('age_at_diagnosis'),
                'vital_status': diag.get('vital_status')
            })

        return clinical
|
backend/neo4j/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j Module
|
| 3 |
+
Graph database integration for cancer genomics data
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .db_manager import (
|
| 7 |
+
DatabaseManager,
|
| 8 |
+
GeneRepository,
|
| 9 |
+
MutationRepository,
|
| 10 |
+
PatientRepository,
|
| 11 |
+
CancerTypeRepository
|
| 12 |
+
)
|
| 13 |
+
from .graphql_schema import schema
|
| 14 |
+
from .data_importer import DataImporter, initialize_database
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
'DatabaseManager',
|
| 18 |
+
'GeneRepository',
|
| 19 |
+
'MutationRepository',
|
| 20 |
+
'PatientRepository',
|
| 21 |
+
'CancerTypeRepository',
|
| 22 |
+
'schema',
|
| 23 |
+
'DataImporter',
|
| 24 |
+
'initialize_database'
|
| 25 |
+
]
|
backend/neo4j/data_importer.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Importer for Neo4j
|
| 3 |
+
Import cancer data from various sources into the graph database
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List
|
| 8 |
+
import logging
|
| 9 |
+
from .db_manager import (
|
| 10 |
+
DatabaseManager,
|
| 11 |
+
GeneRepository,
|
| 12 |
+
MutationRepository,
|
| 13 |
+
PatientRepository,
|
| 14 |
+
CancerTypeRepository
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DataImporter:
    """Import cancer genomics data into Neo4j.

    The importer owns one DatabaseManager and one repository per node type.
    Call close() when finished to release the underlying driver.
    """

    def __init__(self):
        self.db = DatabaseManager()
        self.gene_repo = GeneRepository(self.db)
        self.mutation_repo = MutationRepository(self.db)
        self.patient_repo = PatientRepository(self.db)
        self.cancer_repo = CancerTypeRepository(self.db)

    def close(self):
        """Close database connection"""
        self.db.close()

    def import_sample_data(self):
        """Import sample cancer data for demonstration.

        Nodes are created before the relationships that reference them:
        cancer types, genes and patients first, then diagnoses, mutations
        (each linked to its gene) and finally patient-to-mutation links.
        """
        logger.info("Importing sample cancer data...")

        # Create cancer types
        cancer_types = [
            {'cancer_type_id': 'BRCA', 'name': 'Breast Cancer', 'tissue': 'Breast', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'LUAD', 'name': 'Lung Adenocarcinoma', 'tissue': 'Lung', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'COAD', 'name': 'Colon Adenocarcinoma', 'tissue': 'Colon', 'disease_type': 'Adenocarcinoma'},
            {'cancer_type_id': 'GBM', 'name': 'Glioblastoma', 'tissue': 'Brain', 'disease_type': 'Glioblastoma'},
        ]

        for cancer_data in cancer_types:
            self.cancer_repo.create_cancer_type(cancer_data)
            logger.info(f"Created cancer type: {cancer_data['name']}")

        # Create genes
        genes = [
            {'gene_id': 'ENSG00000141510', 'symbol': 'TP53', 'name': 'Tumor protein p53', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000157764', 'symbol': 'BRAF', 'name': 'B-Raf proto-oncogene', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000139618', 'symbol': 'BRCA2', 'name': 'BRCA2 DNA repair associated', 'chromosome': 'chr13', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000012048', 'symbol': 'BRCA1', 'name': 'BRCA1 DNA repair associated', 'chromosome': 'chr17', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000121879', 'symbol': 'PIK3CA', 'name': 'Phosphatidylinositol-4,5-bisphosphate 3-kinase', 'chromosome': 'chr3', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000133703', 'symbol': 'KRAS', 'name': 'KRAS proto-oncogene', 'chromosome': 'chr12', 'gene_type': 'protein_coding'},
            {'gene_id': 'ENSG00000146648', 'symbol': 'EGFR', 'name': 'Epidermal growth factor receptor', 'chromosome': 'chr7', 'gene_type': 'protein_coding'},
        ]

        for gene_data in genes:
            self.gene_repo.create_gene(gene_data)
            logger.info(f"Created gene: {gene_data['symbol']}")

        # Create patients
        patients = [
            {'patient_id': 'TCGA-A1-001', 'project_id': 'TCGA-BRCA', 'age': 55, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-A1-002', 'project_id': 'TCGA-BRCA', 'age': 62, 'gender': 'female', 'race': 'asian', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-L1-001', 'project_id': 'TCGA-LUAD', 'age': 68, 'gender': 'male', 'race': 'white', 'vital_status': 'deceased'},
            {'patient_id': 'TCGA-L1-002', 'project_id': 'TCGA-LUAD', 'age': 71, 'gender': 'male', 'race': 'black', 'vital_status': 'alive'},
            {'patient_id': 'TCGA-C1-001', 'project_id': 'TCGA-COAD', 'age': 58, 'gender': 'female', 'race': 'white', 'vital_status': 'alive'},
        ]

        for patient_data in patients:
            self.patient_repo.create_patient(patient_data)
            logger.info(f"Created patient: {patient_data['patient_id']}")

        # Link patients to cancer types
        diagnoses = [
            {'patient_id': 'TCGA-A1-001', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-A1-002', 'cancer_type_id': 'BRCA', 'properties': {'stage': 'Stage III', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-001', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage IV', 'grade': 'G3'}},
            {'patient_id': 'TCGA-L1-002', 'cancer_type_id': 'LUAD', 'properties': {'stage': 'Stage II', 'grade': 'G2'}},
            {'patient_id': 'TCGA-C1-001', 'cancer_type_id': 'COAD', 'properties': {'stage': 'Stage III', 'grade': 'G2'}},
        ]

        for diagnosis in diagnoses:
            self.patient_repo.link_patient_to_cancer_type(
                diagnosis['patient_id'],
                diagnosis['cancer_type_id'],
                diagnosis['properties']
            )

        # Create mutations
        mutations = [
            {'mutation_id': 'MUT-TP53-001', 'chromosome': 'chr17', 'position': 7577538, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 35.2},
            {'mutation_id': 'MUT-BRAF-001', 'chromosome': 'chr7', 'position': 140453136, 'reference': 'A', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 42.1},
            {'mutation_id': 'MUT-BRCA2-001', 'chromosome': 'chr13', 'position': 32914438, 'reference': 'T', 'alternate': 'C', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 38.7},
            {'mutation_id': 'MUT-PIK3CA-001', 'chromosome': 'chr3', 'position': 178936091, 'reference': 'G', 'alternate': 'A', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 33.5},
            {'mutation_id': 'MUT-KRAS-001', 'chromosome': 'chr12', 'position': 25398284, 'reference': 'C', 'alternate': 'T', 'consequence': 'missense', 'variant_type': 'SNP', 'quality': 39.4},
        ]

        # Keyed by mutation_id so each mutation is linked to its own gene
        # regardless of list order; a missing entry raises KeyError instead
        # of silently pairing the mutation with the wrong gene.
        gene_for_mutation = {
            'MUT-TP53-001': 'ENSG00000141510',
            'MUT-BRAF-001': 'ENSG00000157764',
            'MUT-BRCA2-001': 'ENSG00000139618',
            'MUT-PIK3CA-001': 'ENSG00000121879',
            'MUT-KRAS-001': 'ENSG00000133703',
        }

        for mutation_data in mutations:
            gene_id = gene_for_mutation[mutation_data['mutation_id']]
            self.mutation_repo.create_mutation(mutation_data, gene_id)
            logger.info(f"Created mutation: {mutation_data['mutation_id']}")

        # Link mutations to patients
        patient_mutations = [
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.45, 'depth': 50}},
            {'patient_id': 'TCGA-A1-001', 'mutation_id': 'MUT-PIK3CA-001', 'properties': {'allele_frequency': 0.38, 'depth': 48}},
            {'patient_id': 'TCGA-A1-002', 'mutation_id': 'MUT-BRCA2-001', 'properties': {'allele_frequency': 0.52, 'depth': 55}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.49, 'depth': 58}},
            {'patient_id': 'TCGA-L1-001', 'mutation_id': 'MUT-TP53-001', 'properties': {'allele_frequency': 0.41, 'depth': 45}},
            {'patient_id': 'TCGA-L1-002', 'mutation_id': 'MUT-BRAF-001', 'properties': {'allele_frequency': 0.47, 'depth': 52}},
            {'patient_id': 'TCGA-C1-001', 'mutation_id': 'MUT-KRAS-001', 'properties': {'allele_frequency': 0.44, 'depth': 50}},
        ]

        for pm in patient_mutations:
            self.mutation_repo.link_mutation_to_patient(
                pm['mutation_id'],
                pm['patient_id'],
                pm['properties']
            )

        logger.info("Sample data import completed!")

    def import_gdc_data(self, gdc_files: List[Dict]):
        """Import data from GDC portal.

        Placeholder: not implemented yet. The call currently does nothing
        and returns None.
        """
        # Implementation for importing real GDC data
        pass
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def initialize_database():
    """Load the bundled sample data set into the graph database.

    Builds a DataImporter, runs its sample import, and closes the importer
    afterwards whether or not the import raised.
    """
    sample_importer = DataImporter()
    try:
        sample_importer.import_sample_data()
    finally:
        # Release the database connection on success and on failure alike.
        sample_importer.close()
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
|
| 152 |
+
initialize_database()
|
backend/neo4j/db_manager.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j Database Manager
|
| 3 |
+
Handle graph database connections and operations
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from neo4j import GraphDatabase
|
| 7 |
+
from typing import Dict, List, Optional, Any
|
| 8 |
+
import yaml
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class DatabaseManager:
    """Own the Neo4j driver and expose query and schema helpers.

    Connection settings come from the ``neo4j`` section of a YAML config
    file (keys read here: ``uri``, ``username``, ``password``). Instances
    can be used as context managers; leaving the block closes the driver.
    """

    def __init__(self, config_path: str = "config.yml"):
        with open(config_path, 'r') as config_file:
            settings = yaml.safe_load(config_file)['neo4j']
        self.config = settings

        credentials = (self.config['username'], self.config['password'])
        self.driver = GraphDatabase.driver(self.config['uri'], auth=credentials)

        logger.info(f"Connected to Neo4j at {self.config['uri']}")

    def close(self):
        """Close database connection"""
        self.driver.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so exceptions raised inside the block propagate.
        self.close()

    def execute_query(self, query: str, parameters: Optional[Dict] = None) -> List[Dict]:
        """Run one Cypher statement and return each record as a dict.

        Args:
            query: Cypher text, optionally containing ``$name`` parameters.
            parameters: Values for those parameters; None means no parameters.

        Returns:
            One ``record.data()`` dict per result row, in result order.
        """
        bound = parameters or {}
        rows = []
        with self.driver.session() as session:
            # Materialise the rows while the session is still open.
            for record in session.run(query, bound):
                rows.append(record.data())
        return rows

    def initialize_schema(self):
        """Create uniqueness constraints and lookup indexes.

        Every statement uses ``IF NOT EXISTS``. A statement that fails is
        logged as a warning and the remaining statements still run.
        """
        constraints = (
            "CREATE CONSTRAINT gene_id IF NOT EXISTS FOR (g:Gene) REQUIRE g.gene_id IS UNIQUE",
            "CREATE CONSTRAINT mutation_id IF NOT EXISTS FOR (m:Mutation) REQUIRE m.mutation_id IS UNIQUE",
            "CREATE CONSTRAINT patient_id IF NOT EXISTS FOR (p:Patient) REQUIRE p.patient_id IS UNIQUE",
            "CREATE CONSTRAINT cancer_type_id IF NOT EXISTS FOR (c:CancerType) REQUIRE c.cancer_type_id IS UNIQUE",
        )
        indexes = (
            "CREATE INDEX gene_symbol IF NOT EXISTS FOR (g:Gene) ON (g.symbol)",
            "CREATE INDEX mutation_position IF NOT EXISTS FOR (m:Mutation) ON (m.chromosome, m.position)",
            "CREATE INDEX patient_project IF NOT EXISTS FOR (p:Patient) ON (p.project_id)",
        )

        with self.driver.session() as session:
            for statement in (*constraints, *indexes):
                try:
                    session.run(statement)
                    logger.info(f"Executed: {statement[:50]}...")
                except Exception as e:
                    # Best effort: keep going so one failure does not block the rest.
                    logger.warning(f"Schema query failed (may already exist): {e}")

        logger.info("Database schema initialized")

    def clear_database(self):
        """Clear all nodes and relationships (use with caution!)"""
        wipe_everything = "MATCH (n) DETACH DELETE n"
        with self.driver.session() as session:
            session.run(wipe_everything)
        logger.info("Database cleared")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class GeneRepository:
    """Repository for Gene nodes"""

    # Properties written by create_gene in addition to the gene_id key.
    # Each is bound as a query parameter and so must always be supplied.
    _GENE_PROPERTIES = (
        'symbol',
        'name',
        'chromosome',
        'start_position',
        'end_position',
        'strand',
        'gene_type',
    )

    def __init__(self, db_manager: "DatabaseManager"):
        self.db = db_manager

    def create_gene(self, gene_data: Dict) -> Dict:
        """Create or update a Gene node.

        Args:
            gene_data: Must contain ``gene_id``. Any of symbol, name,
                chromosome, start_position, end_position, strand and
                gene_type may be omitted; omitted ones are sent as None.

        Returns:
            The ``g`` value of the first result row, or {} when the query
            returned no rows.

        Raises:
            KeyError: If ``gene_data`` has no ``gene_id``.
        """
        query = """
        MERGE (g:Gene {gene_id: $gene_id})
        SET g.symbol = $symbol,
            g.name = $name,
            g.chromosome = $chromosome,
            g.start_position = $start_position,
            g.end_position = $end_position,
            g.strand = $strand,
            g.gene_type = $gene_type
        RETURN g
        """
        # Every $parameter in the query must be bound or the server rejects
        # the statement, so fill in None for anything the caller left out.
        params = {'gene_id': gene_data['gene_id']}
        for field in self._GENE_PROPERTIES:
            params[field] = gene_data.get(field)

        result = self.db.execute_query(query, params)
        return result[0]['g'] if result else {}

    def get_gene_by_symbol(self, symbol: str) -> Optional[Dict]:
        """Find gene by symbol.

        Returns:
            The ``g`` value of the first matching row, or None if no gene
            has that symbol.
        """
        query = """
        MATCH (g:Gene {symbol: $symbol})
        RETURN g
        """
        result = self.db.execute_query(query, {'symbol': symbol})
        return result[0]['g'] if result else None

    def get_gene_mutations(self, gene_id: str) -> List[Dict]:
        """Get all mutations for a gene, ordered by position.

        Returns:
            The ``m`` value of every row; empty if the gene has no
            AFFECTS relationships or does not exist.
        """
        query = """
        MATCH (g:Gene {gene_id: $gene_id})<-[:AFFECTS]-(m:Mutation)
        RETURN m
        ORDER BY m.position
        """
        result = self.db.execute_query(query, {'gene_id': gene_id})
        return [r['m'] for r in result]
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class MutationRepository:
    """Repository for Mutation nodes"""

    def __init__(self, db_manager: "DatabaseManager"):
        self.db = db_manager

    def create_mutation(self, mutation_data: Dict, gene_id: str) -> Dict:
        """Create or update a Mutation node and link it to its Gene.

        Args:
            mutation_data: Must contain mutation_id, chromosome, position,
                reference and alternate. consequence, variant_type and
                quality may be omitted; omitted ones are sent as None.
            gene_id: ``gene_id`` of the Gene the mutation affects.

        Returns:
            The ``m`` value of the first result row, or {} when the query
            returned no rows (for example when no Gene matches ``gene_id``,
            since the leading MATCH then yields nothing).
        """
        query = """
        MATCH (g:Gene {gene_id: $gene_id})
        MERGE (m:Mutation {mutation_id: $mutation_id})
        SET m.chromosome = $chromosome,
            m.position = $position,
            m.reference = $reference,
            m.alternate = $alternate,
            m.consequence = $consequence,
            m.variant_type = $variant_type,
            m.quality = $quality
        MERGE (m)-[:AFFECTS]->(g)
        RETURN m
        """
        # Defaults come first so caller-supplied values override them; every
        # $parameter in the query must be bound even when the value is unknown.
        params = {
            'consequence': None,
            'variant_type': None,
            'quality': None,
            **mutation_data,
            'gene_id': gene_id,
        }
        result = self.db.execute_query(query, params)
        return result[0]['m'] if result else {}

    def link_mutation_to_patient(self, mutation_id: str, patient_id: str, properties: Optional[Dict] = None):
        """Create HAS_MUTATION relationship.

        Args:
            mutation_id: ``mutation_id`` of an existing Mutation node.
            patient_id: ``patient_id`` of an existing Patient node.
            properties: Optional ``allele_frequency`` and ``depth``; each
                defaults to 0 when absent.
        """
        query = """
        MATCH (p:Patient {patient_id: $patient_id})
        MATCH (m:Mutation {mutation_id: $mutation_id})
        MERGE (p)-[r:HAS_MUTATION]->(m)
        SET r.allele_frequency = $allele_frequency,
            r.depth = $depth
        RETURN r
        """
        details = properties or {}
        params = {
            'patient_id': patient_id,
            'mutation_id': mutation_id,
            'allele_frequency': details.get('allele_frequency', 0),
            'depth': details.get('depth', 0)
        }
        self.db.execute_query(query, params)

    def get_mutation_frequency(self, mutation_id: str) -> Dict:
        """Calculate mutation frequency across patients.

        Returns:
            A dict with mutation_id, patients_with_mutation, total_patients
            and frequency, or {} when the query returns no rows (unknown
            mutation, or no patient carries it).
        """
        # Carriers are aggregated before all patients are matched, so the
        # two patterns never form a carriers x patients cartesian product.
        # The variable names also steer clear of the Cypher keyword ALL.
        query = """
        MATCH (m:Mutation {mutation_id: $mutation_id})
        MATCH (carrier:Patient)-[:HAS_MUTATION]->(m)
        WITH m, count(DISTINCT carrier) AS patients_with_mutation
        MATCH (anyone:Patient)
        WITH m, patients_with_mutation, count(anyone) AS total_patients
        RETURN m.mutation_id AS mutation_id,
               patients_with_mutation,
               total_patients,
               toFloat(patients_with_mutation) / total_patients AS frequency
        """
        result = self.db.execute_query(query, {'mutation_id': mutation_id})
        return result[0] if result else {}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class PatientRepository:
    """Repository for Patient nodes"""

    def __init__(self, db_manager: "DatabaseManager"):
        self.db = db_manager

    def create_patient(self, patient_data: Dict) -> Dict:
        """Create or update a Patient node.

        Args:
            patient_data: Must contain patient_id and project_id. age,
                gender, race, ethnicity and vital_status may be omitted;
                omitted ones are sent as None.

        Returns:
            The ``p`` value of the first result row, or {} when the query
            returned no rows.
        """
        query = """
        MERGE (p:Patient {patient_id: $patient_id})
        SET p.project_id = $project_id,
            p.age = $age,
            p.gender = $gender,
            p.race = $race,
            p.ethnicity = $ethnicity,
            p.vital_status = $vital_status
        RETURN p
        """
        # Every $parameter in the query must be bound; defaults come first
        # so values supplied by the caller override them.
        params = {
            'age': None,
            'gender': None,
            'race': None,
            'ethnicity': None,
            'vital_status': None,
            **patient_data,
        }
        result = self.db.execute_query(query, params)
        return result[0]['p'] if result else {}

    def link_patient_to_cancer_type(self, patient_id: str, cancer_type_id: str, properties: Optional[Dict] = None):
        """Create DIAGNOSED_WITH relationship.

        Args:
            patient_id: ``patient_id`` of an existing Patient node.
            cancer_type_id: ``cancer_type_id`` of an existing CancerType node.
            properties: Optional ``stage``, ``grade`` and ``diagnosis_date``;
                each is sent as None when absent.
        """
        query = """
        MATCH (p:Patient {patient_id: $patient_id})
        MATCH (c:CancerType {cancer_type_id: $cancer_type_id})
        MERGE (p)-[r:DIAGNOSED_WITH]->(c)
        SET r.stage = $stage,
            r.grade = $grade,
            r.diagnosis_date = $diagnosis_date
        RETURN r
        """
        details = properties or {}
        params = {
            'patient_id': patient_id,
            'cancer_type_id': cancer_type_id,
            'stage': details.get('stage'),
            'grade': details.get('grade'),
            'diagnosis_date': details.get('diagnosis_date')
        }
        self.db.execute_query(query, params)

    def get_patient_mutations(self, patient_id: str) -> List[Dict]:
        """Get all mutations for a patient, ordered by gene symbol.

        Returns:
            One dict per row with keys m, g, allele_frequency and depth.
        """
        query = """
        MATCH (p:Patient {patient_id: $patient_id})-[r:HAS_MUTATION]->(m:Mutation)-[:AFFECTS]->(g:Gene)
        RETURN m, g, r.allele_frequency as allele_frequency, r.depth as depth
        ORDER BY g.symbol
        """
        result = self.db.execute_query(query, {'patient_id': patient_id})
        return result
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
class CancerTypeRepository:
    """Repository for CancerType nodes"""

    def __init__(self, db_manager: "DatabaseManager"):
        self.db = db_manager

    def create_cancer_type(self, cancer_data: Dict) -> Dict:
        """Create or update a CancerType node.

        Args:
            cancer_data: Must contain cancer_type_id and name. tissue and
                disease_type may be omitted; omitted ones are sent as None.

        Returns:
            The ``c`` value of the first result row, or {} when the query
            returned no rows.
        """
        query = """
        MERGE (c:CancerType {cancer_type_id: $cancer_type_id})
        SET c.name = $name,
            c.tissue = $tissue,
            c.disease_type = $disease_type
        RETURN c
        """
        # Every $parameter in the query must be bound; defaults come first
        # so values supplied by the caller override them.
        params = {'tissue': None, 'disease_type': None, **cancer_data}
        result = self.db.execute_query(query, params)
        return result[0]['c'] if result else {}

    def get_common_mutations(self, cancer_type_id: str, limit: int = 10) -> List[Dict]:
        """Get most common mutations for a cancer type.

        Args:
            cancer_type_id: ``cancer_type_id`` of the CancerType node.
            limit: Maximum number of rows to return.

        Returns:
            Rows with keys m, g and patient_count, most frequent first.
        """
        query = """
        MATCH (c:CancerType {cancer_type_id: $cancer_type_id})<-[:DIAGNOSED_WITH]-(p:Patient)
        MATCH (p)-[:HAS_MUTATION]->(m:Mutation)-[:AFFECTS]->(g:Gene)
        WITH m, g, count(DISTINCT p) as patient_count
        RETURN m, g, patient_count
        ORDER BY patient_count DESC
        LIMIT $limit
        """
        result = self.db.execute_query(query, {'cancer_type_id': cancer_type_id, 'limit': limit})
        return result

    def get_statistics(self, cancer_type_id: str) -> Dict:
        """Get statistics for a cancer type.

        Returns:
            A dict with cancer_type, total_patients, total_mutations (the
            number of distinct mutations seen) and avg_mutations_per_patient,
            or {} when the query returns no rows.
        """
        # The average uses the number of patient-mutation links rather than
        # the number of distinct mutations: a mutation carried by several
        # patients has to count once per carrier, otherwise the per-patient
        # average is understated.
        query = """
        MATCH (c:CancerType {cancer_type_id: $cancer_type_id})<-[:DIAGNOSED_WITH]-(p:Patient)
        OPTIONAL MATCH (p)-[r:HAS_MUTATION]->(m:Mutation)
        WITH c,
             count(DISTINCT p) AS total_patients,
             count(DISTINCT m) AS total_mutations,
             count(DISTINCT r) AS mutation_links
        RETURN c.name AS cancer_type,
               total_patients,
               total_mutations,
               CASE WHEN total_patients > 0
                    THEN toFloat(mutation_links) / total_patients
                    ELSE 0.0
               END AS avg_mutations_per_patient
        """
        result = self.db.execute_query(query, {'cancer_type_id': cancer_type_id})
        return result[0] if result else {}
|
backend/neo4j/graphql_schema.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GraphQL Schema for Cancer Data
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import strawberry
|
| 6 |
+
from typing import List, Optional
|
| 7 |
+
from .db_manager import DatabaseManager
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# GraphQL view of a Gene node. Field names mirror the node's property names
# because the resolvers build instances directly from the node's property map.
@strawberry.type
class Gene:
    gene_id: str
    symbol: str
    name: Optional[str] = None
    chromosome: Optional[str] = None
    start_position: Optional[int] = None
    end_position: Optional[int] = None
    gene_type: Optional[str] = None
    # Gene nodes can carry a `strand` property (GeneRepository.create_gene
    # sets it). Without a matching field, Gene(**properties) raises TypeError
    # for such nodes. Declared last so existing positional construction is
    # unaffected.
    # NOTE(review): typed as a string on the assumption of "+"/"-" style
    # values — confirm against the data actually stored.
    strand: Optional[str] = None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# GraphQL view of a Mutation node. Field names mirror the node's property
# names because the resolvers build instances directly from the node's
# property map.
@strawberry.type
class Mutation:
    mutation_id: str
    chromosome: str
    position: int
    # Reference and alternate alleles at `position`.
    reference: str
    alternate: str
    # The remaining fields are optional annotations and default to None.
    consequence: Optional[str] = None
    variant_type: Optional[str] = None
    quality: Optional[float] = None
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# GraphQL view of a Patient node. Field names mirror the node's property
# names because the resolvers build instances directly from the node's
# property map.
@strawberry.type
class Patient:
    patient_id: str
    project_id: str
    age: Optional[int] = None
    gender: Optional[str] = None
    race: Optional[str] = None
    vital_status: Optional[str] = None
    # Patient nodes can carry an `ethnicity` property
    # (PatientRepository.create_patient sets it). Without a matching field,
    # Patient(**properties) raises TypeError for such nodes. Declared last so
    # existing positional construction is unaffected.
    ethnicity: Optional[str] = None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# GraphQL view of a CancerType node. Field names mirror the node's property
# names because the resolvers build instances directly from the node's
# property map.
@strawberry.type
class CancerType:
    cancer_type_id: str
    name: str
    tissue: Optional[str] = None
    disease_type: Optional[str] = None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Result shape of the mutation_frequency query. The field names match the
# column aliases returned by MutationRepository.get_mutation_frequency,
# whose result row is splatted straight into this type.
@strawberry.type
class MutationFrequency:
    mutation_id: str
    patients_with_mutation: int
    total_patients: int
    # patients_with_mutation divided by total_patients.
    frequency: float
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Result shape of the cancer_statistics query. The field names match the
# column aliases returned by CancerTypeRepository.get_statistics, whose
# result row is splatted straight into this type.
@strawberry.type
class CancerStatistics:
    # Display name of the cancer type (the node's `name` property).
    cancer_type: str
    total_patients: int
    total_mutations: int
    avg_mutations_per_patient: float
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _from_node_properties(type_cls, properties):
    """Build *type_cls* from a node property map, ignoring undeclared keys."""
    # Nodes may carry properties the GraphQL type does not declare; passing
    # those through as keyword arguments would raise TypeError.
    declared = type_cls.__annotations__
    return type_cls(**{key: value for key, value in properties.items() if key in declared})


# Root query type. Each resolver opens its own DatabaseManager inside a
# `with` block so the driver is closed even when the query raises.
@strawberry.type
class Query:
    @strawberry.field
    def gene(self, symbol: str) -> Optional[Gene]:
        """Get gene by symbol"""
        from .db_manager import GeneRepository

        with DatabaseManager() as db:
            gene_data = GeneRepository(db).get_gene_by_symbol(symbol)

        if gene_data:
            return _from_node_properties(Gene, gene_data)
        return None

    @strawberry.field
    def genes(self, limit: int = 100) -> List[Gene]:
        """Get all genes"""
        query = "MATCH (g:Gene) RETURN g LIMIT $limit"
        with DatabaseManager() as db:
            results = db.execute_query(query, {'limit': limit})

        return [_from_node_properties(Gene, r['g']) for r in results]

    @strawberry.field
    def mutations(
        self,
        gene: Optional[str] = None,
        chromosome: Optional[str] = None,
        limit: int = 100
    ) -> List[Mutation]:
        """Get mutations, optionally filtered by gene or chromosome"""
        # `gene` takes precedence: when both filters are given, only the
        # gene filter is applied.
        if gene:
            query = """
            MATCH (g:Gene {symbol: $gene})<-[:AFFECTS]-(m:Mutation)
            RETURN m
            LIMIT $limit
            """
            params = {'gene': gene, 'limit': limit}
        elif chromosome:
            query = """
            MATCH (m:Mutation {chromosome: $chromosome})
            RETURN m
            LIMIT $limit
            """
            params = {'chromosome': chromosome, 'limit': limit}
        else:
            query = "MATCH (m:Mutation) RETURN m LIMIT $limit"
            params = {'limit': limit}

        with DatabaseManager() as db:
            results = db.execute_query(query, params)

        return [_from_node_properties(Mutation, r['m']) for r in results]

    @strawberry.field
    def patients(
        self,
        project_id: Optional[str] = None,
        cancer_type: Optional[str] = None,
        limit: int = 100
    ) -> List[Patient]:
        """Get patients, optionally filtered"""
        # `project_id` takes precedence: when both filters are given, only
        # the project filter is applied.
        if project_id:
            query = """
            MATCH (p:Patient {project_id: $project_id})
            RETURN p
            LIMIT $limit
            """
            params = {'project_id': project_id, 'limit': limit}
        elif cancer_type:
            query = """
            MATCH (p:Patient)-[:DIAGNOSED_WITH]->(c:CancerType {cancer_type_id: $cancer_type})
            RETURN p
            LIMIT $limit
            """
            params = {'cancer_type': cancer_type, 'limit': limit}
        else:
            query = "MATCH (p:Patient) RETURN p LIMIT $limit"
            params = {'limit': limit}

        with DatabaseManager() as db:
            results = db.execute_query(query, params)

        return [_from_node_properties(Patient, r['p']) for r in results]

    @strawberry.field
    def cancer_types(self) -> List[CancerType]:
        """Get all cancer types"""
        query = "MATCH (c:CancerType) RETURN c"
        with DatabaseManager() as db:
            results = db.execute_query(query)

        return [_from_node_properties(CancerType, r['c']) for r in results]

    @strawberry.field
    def mutation_frequency(self, mutation_id: str) -> Optional[MutationFrequency]:
        """Get frequency of a mutation across all patients"""
        from .db_manager import MutationRepository

        with DatabaseManager() as db:
            freq_data = MutationRepository(db).get_mutation_frequency(mutation_id)

        # The row's column aliases match the MutationFrequency fields exactly.
        if freq_data:
            return MutationFrequency(**freq_data)
        return None

    @strawberry.field
    def cancer_statistics(self, cancer_type_id: str) -> Optional[CancerStatistics]:
        """Get statistics for a cancer type"""
        from .db_manager import CancerTypeRepository

        with DatabaseManager() as db:
            stats = CancerTypeRepository(db).get_statistics(cancer_type_id)

        # The row's column aliases match the CancerStatistics fields exactly.
        if stats:
            return CancerStatistics(**stats)
        return None
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
schema = strawberry.Schema(query=Query)
|
backend/pipeline/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pipeline Module
|
| 3 |
+
Bioinformatics analysis pipeline for sequencing data
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .fastq_processor import FASTQProcessor, FASTQQualityControl
|
| 7 |
+
from .blast_runner import BLASTRunner, SequenceAligner
|
| 8 |
+
from .variant_caller import VariantCaller, VariantAnalyzer, Variant
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
'FASTQProcessor',
|
| 12 |
+
'FASTQQualityControl',
|
| 13 |
+
'BLASTRunner',
|
| 14 |
+
'SequenceAligner',
|
| 15 |
+
'VariantCaller',
|
| 16 |
+
'VariantAnalyzer',
|
| 17 |
+
'Variant'
|
| 18 |
+
]
|
backend/pipeline/blast_runner.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BLAST Integration
|
| 3 |
+
Sequence alignment and homology searching
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List, Optional
|
| 8 |
+
import subprocess
|
| 9 |
+
import yaml
|
| 10 |
+
import logging
|
| 11 |
+
from Bio import SeqIO
|
| 12 |
+
from Bio.Blast import NCBIXML
|
| 13 |
+
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class BLASTRunner:
    """Run NCBI BLAST+ searches and post-process their XML output.

    Settings come from the ``pipeline.blast`` section of a YAML config file.
    When the BLAST executable cannot be found, the run methods fall back to
    writing an empty, simulated XML result so demo flows keep working.
    """

    # Database used by run_blastp (non-redundant protein database).
    PROTEIN_DATABASE = 'nr'

    def __init__(self, config_path: str = "config.yml"):
        """Load BLAST settings and create the output directory.

        Args:
            config_path: YAML file containing a ``pipeline.blast`` mapping.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If ``pipeline``/``blast`` or ``output_dir`` is missing.
        """
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)['pipeline']['blast']

        self.database = self.config.get('database', 'nt')
        self.evalue = self.config.get('evalue', 0.001)
        self.num_threads = self.config.get('num_threads', 4)
        self.output_dir = Path(self.config['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run_blastn(
        self,
        query_file: Path,
        output_file: Optional[Path] = None,
        max_targets: int = 10
    ) -> Optional[Path]:
        """
        Run BLASTN for nucleotide sequences

        Args:
            query_file: Input FASTA file with query sequences
            output_file: Output XML file (defaults to
                ``<output_dir>/<query stem>_blastn.xml``)
            max_targets: Maximum number of target sequences

        Returns:
            Path to output file or None if failed
        """
        return self._run_blast(
            'blastn', self.database, query_file, output_file, max_targets
        )

    def run_blastp(
        self,
        query_file: Path,
        output_file: Optional[Path] = None,
        max_targets: int = 10
    ) -> Optional[Path]:
        """
        Run BLASTP for protein sequences

        Args:
            query_file: Input FASTA file with protein sequences
            output_file: Output XML file (defaults to
                ``<output_dir>/<query stem>_blastp.xml``)
            max_targets: Maximum number of target sequences

        Returns:
            Path to output file or None if failed
        """
        return self._run_blast(
            'blastp', self.PROTEIN_DATABASE, query_file, output_file, max_targets
        )

    def _run_blast(
        self,
        program: str,
        database: str,
        query_file: Path,
        output_file: Optional[Path],
        max_targets: int
    ) -> Optional[Path]:
        """Build and execute one BLAST command line shared by blastn/blastp."""
        if output_file is None:
            output_file = self.output_dir / f"{query_file.stem}_{program}.xml"

        cmd = [
            program,
            '-query', str(query_file),
            '-db', database,
            '-out', str(output_file),
            '-evalue', str(self.evalue),
            '-num_threads', str(self.num_threads),
            '-max_target_seqs', str(max_targets),
            '-outfmt', '5'  # XML format, which parse_results() reads
        ]
        label = program.upper()

        try:
            logger.info(f"Running {label} on {query_file.name}")
            # Argument list (no shell) so file names are never shell-interpreted.
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"{label} failed: {e.stderr}")
            return None
        except FileNotFoundError:
            # The executable itself could not be launched.
            logger.warning(f"{label} not found - creating simulated results")
            return self._simulate_blast_results(
                query_file, output_file, program=program, database=database
            )

        logger.info(f"{label} completed: {output_file}")
        return output_file

    def _simulate_blast_results(
        self,
        query_file: Path,
        output_file: Path,
        program: str = 'blastn',
        database: str = 'nt'
    ) -> Path:
        """Create simulated BLAST results for demo purposes.

        The file is a BLAST XML document with a single iteration and no hits.
        ``program`` and ``database`` are written into the header so a
        simulated blastp run is not mislabelled as blastn.

        Args:
            query_file: Query file the results stand in for (not read).
            output_file: Where to write the XML.
            program: BLAST program name recorded in the output.
            database: Database name recorded in the output.

        Returns:
            ``output_file``.
        """
        with open(output_file, 'w') as f:
            f.write(f"""<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
  <BlastOutput_program>{program}</BlastOutput_program>
  <BlastOutput_version>{program.upper()} 2.14.0+</BlastOutput_version>
  <BlastOutput_reference>Simulated results for demo</BlastOutput_reference>
  <BlastOutput_db>{database}</BlastOutput_db>
  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
  <BlastOutput_query-def>Sample sequence</BlastOutput_query-def>
  <BlastOutput_query-len>100</BlastOutput_query-len>
  <BlastOutput_iterations>
    <Iteration>
      <Iteration_iter-num>1</Iteration_iter-num>
      <Iteration_query-ID>Query_1</Iteration_query-ID>
      <Iteration_query-def>Sample sequence</Iteration_query-def>
      <Iteration_query-len>100</Iteration_query-len>
      <Iteration_hits>
      </Iteration_hits>
    </Iteration>
  </BlastOutput_iterations>
</BlastOutput>
""")
        return output_file

    def parse_results(self, blast_output: Path) -> List[Dict]:
        """
        Parse BLAST XML output

        Produces one dictionary per HSP (high-scoring pair), so an alignment
        with several HSPs contributes several entries.

        Args:
            blast_output: BLAST XML file (``-outfmt 5``)

        Returns:
            List of hit dictionaries; empty if the file cannot be parsed
        """
        hits = []

        try:
            with open(blast_output, 'r') as f:
                blast_records = NCBIXML.parse(f)

                for blast_record in blast_records:
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            hit = {
                                'query': blast_record.query,
                                'hit_id': alignment.hit_id,
                                'hit_def': alignment.hit_def,
                                'length': alignment.length,
                                'e_value': hsp.expect,
                                'score': hsp.score,
                                'identities': hsp.identities,
                                'positives': hsp.positives,
                                'gaps': hsp.gaps,
                                'query_start': hsp.query_start,
                                'query_end': hsp.query_end,
                                'hit_start': hsp.sbjct_start,
                                'hit_end': hsp.sbjct_end,
                                'alignment_length': hsp.align_length
                            }
                            hits.append(hit)

            logger.info(f"Parsed {len(hits)} BLAST hits")
            return hits

        except Exception as e:
            # Best-effort boundary: any read/parse failure yields "no hits".
            logger.error(f"Error parsing BLAST results: {e}")
            return []

    def filter_hits(
        self,
        hits: List[Dict],
        min_identity: float = 0.9,
        max_evalue: float = 0.001
    ) -> List[Dict]:
        """
        Filter BLAST hits by identity and e-value

        Each retained hit dictionary gets an ``identity_pct`` key
        (identities / alignment_length) added in place.

        Args:
            hits: List of hit dictionaries
            min_identity: Minimum identity as a fraction (0-1)
            max_evalue: Maximum e-value threshold

        Returns:
            The hits that satisfy both thresholds, in input order
        """
        filtered = []

        for hit in hits:
            aligned = hit['alignment_length']
            if not aligned:
                # Identity is undefined for a zero/missing alignment length;
                # dropping the hit avoids a ZeroDivisionError.
                continue

            identity_pct = hit['identities'] / aligned

            if identity_pct >= min_identity and hit['e_value'] <= max_evalue:
                hit['identity_pct'] = identity_pct
                filtered.append(hit)

        logger.info(f"Filtered to {len(filtered)} high-quality hits")
        return filtered
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class SequenceAligner:
    """Sequence alignment utilities built on top of BLASTRunner"""

    def __init__(self):
        """Create the underlying BLASTRunner with its default config path."""
        self.blast_runner = BLASTRunner()

    def align_to_reference(
        self,
        query_sequences: Path,
        reference_db: str = 'nt'
    ) -> Dict:
        """
        Align query sequences to reference database

        Args:
            query_sequences: FASTA file with the query sequences
            reference_db: Name of the reference database.
                NOTE(review): this value is currently not forwarded; the
                search uses the database configured on the BLAST runner,
                because ``run_blastn`` is called without a database argument.

        Returns:
            ``{'statistics': ..., 'hits': ..., 'output_file': ...}`` on
            success, or ``{'error': 'BLAST search failed'}`` if the BLAST run
            returned no output file.
        """
        # Run BLAST
        blast_output = self.blast_runner.run_blastn(query_sequences)

        if blast_output is None:
            return {'error': 'BLAST search failed'}

        # Parse results
        hits = self.blast_runner.parse_results(blast_output)

        # Calculate statistics
        stats = {
            'total_queries': self._count_fasta_records(query_sequences),
            # One query can produce many hits; count distinct query names.
            'queries_with_hits': len({h.get('query') for h in hits}),
            'total_hits': len(hits),
            'avg_identity': 0,
            'avg_evalue': 0
        }

        if hits:
            # Parsed hits carry raw 'identities'/'alignment_length' only, so
            # the identity fraction has to be derived here.
            stats['avg_identity'] = sum(self._identity_fraction(h) for h in hits) / len(hits)
            stats['avg_evalue'] = sum(h['e_value'] for h in hits) / len(hits)

        return {
            'statistics': stats,
            'hits': hits,
            'output_file': str(blast_output)
        }

    @staticmethod
    def _identity_fraction(hit: Dict) -> float:
        """Return identities / alignment_length for one hit (0 if undefined)."""
        if 'identity_pct' in hit:
            return hit['identity_pct']
        aligned = hit.get('alignment_length')
        if not aligned:
            return 0
        return hit.get('identities', 0) / aligned

    @staticmethod
    def _count_fasta_records(fasta_file: Path) -> int:
        """Count FASTA header lines ('>') in a file; 0 if it cannot be read."""
        try:
            with open(fasta_file, 'r') as f:
                return sum(1 for line in f if line.startswith('>'))
        except (OSError, UnicodeDecodeError):
            return 0

    def find_homologs(
        self,
        sequence_file: Path,
        min_identity: float = 0.8
    ) -> List[Dict]:
        """
        Find homologous sequences

        Args:
            sequence_file: Input FASTA file
            min_identity: Minimum identity threshold as a fraction (0-1)

        Returns:
            Filtered hit dictionaries, or an empty list if BLAST produced no
            output file
        """
        blast_output = self.blast_runner.run_blastn(sequence_file)

        if blast_output:
            hits = self.blast_runner.parse_results(blast_output)
            return self.blast_runner.filter_hits(hits, min_identity=min_identity)

        return []
|
backend/pipeline/fastq_processor.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FASTQ Processing Pipeline
|
| 3 |
+
Quality control and preprocessing of sequencing data
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List, Optional
|
| 8 |
+
import yaml
|
| 9 |
+
import logging
|
| 10 |
+
from Bio import SeqIO
|
| 11 |
+
from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class FASTQProcessor:
    """Process FASTQ sequencing files

    Thresholds and the output directory come from the ``pipeline.fastq``
    section of a YAML config file. Quality characters are decoded as
    ``ord(char) - 33`` (i.e. the code assumes Phred+33 encoded files).
    """

    def __init__(self, config_path: str = "config.yml"):
        """Load FASTQ settings and create the output directory.

        Args:
            config_path: YAML file containing a ``pipeline.fastq`` mapping
                with ``quality_threshold``, ``min_length`` and ``output_dir``.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required setting is missing.
        """
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)['pipeline']['fastq']

        self.quality_threshold = self.config['quality_threshold']
        self.min_length = self.config['min_length']
        self.output_dir = Path(self.config['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _mean_quality(quality: str) -> float:
        """Return the mean decoded quality of a read (0.0 for an empty read)."""
        if not quality:
            return 0.0
        return sum(ord(q) - 33 for q in quality) / len(quality)

    def quality_filter(
        self,
        input_file: Path,
        output_file: Optional[Path] = None
    ) -> Dict:
        """
        Filter FASTQ reads by quality score

        A read is kept when its mean quality is at least
        ``quality_threshold`` and its length is at least ``min_length``.

        Args:
            input_file: Input FASTQ file
            output_file: Output filtered FASTQ file (defaults to
                ``<output_dir>/<input stem>_filtered.fastq``)

        Returns:
            Statistics dictionary. On an I/O or parse error the counts
            gathered so far are returned; ``pass_rate`` is always present.
        """
        if output_file is None:
            output_file = self.output_dir / f"{input_file.stem}_filtered.fastq"

        stats = {
            'total_reads': 0,
            'passed_reads': 0,
            'failed_reads': 0,
            'total_bases': 0,
            'passed_bases': 0,
            'pass_rate': 0
        }

        try:
            with open(input_file, 'r') as in_f, open(output_file, 'w') as out_f:
                for title, sequence, quality in FastqGeneralIterator(in_f):
                    stats['total_reads'] += 1
                    stats['total_bases'] += len(sequence)

                    # _mean_quality tolerates zero-length reads, so one empty
                    # record no longer aborts the whole file.
                    avg_quality = self._mean_quality(quality)

                    # Check filters
                    if avg_quality >= self.quality_threshold and len(sequence) >= self.min_length:
                        out_f.write(f"@{title}\n{sequence}\n+\n{quality}\n")
                        stats['passed_reads'] += 1
                        stats['passed_bases'] += len(sequence)
                    else:
                        stats['failed_reads'] += 1

            logger.info(f"Filtered {input_file.name}: {stats['passed_reads']}/{stats['total_reads']} reads passed")

        except Exception as e:
            # Best-effort: report what was processed before the failure.
            logger.error(f"Error filtering FASTQ: {e}")

        if stats['total_reads'] > 0:
            stats['pass_rate'] = stats['passed_reads'] / stats['total_reads']
        return stats

    def trim_adapters(
        self,
        input_file: Path,
        adapter_sequence: str,
        output_file: Optional[Path] = None
    ) -> Path:
        """
        Trim adapter sequences from reads

        Each read is cut at the first exact occurrence of
        ``adapter_sequence``; reads shorter than ``min_length`` afterwards
        are not written.

        Args:
            input_file: Input FASTQ file
            adapter_sequence: Adapter sequence to trim (an empty string
                disables trimming)
            output_file: Output trimmed file (defaults to
                ``<output_dir>/<input stem>_trimmed.fastq``)

        Returns:
            Path to the trimmed file, or ``input_file`` if an error occurred
        """
        if output_file is None:
            output_file = self.output_dir / f"{input_file.stem}_trimmed.fastq"

        trimmed_count = 0

        try:
            with open(input_file, 'r') as in_f, open(output_file, 'w') as out_f:
                for title, sequence, quality in FastqGeneralIterator(in_f):
                    # str.find('') returns 0, which would truncate every read
                    # to nothing, so an empty adapter means "no match".
                    adapter_pos = sequence.find(adapter_sequence) if adapter_sequence else -1

                    if adapter_pos != -1:
                        # Trim at adapter position
                        sequence = sequence[:adapter_pos]
                        quality = quality[:adapter_pos]
                        trimmed_count += 1

                    if len(sequence) >= self.min_length:
                        out_f.write(f"@{title}\n{sequence}\n+\n{quality}\n")

            logger.info(f"Trimmed adapters from {trimmed_count} reads")
            return output_file

        except Exception as e:
            logger.error(f"Error trimming adapters: {e}")
            return input_file

    def calculate_statistics(self, fastq_file: Path) -> Dict:
        """
        Calculate statistics for FASTQ file

        Uses running totals instead of per-base lists, so memory use does
        not grow with file size.

        Args:
            fastq_file: FASTQ file to summarise

        Returns:
            Dictionary with read/base counts, min/max/average read length,
            average quality and GC content (percent). All values are 0 when
            no reads are found; on error the partial totals are returned.
        """
        stats = {
            'total_reads': 0,
            'total_bases': 0,
            'min_length': 0,
            'max_length': 0,
            'avg_length': 0,
            'avg_quality': 0,
            'gc_content': 0
        }

        quality_sum = 0
        quality_count = 0
        gc_count = 0

        try:
            with open(fastq_file, 'r') as f:
                for _title, sequence, quality in FastqGeneralIterator(f):
                    seq_len = len(sequence)
                    stats['total_reads'] += 1
                    stats['total_bases'] += seq_len

                    # The first read seeds the minimum; starting from
                    # float('inf') would leak 'inf' out for empty files.
                    if stats['total_reads'] == 1 or seq_len < stats['min_length']:
                        stats['min_length'] = seq_len
                    stats['max_length'] = max(stats['max_length'], seq_len)

                    # Quality scores
                    quality_sum += sum(ord(q) - 33 for q in quality)
                    quality_count += len(quality)

                    # GC content
                    gc_count += sequence.count('G') + sequence.count('C')

            if stats['total_reads'] > 0:
                stats['avg_length'] = stats['total_bases'] / stats['total_reads']
            if quality_count > 0:
                stats['avg_quality'] = quality_sum / quality_count
            if stats['total_bases'] > 0:
                stats['gc_content'] = (gc_count / stats['total_bases']) * 100

            return stats

        except Exception as e:
            logger.error(f"Error calculating statistics: {e}")
            return stats

    def convert_to_fasta(
        self,
        input_file: Path,
        output_file: Optional[Path] = None
    ) -> Path:
        """Convert FASTQ to FASTA format

        Args:
            input_file: Input FASTQ file
            output_file: Output FASTA file (defaults to
                ``<output_dir>/<input stem>.fasta``)

        Returns:
            Path to the FASTA file, or ``input_file`` if conversion failed
        """
        if output_file is None:
            output_file = self.output_dir / f"{input_file.stem}.fasta"

        try:
            count = SeqIO.convert(str(input_file), "fastq", str(output_file), "fasta")
            logger.info(f"Converted {count} sequences to FASTA")
            return output_file

        except Exception as e:
            logger.error(f"Error converting to FASTA: {e}")
            return input_file
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class FASTQQualityControl:
    """Quality control analysis for FASTQ files"""

    def __init__(self):
        """Create the underlying FASTQProcessor with its default config path."""
        self.processor = FASTQProcessor()

    def run_qc(self, fastq_file: Path) -> Dict:
        """
        Run comprehensive QC on FASTQ file

        ``quality_check`` is ``'FAIL'`` when no reads could be read from the
        file, ``'WARN'`` when average quality is below 20 or average read
        length is below 50, and ``'PASS'`` otherwise. GC content outside
        30-70% adds a warning message without changing the status.

        Args:
            fastq_file: FASTQ file to check

        Returns:
            QC report dictionary with ``file``, ``statistics``,
            ``quality_check`` and ``warnings`` keys
        """
        # Calculate statistics
        stats = self.processor.calculate_statistics(fastq_file)

        report = {
            'file': str(fastq_file),
            'statistics': stats,
            'quality_check': 'PASS',
            'warnings': []
        }

        # An empty or unreadable file has all-zero statistics; the metric
        # warnings below would be misleading, so report it as a failure.
        if not stats.get('total_reads'):
            report['warnings'].append('No reads found in file')
            report['quality_check'] = 'FAIL'
            return report

        # Check for issues
        if stats['avg_quality'] < 20:
            report['warnings'].append('Low average quality score')
            report['quality_check'] = 'WARN'

        if stats['avg_length'] < 50:
            report['warnings'].append('Short average read length')
            report['quality_check'] = 'WARN'

        if stats['gc_content'] < 30 or stats['gc_content'] > 70:
            report['warnings'].append(f'Unusual GC content: {stats["gc_content"]:.1f}%')

        return report

    def generate_qc_report(self, fastq_files: List[Path]) -> Dict:
        """Generate QC report for multiple FASTQ files

        Args:
            fastq_files: FASTQ files to check

        Returns:
            ``{'summary': ..., 'file_reports': ...}`` where ``file_reports``
            is keyed by file name and ``summary`` counts every input file.
        """
        all_reports = []
        reports = {}

        for fastq_file in fastq_files:
            report = self.run_qc(fastq_file)
            all_reports.append(report)
            reports[fastq_file.name] = report

        # Count over the full list, not the name-keyed dict: two inputs with
        # the same file name would otherwise be counted once.
        statuses = [r['quality_check'] for r in all_reports]

        # Summary statistics
        summary = {
            'total_files': len(fastq_files),
            'passed': statuses.count('PASS'),
            'warnings': statuses.count('WARN'),
            'failed': statuses.count('FAIL')
        }

        return {
            'summary': summary,
            'file_reports': reports
        }
|
backend/pipeline/variant_caller.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Variant Calling Pipeline
|
| 3 |
+
Process sequencing data to identify genetic variants
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List, Optional
|
| 8 |
+
import yaml
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
|
| 12 |
+
logging.basicConfig(level=logging.INFO)
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class Variant:
    """Represents a genetic variant

    Field order matters: instances are also built positionally.
    """
    chromosome: str  # chromosome / contig name, e.g. 'chr17'
    position: int  # coordinate written to the VCF POS column
    reference: str  # reference allele (VCF REF)
    alternate: str  # alternate allele (VCF ALT)
    quality: float  # call quality (VCF QUAL)
    depth: int  # read depth (VCF INFO DP)
    allele_frequency: float  # allele frequency (VCF INFO AF)
    gene: Optional[str] = None  # gene name (VCF INFO GENE), if annotated
    consequence: Optional[str] = None  # consequence label (VCF INFO CONS), if annotated
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class VariantCaller:
    """Call variants from sequencing data

    Thresholds and the output directory come from the
    ``pipeline.variant_calling`` section of a YAML config file. Variant
    calling itself is simulated (see ``_simulate_variant_calling``).
    """

    def __init__(self, config_path: str = "config.yml"):
        """Load variant-calling settings and create the output directory.

        Args:
            config_path: YAML file containing a ``pipeline.variant_calling``
                mapping with ``min_coverage``, ``min_allele_frequency`` and
                ``output_dir``.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required setting is missing.
        """
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)['pipeline']['variant_calling']

        self.min_coverage = self.config['min_coverage']
        self.min_allele_frequency = self.config['min_allele_frequency']
        self.output_dir = Path(self.config['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def call_variants(
        self,
        alignment_file: Path,
        reference_genome: Path,
        output_vcf: Optional[Path] = None
    ) -> Path:
        """
        Call variants from aligned sequencing data

        The current implementation does not read either input file; it
        writes a fixed, simulated variant set.

        Args:
            alignment_file: BAM/SAM alignment file (only its name is used,
                for logging and the default output name)
            reference_genome: Reference genome FASTA (currently unused)
            output_vcf: Output VCF file (defaults to
                ``<output_dir>/<alignment stem>_variants.vcf``)

        Returns:
            Path to VCF file
        """
        if output_vcf is None:
            output_vcf = self.output_dir / f"{alignment_file.stem}_variants.vcf"

        logger.info(f"Calling variants from {alignment_file.name}")

        # Simulate variant calling for demo
        # In production, use tools like GATK, FreeBayes, or BCFtools
        variants = self._simulate_variant_calling()

        # Write VCF
        self._write_vcf(variants, output_vcf)

        logger.info(f"Identified {len(variants)} variants")
        return output_vcf

    def _simulate_variant_calling(self) -> List[Variant]:
        """Simulate variant calling for demo purposes"""
        # Fixed demo records in cancer-associated genes
        variants = [
            Variant('chr17', 7577538, 'C', 'T', 35.2, 50, 0.45, 'TP53', 'missense'),
            Variant('chr7', 140453136, 'A', 'T', 42.1, 65, 0.52, 'BRAF', 'missense'),
            Variant('chr13', 32914438, 'T', 'C', 38.7, 55, 0.48, 'BRCA2', 'missense'),
            Variant('chr17', 41244936, 'G', 'A', 40.3, 60, 0.50, 'BRCA1', 'missense'),
            Variant('chr3', 178936091, 'G', 'A', 33.5, 48, 0.43, 'PIK3CA', 'missense'),
            Variant('chr9', 133748283, 'T', 'G', 37.9, 52, 0.46, 'ABL1', 'missense'),
            Variant('chr12', 25398284, 'C', 'T', 39.4, 58, 0.49, 'KRAS', 'missense'),
        ]
        return variants

    def _write_vcf(self, variants: List[Variant], output_file: Path):
        """Write variants to VCF format

        A record's FILTER column is ``PASS`` when its depth and allele
        frequency meet the configured minimums, otherwise ``LowQual``.

        Args:
            variants: Variants to write, in output order
            output_file: Destination VCF path (overwritten)
        """
        with open(output_file, 'w') as f:
            # VCF header
            f.write("##fileformat=VCFv4.2\n")
            f.write("##source=CancerAtHomeVariantCaller\n")
            # Declare the non-PASS filter emitted below so the header
            # describes every FILTER value used in the body.
            f.write("##FILTER=<ID=LowQual,Description=\"Depth or allele frequency below configured minimum\">\n")
            f.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n")
            f.write("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n")
            f.write("##INFO=<ID=GENE,Number=1,Type=String,Description=\"Gene Name\">\n")
            f.write("##INFO=<ID=CONS,Number=1,Type=String,Description=\"Consequence\">\n")
            f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

            # Variant records
            for v in variants:
                info = f"DP={v.depth};AF={v.allele_frequency:.3f}"
                if v.gene:
                    info += f";GENE={v.gene}"
                if v.consequence:
                    info += f";CONS={v.consequence}"

                filter_status = "PASS" if v.depth >= self.min_coverage and v.allele_frequency >= self.min_allele_frequency else "LowQual"

                f.write(f"{v.chromosome}\t{v.position}\t.\t{v.reference}\t{v.alternate}\t{v.quality:.1f}\t{filter_status}\t{info}\n")

    def filter_variants(
        self,
        vcf_file: Path,
        min_quality: float = 30.0
    ) -> List[Variant]:
        """Filter variants by quality metrics

        Header lines and records with fewer than eight columns are ignored.
        A record with a missing (``.``) QUAL is dropped; a record whose
        fields cannot be converted is skipped with a warning instead of
        discarding the whole file.

        Args:
            vcf_file: VCF file to read
            min_quality: Minimum QUAL value a record needs to be kept

        Returns:
            Variants with QUAL >= ``min_quality``; empty list if the file
            cannot be read
        """
        variants = []

        try:
            with open(vcf_file, 'r') as f:
                for line_no, line in enumerate(f, start=1):
                    if line.startswith('#'):
                        continue

                    fields = line.strip().split('\t')
                    if len(fields) < 8:
                        continue

                    # '.' marks a missing QUAL; it cannot meet a threshold.
                    if fields[5] == '.':
                        continue

                    try:
                        quality = float(fields[5])
                        if quality < min_quality:
                            continue

                        # Parse INFO field; split on the first '=' only so
                        # values that themselves contain '=' stay intact.
                        info = dict(item.split('=', 1) for item in fields[7].split(';') if '=' in item)

                        variant = Variant(
                            chromosome=fields[0],
                            position=int(fields[1]),
                            reference=fields[3],
                            alternate=fields[4],
                            quality=quality,
                            depth=int(info.get('DP', 0)),
                            allele_frequency=float(info.get('AF', 0)),
                            gene=info.get('GENE'),
                            consequence=info.get('CONS')
                        )
                    except ValueError as e:
                        logger.warning(f"Skipping malformed VCF record at line {line_no}: {e}")
                        continue

                    variants.append(variant)

            logger.info(f"Filtered to {len(variants)} high-quality variants")
            return variants

        except Exception as e:
            logger.error(f"Error filtering variants: {e}")
            return []

    def annotate_variants(self, variants: List[Variant]) -> List[Variant]:
        """
        Annotate variants with functional information

        Simulated: fills a missing gene with ``"UNKNOWN"`` and a missing
        consequence with ``"unknown"``. The input objects are modified in
        place and the same list is returned.

        In production, integrate with tools like:
        - ANNOVAR
        - VEP (Variant Effect Predictor)
        - SnpEff
        """
        # Simulated annotation
        for variant in variants:
            if not variant.gene:
                variant.gene = "UNKNOWN"
            if not variant.consequence:
                variant.consequence = "unknown"

        return variants
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class VariantAnalyzer:
    """Analyze and interpret variants"""

    # Common cancer genes used by identify_cancer_variants
    CANCER_GENES = frozenset({
        'TP53', 'BRCA1', 'BRCA2', 'KRAS', 'EGFR', 'BRAF',
        'PIK3CA', 'APC', 'PTEN', 'MYC', 'RB1', 'CDKN2A'
    })

    # Consequence labels counted towards tumor mutation burden
    CODING_CONSEQUENCES = ('missense', 'nonsense', 'frameshift')

    def __init__(self):
        """Create the underlying VariantCaller with its default config path."""
        self.caller = VariantCaller()

    def identify_cancer_variants(self, variants: List[Variant]) -> List[Variant]:
        """Identify known cancer-associated variants

        Args:
            variants: Variants to screen

        Returns:
            The variants whose ``gene`` is in ``CANCER_GENES``, in input order
        """
        cancer_variants = [
            v for v in variants
            if v.gene and v.gene in self.CANCER_GENES
        ]

        logger.info(f"Found {len(cancer_variants)} cancer-associated variants")
        return cancer_variants

    def calculate_mutation_burden(
        self,
        variants: List[Variant],
        exome_size_mb: float = 30
    ) -> float:
        """Calculate tumor mutation burden (TMB)

        TMB is computed as the number of variants whose consequence is one
        of ``CODING_CONSEQUENCES`` divided by ``exome_size_mb``.

        Args:
            variants: Variants to count
            exome_size_mb: Size of the sequenced region in megabases
                (defaults to an assumed exome size of ~30 Mb)

        Returns:
            Mutations per megabase

        Raises:
            ValueError: If ``exome_size_mb`` is not positive
        """
        if exome_size_mb <= 0:
            raise ValueError("exome_size_mb must be positive")

        # TMB = number of somatic mutations per megabase
        coding_variants = [v for v in variants if v.consequence in self.CODING_CONSEQUENCES]

        tmb = len(coding_variants) / exome_size_mb

        logger.info(f"Tumor Mutation Burden: {tmb:.2f} mutations/Mb")
        return tmb
|
config.yml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cancer@Home Configuration
|
| 2 |
+
|
| 3 |
+
app:
|
| 4 |
+
name: "Cancer@Home v2"
|
| 5 |
+
version: "2.0.0"
|
| 6 |
+
host: "localhost"
|
| 7 |
+
port: 5000
|
| 8 |
+
debug: true
|
| 9 |
+
|
| 10 |
+
neo4j:
|
| 11 |
+
uri: "bolt://localhost:7687"
|
| 12 |
+
username: "neo4j"
|
| 13 |
+
password: "cancer123"
|
| 14 |
+
database: "neo4j"
|
| 15 |
+
max_connection_lifetime: 3600
|
| 16 |
+
max_connection_pool_size: 50
|
| 17 |
+
|
| 18 |
+
gdc:
|
| 19 |
+
api_url: "https://api.gdc.cancer.gov"
|
| 20 |
+
data_endpoint: "/data"
|
| 21 |
+
files_endpoint: "/files"
|
| 22 |
+
cases_endpoint: "/cases"
|
| 23 |
+
download_dir: "./data/gdc"
|
| 24 |
+
max_retries: 3
|
| 25 |
+
timeout: 300
|
| 26 |
+
|
| 27 |
+
boinc:
|
| 28 |
+
project_url: "http://localhost:8000" # Local BOINC server
|
| 29 |
+
username: "cancer_volunteer"
|
| 30 |
+
password: "volunteer123"
|
| 31 |
+
work_dir: "./data/boinc"
|
| 32 |
+
max_concurrent_tasks: 4
|
| 33 |
+
|
| 34 |
+
pipeline:
|
| 35 |
+
fastq:
|
| 36 |
+
quality_threshold: 20
|
| 37 |
+
min_length: 50
|
| 38 |
+
output_dir: "./data/processed/fastq"
|
| 39 |
+
|
| 40 |
+
blast:
|
| 41 |
+
database: "nt"
|
| 42 |
+
evalue: 0.001
|
| 43 |
+
num_threads: 4
|
| 44 |
+
output_dir: "./data/processed/blast"
|
| 45 |
+
|
| 46 |
+
variant_calling:
|
| 47 |
+
min_coverage: 10
|
| 48 |
+
min_allele_frequency: 0.05
|
| 49 |
+
output_dir: "./data/processed/variants"
|
| 50 |
+
|
| 51 |
+
data:
|
| 52 |
+
cache_dir: "./data/cache"
|
| 53 |
+
max_cache_size_gb: 10
|
| 54 |
+
projects:
|
| 55 |
+
- "TCGA-BRCA" # Breast Cancer
|
| 56 |
+
- "TCGA-LUAD" # Lung Adenocarcinoma
|
| 57 |
+
- "TCGA-COAD" # Colon Adenocarcinoma
|
| 58 |
+
- "TCGA-GBM" # Glioblastoma
|
| 59 |
+
- "TARGET-AML" # Acute Myeloid Leukemia
|
| 60 |
+
|
| 61 |
+
logging:
|
| 62 |
+
level: "INFO"
|
| 63 |
+
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 64 |
+
file: "./logs/cancer_at_home.log"
|
| 65 |
+
max_bytes: 10485760 # 10MB
|
| 66 |
+
backup_count: 5
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
neo4j:
|
| 5 |
+
image: neo4j:5.13-community
|
| 6 |
+
container_name: cancer_neo4j
|
| 7 |
+
ports:
|
| 8 |
+
- "7474:7474" # HTTP
|
| 9 |
+
- "7687:7687" # Bolt
|
| 10 |
+
environment:
|
| 11 |
+
- NEO4J_AUTH=neo4j/cancer123
|
| 12 |
+
- NEO4J_PLUGINS=["apoc", "graph-data-science"]
|
| 13 |
+
- NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.*
|
| 14 |
+
- NEO4J_dbms_memory_heap_initial__size=512m
|
| 15 |
+
- NEO4J_dbms_memory_heap_max__size=2G
|
| 16 |
+
volumes:
|
| 17 |
+
- neo4j_data:/data
|
| 18 |
+
- neo4j_logs:/logs
|
| 19 |
+
- neo4j_import:/var/lib/neo4j/import
|
| 20 |
+
healthcheck:
|
| 21 |
+
test: ["CMD", "cypher-shell", "-u", "neo4j", "-p", "cancer123", "RETURN 1"]
|
| 22 |
+
interval: 10s
|
| 23 |
+
timeout: 5s
|
| 24 |
+
retries: 5
|
| 25 |
+
|
| 26 |
+
volumes:
|
| 27 |
+
neo4j_data:
|
| 28 |
+
neo4j_logs:
|
| 29 |
+
neo4j_import:
|
frontend/index.html
ADDED
|
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Cancer@Home v2 - Dashboard</title>
|
| 7 |
+
<script src="https://cdn.jsdelivr.net/npm/d3@7"></script>
|
| 8 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0"></script>
|
| 9 |
+
<style>
|
| 10 |
+
* {
|
| 11 |
+
margin: 0;
|
| 12 |
+
padding: 0;
|
| 13 |
+
box-sizing: border-box;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
body {
|
| 17 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
|
| 18 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 19 |
+
color: #333;
|
| 20 |
+
min-height: 100vh;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
.header {
|
| 24 |
+
background: rgba(0, 0, 0, 0.2);
|
| 25 |
+
color: white;
|
| 26 |
+
padding: 20px;
|
| 27 |
+
text-align: center;
|
| 28 |
+
backdrop-filter: blur(10px);
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
.header h1 {
|
| 32 |
+
font-size: 2.5em;
|
| 33 |
+
margin-bottom: 10px;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.header p {
|
| 37 |
+
opacity: 0.9;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
.container {
|
| 41 |
+
max-width: 1400px;
|
| 42 |
+
margin: 20px auto;
|
| 43 |
+
padding: 0 20px;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
.tabs {
|
| 47 |
+
display: flex;
|
| 48 |
+
gap: 10px;
|
| 49 |
+
margin-bottom: 20px;
|
| 50 |
+
flex-wrap: wrap;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.tab-button {
|
| 54 |
+
background: rgba(255, 255, 255, 0.9);
|
| 55 |
+
border: none;
|
| 56 |
+
padding: 15px 30px;
|
| 57 |
+
border-radius: 8px;
|
| 58 |
+
cursor: pointer;
|
| 59 |
+
font-size: 16px;
|
| 60 |
+
font-weight: 500;
|
| 61 |
+
transition: all 0.3s;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.tab-button:hover {
|
| 65 |
+
background: white;
|
| 66 |
+
transform: translateY(-2px);
|
| 67 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.tab-button.active {
|
| 71 |
+
background: white;
|
| 72 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.tab-content {
|
| 76 |
+
display: none;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.tab-content.active {
|
| 80 |
+
display: block;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.cards {
|
| 84 |
+
display: grid;
|
| 85 |
+
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
| 86 |
+
gap: 20px;
|
| 87 |
+
margin-bottom: 30px;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.card {
|
| 91 |
+
background: white;
|
| 92 |
+
border-radius: 12px;
|
| 93 |
+
padding: 25px;
|
| 94 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.card h3 {
|
| 98 |
+
color: #667eea;
|
| 99 |
+
margin-bottom: 15px;
|
| 100 |
+
font-size: 1.3em;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.stat {
|
| 104 |
+
font-size: 2.5em;
|
| 105 |
+
font-weight: bold;
|
| 106 |
+
color: #764ba2;
|
| 107 |
+
margin: 10px 0;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.graph-container {
|
| 111 |
+
background: white;
|
| 112 |
+
border-radius: 12px;
|
| 113 |
+
padding: 25px;
|
| 114 |
+
margin-bottom: 20px;
|
| 115 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
#neo4j-viz {
|
| 119 |
+
width: 100%;
|
| 120 |
+
height: 600px;
|
| 121 |
+
border: 2px solid #e0e0e0;
|
| 122 |
+
border-radius: 8px;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.button {
|
| 126 |
+
background: #667eea;
|
| 127 |
+
color: white;
|
| 128 |
+
border: none;
|
| 129 |
+
padding: 12px 24px;
|
| 130 |
+
border-radius: 6px;
|
| 131 |
+
cursor: pointer;
|
| 132 |
+
font-size: 16px;
|
| 133 |
+
transition: background 0.3s;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.button:hover {
|
| 137 |
+
background: #5568d3;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.task-list {
|
| 141 |
+
list-style: none;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.task-item {
|
| 145 |
+
background: #f5f5f5;
|
| 146 |
+
padding: 15px;
|
| 147 |
+
margin: 10px 0;
|
| 148 |
+
border-radius: 6px;
|
| 149 |
+
border-left: 4px solid #667eea;
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
.task-item.completed {
|
| 153 |
+
border-left-color: #4caf50;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.task-item.running {
|
| 157 |
+
border-left-color: #ff9800;
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
.status-badge {
|
| 161 |
+
display: inline-block;
|
| 162 |
+
padding: 4px 12px;
|
| 163 |
+
border-radius: 12px;
|
| 164 |
+
font-size: 12px;
|
| 165 |
+
font-weight: 600;
|
| 166 |
+
text-transform: uppercase;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.status-pending { background: #ffc107; color: #000; }
|
| 170 |
+
.status-running { background: #2196f3; color: white; }
|
| 171 |
+
.status-completed { background: #4caf50; color: white; }
|
| 172 |
+
.status-failed { background: #f44336; color: white; }
|
| 173 |
+
|
| 174 |
+
.input-group {
|
| 175 |
+
margin: 15px 0;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.input-group label {
|
| 179 |
+
display: block;
|
| 180 |
+
margin-bottom: 5px;
|
| 181 |
+
font-weight: 500;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.input-group input, .input-group select {
|
| 185 |
+
width: 100%;
|
| 186 |
+
padding: 10px;
|
| 187 |
+
border: 1px solid #ddd;
|
| 188 |
+
border-radius: 6px;
|
| 189 |
+
font-size: 14px;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.project-card {
|
| 193 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 194 |
+
color: white;
|
| 195 |
+
padding: 20px;
|
| 196 |
+
border-radius: 8px;
|
| 197 |
+
margin: 10px 0;
|
| 198 |
+
cursor: pointer;
|
| 199 |
+
transition: transform 0.2s;
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
.project-card:hover {
|
| 203 |
+
transform: translateY(-3px);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.loading {
|
| 207 |
+
text-align: center;
|
| 208 |
+
padding: 40px;
|
| 209 |
+
color: #667eea;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
@keyframes spin {
|
| 213 |
+
to { transform: rotate(360deg); }
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
.spinner {
|
| 217 |
+
border: 4px solid #f3f3f3;
|
| 218 |
+
border-top: 4px solid #667eea;
|
| 219 |
+
border-radius: 50%;
|
| 220 |
+
width: 40px;
|
| 221 |
+
height: 40px;
|
| 222 |
+
animation: spin 1s linear infinite;
|
| 223 |
+
margin: 20px auto;
|
| 224 |
+
}
|
| 225 |
+
</style>
|
| 226 |
+
</head>
|
| 227 |
+
<body>
|
| 228 |
+
<div class="header">
|
| 229 |
+
<h1>🧬 Cancer@Home v2</h1>
|
| 230 |
+
<p>Distributed Cancer Genomics Research Platform</p>
|
| 231 |
+
</div>
|
| 232 |
+
|
| 233 |
+
<div class="container">
|
| 234 |
+
<div class="tabs">
|
| 235 |
+
<button class="tab-button active" onclick="showTab('dashboard')">📊 Dashboard</button>
|
| 236 |
+
<button class="tab-button" onclick="showTab('neo4j')">🔍 Neo4j Visualization</button>
|
| 237 |
+
<button class="tab-button" onclick="showTab('boinc')">⚡ BOINC Tasks</button>
|
| 238 |
+
<button class="tab-button" onclick="showTab('gdc')">📚 GDC Data</button>
|
| 239 |
+
<button class="tab-button" onclick="showTab('pipeline')">🧪 Analysis Pipeline</button>
|
| 240 |
+
</div>
|
| 241 |
+
|
| 242 |
+
<!-- Dashboard Tab -->
|
| 243 |
+
<div id="dashboard" class="tab-content active">
|
| 244 |
+
<div class="cards" id="stats-cards">
|
| 245 |
+
<div class="card">
|
| 246 |
+
<h3>Total Genes</h3>
|
| 247 |
+
<div class="stat" id="total-genes">-</div>
|
| 248 |
+
</div>
|
| 249 |
+
<div class="card">
|
| 250 |
+
<h3>Total Mutations</h3>
|
| 251 |
+
<div class="stat" id="total-mutations">-</div>
|
| 252 |
+
</div>
|
| 253 |
+
<div class="card">
|
| 254 |
+
<h3>Total Patients</h3>
|
| 255 |
+
<div class="stat" id="total-patients">-</div>
|
| 256 |
+
</div>
|
| 257 |
+
<div class="card">
|
| 258 |
+
<h3>Cancer Types</h3>
|
| 259 |
+
<div class="stat" id="total-cancer-types">-</div>
|
| 260 |
+
</div>
|
| 261 |
+
</div>
|
| 262 |
+
|
| 263 |
+
<div class="graph-container">
|
| 264 |
+
<h3>Mutation Distribution by Cancer Type</h3>
|
| 265 |
+
<canvas id="mutation-chart"></canvas>
|
| 266 |
+
</div>
|
| 267 |
+
</div>
|
| 268 |
+
|
| 269 |
+
<!-- Neo4j Visualization Tab -->
|
| 270 |
+
<div id="neo4j" class="tab-content">
|
| 271 |
+
<div class="graph-container">
|
| 272 |
+
<h3>Cancer Genomics Knowledge Graph</h3>
|
| 273 |
+
<div id="neo4j-viz"></div>
|
| 274 |
+
</div>
|
| 275 |
+
</div>
|
| 276 |
+
|
| 277 |
+
<!-- BOINC Tasks Tab -->
|
| 278 |
+
<div id="boinc" class="tab-content">
|
| 279 |
+
<div class="cards">
|
| 280 |
+
<div class="card">
|
| 281 |
+
<h3>Submit New Task</h3>
|
| 282 |
+
<div class="input-group">
|
| 283 |
+
<label>Task Type</label>
|
| 284 |
+
<select id="task-type">
|
| 285 |
+
<option value="variant_calling">Variant Calling</option>
|
| 286 |
+
<option value="blast_search">BLAST Search</option>
|
| 287 |
+
<option value="alignment">Sequence Alignment</option>
|
| 288 |
+
</select>
|
| 289 |
+
</div>
|
| 290 |
+
<div class="input-group">
|
| 291 |
+
<label>Input File</label>
|
| 292 |
+
<input type="text" id="input-file" placeholder="path/to/input.fastq">
|
| 293 |
+
</div>
|
| 294 |
+
<button class="button" onclick="submitBoincTask()">Submit Task</button>
|
| 295 |
+
</div>
|
| 296 |
+
|
| 297 |
+
<div class="card">
|
| 298 |
+
<h3>BOINC Statistics</h3>
|
| 299 |
+
<div id="boinc-stats"></div>
|
| 300 |
+
</div>
|
| 301 |
+
</div>
|
| 302 |
+
|
| 303 |
+
<div class="card">
|
| 304 |
+
<h3>Active Tasks</h3>
|
| 305 |
+
<ul class="task-list" id="task-list"></ul>
|
| 306 |
+
</div>
|
| 307 |
+
</div>
|
| 308 |
+
|
| 309 |
+
<!-- GDC Data Tab -->
|
| 310 |
+
<div id="gdc" class="tab-content">
|
| 311 |
+
<div class="card">
|
| 312 |
+
<h3>Available GDC Projects</h3>
|
| 313 |
+
<div id="gdc-projects"></div>
|
| 314 |
+
</div>
|
| 315 |
+
</div>
|
| 316 |
+
|
| 317 |
+
<!-- Pipeline Tab -->
|
| 318 |
+
<div id="pipeline" class="tab-content">
|
| 319 |
+
<div class="cards">
|
| 320 |
+
<div class="card">
|
| 321 |
+
<h3>FASTQ Quality Control</h3>
|
| 322 |
+
<p>Run quality control analysis on sequencing data</p>
|
| 323 |
+
<button class="button" style="margin-top: 15px;">Run QC</button>
|
| 324 |
+
</div>
|
| 325 |
+
<div class="card">
|
| 326 |
+
<h3>BLAST Search</h3>
|
| 327 |
+
<p>Perform sequence alignment and homology search</p>
|
| 328 |
+
<button class="button" style="margin-top: 15px;">Run BLAST</button>
|
| 329 |
+
</div>
|
| 330 |
+
<div class="card">
|
| 331 |
+
<h3>Variant Calling</h3>
|
| 332 |
+
<p>Identify genetic variants from sequencing data</p>
|
| 333 |
+
<button class="button" style="margin-top: 15px;">Call Variants</button>
|
| 334 |
+
</div>
|
| 335 |
+
</div>
|
| 336 |
+
</div>
|
| 337 |
+
</div>
|
| 338 |
+
|
| 339 |
+
<script>
|
| 340 |
+
// Tab switching
|
| 341 |
+
/**
 * Show one tab panel, highlight its button, and load that tab's data.
 *
 * The button to highlight is found from its inline `onclick` attribute
 * (`showTab('<tabName>')`) rather than from the implicit global `event`,
 * so the function also works when it is called from script.
 *
 * @param {string} tabName - id of the `.tab-content` element to show.
 */
function showTab(tabName) {
    const panel = document.getElementById(tabName);
    if (!panel) {
        // Leave the current tab untouched instead of blanking the page.
        console.error(`Unknown tab: ${tabName}`);
        return;
    }

    document.querySelectorAll('.tab-content').forEach(tab => {
        tab.classList.remove('active');
    });
    panel.classList.add('active');

    const marker = `'${tabName}'`;
    document.querySelectorAll('.tab-button').forEach(btn => {
        const handler = btn.getAttribute('onclick') || '';
        btn.classList.toggle('active', handler.includes(marker));
    });

    // The 'pipeline' tab has no loader: its content is static markup.
    if (tabName === 'dashboard') loadDashboard();
    else if (tabName === 'neo4j') loadNeo4jViz();
    else if (tabName === 'boinc') loadBoincTasks();
    else if (tabName === 'gdc') loadGdcProjects();
}
|
| 357 |
+
|
| 358 |
+
// Load dashboard data
|
| 359 |
+
/**
 * Fetch the summary counts from `/api/neo4j/summary`, write them into the
 * four dashboard stat cards, and draw the mutation chart.
 *
 * Failures (network error, non-2xx status, invalid JSON) are logged to the
 * console; the cards then keep whatever they were showing before.
 */
async function loadDashboard() {
    try {
        const response = await fetch('/api/neo4j/summary');
        if (!response.ok) {
            // Without this check an error payload would be rendered as zeros.
            throw new Error(`Summary request failed with HTTP ${response.status}`);
        }
        const data = await response.json();

        document.getElementById('total-genes').textContent = data.genes || 0;
        document.getElementById('total-mutations').textContent = data.mutations || 0;
        document.getElementById('total-patients').textContent = data.patients || 0;
        document.getElementById('total-cancer-types').textContent = data.cancer_types || 0;

        createMutationChart();
    } catch (error) {
        console.error('Error loading dashboard:', error);
    }
}
|
| 374 |
+
|
| 375 |
+
// Create mutation chart
|
| 376 |
+
/**
 * Draw the "Mutation Distribution by Cancer Type" bar chart on the
 * `#mutation-chart` canvas.
 *
 * The labels and values are fixed sample figures defined in this function.
 * Safe to call repeatedly: any chart already bound to the canvas is
 * destroyed before a new one is created.
 */
function createMutationChart() {
    const canvas = document.getElementById('mutation-chart');

    // This runs on page load and again whenever the Dashboard tab is shown.
    // Chart.js refuses to attach a second chart to a canvas that is still in
    // use, so release the previous instance first.
    const previous = Chart.getChart(canvas);
    if (previous) {
        previous.destroy();
    }

    const ctx = canvas.getContext('2d');
    new Chart(ctx, {
        type: 'bar',
        data: {
            labels: ['Breast Cancer', 'Lung Adenocarcinoma', 'Colon Adenocarcinoma', 'Glioblastoma'],
            datasets: [{
                label: 'Mutations',
                data: [245, 189, 156, 203],
                backgroundColor: [
                    'rgba(102, 126, 234, 0.8)',
                    'rgba(118, 75, 162, 0.8)',
                    'rgba(237, 100, 166, 0.8)',
                    'rgba(255, 154, 158, 0.8)'
                ]
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: true,
            plugins: {
                legend: { display: false }
            }
        }
    });
}
|
| 402 |
+
|
| 403 |
+
// Load Neo4j visualization
|
| 404 |
+
/**
 * Render the knowledge-graph panel inside `#neo4j-viz`.
 *
 * Shows a loading spinner, then after 500 ms replaces it with an SVG drawn
 * with D3 from a small hard-coded sample graph (genes, patients, one cancer
 * type). No request is made to the backend.
 */
function loadNeo4jViz() {
    const container = document.getElementById('neo4j-viz');
    container.innerHTML = '<div class="loading"><div class="spinner"></div><p>Loading graph visualization...</p></div>';

    // Simulate graph visualization with D3.js
    const drawSampleGraph = () => {
        const width = container.clientWidth;
        const height = 600;

        container.innerHTML = '';
        const svg = d3.select('#neo4j-viz')
            .append('svg')
            .attr('width', width)
            .attr('height', height);

        // Sample data: positions are fixed fractions of the drawing area.
        const nodes = [
            { id: 'TP53', type: 'gene', x: width/2, y: height/2 },
            { id: 'BRCA1', type: 'gene', x: width/3, y: height/3 },
            { id: 'KRAS', type: 'gene', x: 2*width/3, y: height/3 },
            { id: 'Patient 1', type: 'patient', x: width/4, y: 3*height/4 },
            { id: 'Patient 2', type: 'patient', x: 3*width/4, y: 3*height/4 },
            { id: 'Breast Cancer', type: 'cancer', x: width/2, y: height/4 }
        ];

        const links = [
            { source: 'Patient 1', target: 'TP53' },
            { source: 'Patient 1', target: 'Breast Cancer' },
            { source: 'Patient 2', target: 'KRAS' },
            { source: 'TP53', target: 'Breast Cancer' }
        ];

        // Resolve a link endpoint (a node id) to its node object.
        const nodeOf = id => nodes.find(node => node.id === id);

        // Gene and patient nodes have their own colour; everything else shares one.
        const fillFor = node => {
            if (node.type === 'gene') return '#667eea';
            if (node.type === 'patient') return '#764ba2';
            return '#ed64a6';
        };

        // Links are drawn first so that the circles sit on top of them.
        svg.selectAll('line')
            .data(links)
            .enter()
            .append('line')
            .attr('x1', link => nodeOf(link.source).x)
            .attr('y1', link => nodeOf(link.source).y)
            .attr('x2', link => nodeOf(link.target).x)
            .attr('y2', link => nodeOf(link.target).y)
            .attr('stroke', '#999')
            .attr('stroke-width', 2);

        // Nodes
        svg.selectAll('circle')
            .data(nodes)
            .enter()
            .append('circle')
            .attr('cx', node => node.x)
            .attr('cy', node => node.y)
            .attr('r', 20)
            .attr('fill', fillFor);

        // Labels, centred 25px above each node
        svg.selectAll('text')
            .data(nodes)
            .enter()
            .append('text')
            .attr('x', node => node.x)
            .attr('y', node => node.y - 25)
            .attr('text-anchor', 'middle')
            .text(node => node.id)
            .attr('font-size', '12px')
            .attr('fill', '#333');
    };

    setTimeout(drawSampleGraph, 500);
}
|
| 475 |
+
|
| 476 |
+
// Load BOINC tasks
|
| 477 |
+
/**
 * Fetch the BOINC task list and statistics in parallel and render them into
 * `#task-list` and `#boinc-stats`.
 *
 * Every value taken from the API responses is HTML-escaped before it is
 * placed into markup. Failures (network error, non-2xx status, invalid JSON)
 * are logged to the console and leave the current content in place.
 */
async function loadBoincTasks() {
    // Escape text for safe use in HTML element content and attribute values.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => htmlEntities[ch]);

    try {
        const [tasksResponse, statsResponse] = await Promise.all([
            fetch('/api/boinc/tasks'),
            fetch('/api/boinc/statistics')
        ]);

        if (!tasksResponse.ok) {
            throw new Error(`Task list request failed with HTTP ${tasksResponse.status}`);
        }
        if (!statsResponse.ok) {
            throw new Error(`Statistics request failed with HTTP ${statsResponse.status}`);
        }

        const tasksData = await tasksResponse.json();
        const statsData = await statsResponse.json();

        // Display tasks
        const taskList = document.getElementById('task-list');
        taskList.innerHTML = tasksData.tasks.map(task => {
            // The status doubles as a CSS class suffix, so it is escaped too.
            const status = escapeHtml(task.status);
            const created = escapeHtml(new Date(task.created_at).toLocaleString());
            return `
                <li class="task-item ${status}">
                    <strong>${escapeHtml(task.name)}</strong>
                    <span class="status-badge status-${status}">${status}</span>
                    <div style="margin-top: 8px; font-size: 14px; color: #666;">
                        Type: ${escapeHtml(task.workunit_type)} | Created: ${created}
                    </div>
                </li>
            `;
        }).join('');

        // Display statistics
        const statsDiv = document.getElementById('boinc-stats');
        statsDiv.innerHTML = `
            <p><strong>Total Tasks:</strong> ${escapeHtml(statsData.total_tasks)}</p>
            <p><strong>Completed:</strong> ${escapeHtml(statsData.by_status?.completed || 0)}</p>
            <p><strong>Running:</strong> ${escapeHtml(statsData.by_status?.running || 0)}</p>
            <p><strong>Pending:</strong> ${escapeHtml(statsData.by_status?.pending || 0)}</p>
        `;
    } catch (error) {
        console.error('Error loading BOINC tasks:', error);
    }
}
|
| 511 |
+
|
| 512 |
+
// Submit BOINC task
|
| 513 |
+
/**
 * Submit a new BOINC task built from the "Submit New Task" form.
 *
 * Reads the task type from `#task-type` and the input path from
 * `#input-file`, POSTs them as JSON to `/api/boinc/submit`, reports the
 * outcome with `alert`, and refreshes the task list on success.
 */
async function submitBoincTask() {
    const taskType = document.getElementById('task-type').value;
    const inputFile = document.getElementById('input-file').value;

    if (!inputFile) {
        alert('Please provide an input file path');
        return;
    }

    try {
        const response = await fetch('/api/boinc/submit', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ workunit_type: taskType, input_file: inputFile })
        });

        // fetch() resolves on HTTP error statuses as well; without this check
        // a rejected submission would be reported as a success.
        if (!response.ok) {
            throw new Error(`Submit request failed with HTTP ${response.status}`);
        }

        const data = await response.json();
        alert(`Task submitted successfully! Task ID: ${data.task_id}`);
        loadBoincTasks();
    } catch (error) {
        console.error('Error submitting task:', error);
        alert('Failed to submit task');
    }
}
|
| 537 |
+
|
| 538 |
+
// Load GDC projects
|
| 539 |
+
/**
 * Fetch the project list from `/api/gdc/projects` and render one card per
 * project into `#gdc-projects`.
 *
 * Every value taken from the API response is HTML-escaped before it is
 * placed into markup. Failures (network error, non-2xx status, invalid JSON)
 * are logged to the console and leave the current content in place.
 */
async function loadGdcProjects() {
    // Escape text for safe use in HTML element content.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => htmlEntities[ch]);

    try {
        const response = await fetch('/api/gdc/projects');
        if (!response.ok) {
            throw new Error(`Projects request failed with HTTP ${response.status}`);
        }
        const data = await response.json();

        const projectsDiv = document.getElementById('gdc-projects');
        projectsDiv.innerHTML = data.projects.map(project => `
            <div class="project-card">
                <h4>${escapeHtml(project.name)}</h4>
                <p>Project ID: ${escapeHtml(project.id)}</p>
                <p>Cases: ${escapeHtml(project.cases)}</p>
            </div>
        `).join('');
    } catch (error) {
        console.error('Error loading GDC projects:', error);
    }
}
|
| 556 |
+
|
| 557 |
+
// Initialize dashboard on load
|
| 558 |
+
// The Dashboard tab is the one marked active in the markup, so populate it
// as soon as the page has finished loading.
window.onload = function () {
    loadDashboard();
};
|
| 561 |
+
</script>
|
| 562 |
+
</body>
|
| 563 |
+
</html>
|
requirements.txt
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web Framework
|
| 2 |
+
fastapi==0.104.1
|
| 3 |
+
uvicorn[standard]==0.24.0
|
| 4 |
+
python-multipart==0.0.6
|
| 5 |
+
jinja2==3.1.2
|
| 6 |
+
|
| 7 |
+
# Neo4j
|
| 8 |
+
neo4j==5.14.1
|
| 9 |
+
py2neo==2021.2.3
|
| 10 |
+
neomodel==5.2.1
|
| 11 |
+
|
| 12 |
+
# GraphQL
|
| 13 |
+
strawberry-graphql[fastapi]==0.216.1
|
| 14 |
+
graphene==3.3
|
| 15 |
+
|
| 16 |
+
# HTTP Clients
|
| 17 |
+
requests==2.31.0
|
| 18 |
+
aiohttp==3.9.1
|
| 19 |
+
httpx==0.25.2
|
| 20 |
+
|
| 21 |
+
# Data Processing
|
| 22 |
+
pandas==2.1.4
|
| 23 |
+
numpy==1.26.2
|
| 24 |
+
biopython==1.81
|
| 25 |
+
|
| 26 |
+
# GDC API Client
|
| 27 |
+
gdc-client==1.6.1
|
| 28 |
+
|
| 29 |
+
# BLAST
|
| 30 |
+
# biopython==1.81 is already pinned above under "Data Processing"
|
| 31 |
+
|
| 32 |
+
# Configuration
|
| 33 |
+
pyyaml==6.0.1
|
| 34 |
+
python-dotenv==1.0.0
|
| 35 |
+
|
| 36 |
+
# WebSocket
|
| 37 |
+
websockets==12.0
|
| 38 |
+
|
| 39 |
+
# Database
|
| 40 |
+
sqlalchemy==2.0.23
|
| 41 |
+
|
| 42 |
+
# Utilities
|
| 43 |
+
click==8.1.7
|
| 44 |
+
rich==13.7.0
|
| 45 |
+
tqdm==4.66.1
|
| 46 |
+
|
| 47 |
+
# Development
|
| 48 |
+
pytest==7.4.3
|
| 49 |
+
pytest-asyncio==0.21.1
|
| 50 |
+
black==23.12.1
|
| 51 |
+
flake8==6.1.0
|
run.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Cancer@Home v2 - Main Entry Point
|
| 4 |
+
Quick start script for the entire application
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
import subprocess
|
| 10 |
+
import webbrowser
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from rich.console import Console
|
| 13 |
+
from rich.panel import Panel
|
| 14 |
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
| 15 |
+
import yaml
|
| 16 |
+
|
| 17 |
+
console = Console()
|
| 18 |
+
|
| 19 |
+
def load_config(path='config.yml'):
    """Load the application configuration from a YAML file.

    Args:
        path: Location of the YAML configuration file. Defaults to
            ``config.yml``, resolved against the current working directory.

    Returns:
        The parsed document, as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
| 23 |
+
|
| 24 |
+
def check_docker():
    """Check if Docker is running.

    Returns:
        True when ``docker ps`` exits successfully; False when the command
        cannot be started or exits with a non-zero status.
    """
    try:
        subprocess.run(['docker', 'ps'], capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError: the docker executable is missing or cannot be executed.
        # CalledProcessError: the CLI ran but reported a failure.
        # Anything else (notably KeyboardInterrupt) is allowed to propagate
        # instead of being reported as "Docker is not running".
        return False
    return True
|
| 31 |
+
|
| 32 |
+
def setup_directories():
    """Create the data and log directories under the current working directory."""
    required = (
        'data/gdc',
        'data/boinc',
        'data/processed/fastq',
        'data/processed/blast',
        'data/processed/variants',
        'data/cache',
        'logs',
    )
    for relative in required:
        # Missing parents are created; directories that already exist are kept.
        Path(relative).mkdir(parents=True, exist_ok=True)
|
| 45 |
+
|
| 46 |
+
def start_neo4j():
    """Bring up the Docker Compose services, then pause while Neo4j starts.

    Raises:
        subprocess.CalledProcessError: If ``docker-compose up -d`` fails.
    """
    console.print("[cyan]Starting Neo4j database...[/cyan]")
    compose_up = ['docker-compose', 'up', '-d']
    subprocess.run(compose_up, check=True)

    # Readiness is not probed here: the function simply waits a fixed time.
    startup_pause_seconds = 10
    console.print("[yellow]Waiting for Neo4j to be ready...[/yellow]")
    time.sleep(startup_pause_seconds)
|
| 54 |
+
|
| 55 |
+
def initialize_database():
    """Create the Neo4j schema through the project's ``DatabaseManager``."""
    # Imported at call time rather than at module import.
    from backend.neo4j.db_manager import DatabaseManager

    console.print("[cyan]Initializing database schema...[/cyan]")
    manager = DatabaseManager()
    manager.initialize_schema()
    console.print("[green]✓ Database initialized[/green]")
|
| 63 |
+
|
| 64 |
+
def start_backend():
    """Launch the FastAPI application with uvicorn on a daemon thread.

    The server is bound to the host and port from the ``app`` section of the
    configuration. After starting the thread the function pauses for three
    seconds before returning.
    """
    console.print("[cyan]Starting backend server...[/cyan]")
    import uvicorn
    from backend.api.main import app

    settings = load_config()

    # Run in background
    import threading

    def _serve():
        # Host and port are looked up when the thread runs.
        app_settings = settings['app']
        uvicorn.run(
            app,
            host=app_settings['host'],
            port=app_settings['port'],
            log_level="info",
        )

    # Daemon thread: it does not keep the process alive once main() exits.
    server_thread = threading.Thread(target=_serve, daemon=True)
    server_thread.start()
    time.sleep(3)
|
| 85 |
+
|
| 86 |
+
def open_browser():
    """Announce the application URL and open it in the default web browser."""
    app_settings = load_config()['app']
    host = app_settings['host']
    port = app_settings['port']
    url = f"http://{host}:{port}"

    console.print(f"[green]✓ Opening browser at {url}[/green]")
    # Short pause before the browser is asked to open the page.
    time.sleep(2)
    webbrowser.open(url)
|
| 93 |
+
|
| 94 |
+
def main():
    """Start every Cancer@Home component and block until interrupted.

    Steps, in order: create the working directories, verify that Docker is
    running, start Neo4j, initialize the database schema and start the
    backend server. The service URLs are then printed, the browser is opened,
    and the function idles until Ctrl+C.

    On Ctrl+C the Docker Compose services are taken down and the process
    exits with status 0. The process exits with status 1 when Docker is not
    running or when any step raises an exception.
    """
    console.clear()

    # Display banner
    banner = """
    ╔═══════════════════════════════════════════╗
    ║ Cancer@Home v2.0 ║
    ║ Distributed Cancer Genomics Research ║
    ╚═══════════════════════════════════════════╝
    """
    console.print(Panel(banner, style="bold blue"))

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            # Each step is a one-unit task. A task with total=None can never
            # reach the finished state, which would leave its spinner
            # animating after the step is done.

            # Setup
            task = progress.add_task("[cyan]Setting up directories...", total=1)
            setup_directories()
            progress.update(task, completed=1)

            # Check Docker
            task = progress.add_task("[cyan]Checking Docker...", total=1)
            if not check_docker():
                console.print("[red]✗ Docker is not running. Please start Docker Desktop.[/red]")
                sys.exit(1)
            progress.update(task, completed=1)

            # Start Neo4j
            task = progress.add_task("[cyan]Starting Neo4j...", total=1)
            start_neo4j()
            progress.update(task, completed=1)

            # Initialize database
            task = progress.add_task("[cyan]Initializing database...", total=1)
            initialize_database()
            progress.update(task, completed=1)

            # Start backend
            task = progress.add_task("[cyan]Starting backend server...", total=1)
            start_backend()
            progress.update(task, completed=1)

        # The live progress display is closed at this point, so the summary
        # and the keep-alive loop below do not run inside it.
        console.print("\n[bold green]✓ Cancer@Home is running![/bold green]\n")

        config = load_config()
        base_url = f"http://{config['app']['host']}:{config['app']['port']}"
        console.print(f"[cyan]→ Application:[/cyan] {base_url}")
        console.print("[cyan]→ Neo4j Browser:[/cyan] http://localhost:7474")
        console.print(f"[cyan]→ API Docs:[/cyan] {base_url}/docs")
        console.print(f"[cyan]→ GraphQL:[/cyan] {base_url}/graphql\n")

        console.print("[yellow]Press Ctrl+C to stop the server[/yellow]\n")

        # Open browser
        open_browser()

        # Keep the main thread alive: the backend runs on a daemon thread
        # and would stop as soon as this function returned.
        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        console.print("\n[yellow]Shutting down...[/yellow]")
        subprocess.run(['docker-compose', 'down'])
        console.print("[green]✓ Goodbye![/green]")
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        console.print(f"[red]✗ Error: {e}[/red]")
        sys.exit(1)
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
|
| 168 |
+
main()
|
setup.ps1
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Windows Setup Script for Cancer@Home v2
# Run this in PowerShell as Administrator
#
# Verifies that Python and Docker are available, creates a virtual
# environment, installs requirements.txt, creates the data/log directories
# and starts the docker-compose services. The exit code of every native
# command is checked so that a failed step stops the script instead of being
# followed by a success message.

function Assert-LastExitCode {
    # Stop the setup when the most recent native command exited non-zero.
    param([string]$Step)
    if ($LASTEXITCODE -ne 0) {
        Write-Host "✗ $Step failed (exit code $LASTEXITCODE)" -ForegroundColor Red
        exit 1
    }
}

Write-Host "==================================" -ForegroundColor Cyan
Write-Host "Cancer@Home v2 - Windows Setup" -ForegroundColor Cyan
Write-Host "==================================" -ForegroundColor Cyan
Write-Host ""

# Check Python
Write-Host "Checking Python installation..." -ForegroundColor Yellow
try {
    $pythonVersion = python --version 2>&1
    # Being resolvable on PATH is not enough: a non-zero exit is also "not found".
    if ($LASTEXITCODE -ne 0) { throw "python exited with code $LASTEXITCODE" }
    Write-Host "✓ Python found: $pythonVersion" -ForegroundColor Green
} catch {
    Write-Host "✗ Python not found. Please install Python 3.8+ from https://www.python.org/" -ForegroundColor Red
    exit 1
}

# Check Docker
Write-Host "Checking Docker installation..." -ForegroundColor Yellow
try {
    $dockerVersion = docker --version 2>&1
    if ($LASTEXITCODE -ne 0) { throw "docker exited with code $LASTEXITCODE" }
    Write-Host "✓ Docker found: $dockerVersion" -ForegroundColor Green
} catch {
    Write-Host "✗ Docker not found. Please install Docker Desktop from https://www.docker.com/products/docker-desktop" -ForegroundColor Red
    exit 1
}

# Create virtual environment
Write-Host ""
Write-Host "Creating Python virtual environment..." -ForegroundColor Yellow
python -m venv venv
Assert-LastExitCode "Virtual environment creation"
Write-Host "✓ Virtual environment created" -ForegroundColor Green

# Activate virtual environment and install dependencies
Write-Host ""
Write-Host "Installing Python dependencies..." -ForegroundColor Yellow
& ".\venv\Scripts\Activate.ps1"
# Upgrade pip through the interpreter: on Windows the pip executable cannot
# replace itself while it is running.
python -m pip install --upgrade pip
Assert-LastExitCode "pip upgrade"
python -m pip install -r requirements.txt
Assert-LastExitCode "Dependency installation"
Write-Host "✓ Dependencies installed" -ForegroundColor Green

# Create necessary directories
Write-Host ""
Write-Host "Creating directory structure..." -ForegroundColor Yellow
$dirs = @(
    "data\gdc",
    "data\boinc",
    "data\processed\fastq",
    "data\processed\blast",
    "data\processed\variants",
    "data\cache",
    "logs"
)

foreach ($dir in $dirs) {
    # -Force makes this a no-op for directories that already exist.
    New-Item -ItemType Directory -Force -Path $dir | Out-Null
}
Write-Host "✓ Directories created" -ForegroundColor Green

# Start Docker containers
Write-Host ""
Write-Host "Starting Neo4j database..." -ForegroundColor Yellow
docker-compose up -d
Assert-LastExitCode "docker-compose up"
# Fixed grace period for the database container to come up.
Start-Sleep -Seconds 10
Write-Host "✓ Neo4j started" -ForegroundColor Green

Write-Host ""
Write-Host "==================================" -ForegroundColor Cyan
Write-Host "Setup Complete!" -ForegroundColor Green
Write-Host "==================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "To start the application:" -ForegroundColor Yellow
Write-Host "  1. Activate virtual environment: .\venv\Scripts\Activate.ps1" -ForegroundColor White
Write-Host "  2. Run the application: python run.py" -ForegroundColor White
Write-Host ""
Write-Host "Access points:" -ForegroundColor Yellow
Write-Host "  - Application: http://localhost:5000" -ForegroundColor White
Write-Host "  - Neo4j Browser: http://localhost:7474 (neo4j/cancer123)" -ForegroundColor White
Write-Host "  - API Docs: http://localhost:5000/docs" -ForegroundColor White
Write-Host ""
|
setup.sh
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Linux/Mac Setup Script for Cancer@Home v2
#
# Verifies that Python 3 and Docker are available, creates a virtual
# environment, installs requirements.txt, creates the data/log directories
# and starts the docker-compose services.

# Abort on the first failing command so a broken step is never followed by a
# success message.
set -e

echo "=================================="
echo "Cancer@Home v2 - Setup"
echo "=================================="
echo ""

# Check Python
echo "Checking Python installation..."
if command -v python3 &> /dev/null; then
    PYTHON_VERSION=$(python3 --version)
    echo "✓ Python found: $PYTHON_VERSION"
else
    echo "✗ Python not found. Please install Python 3.8+"
    exit 1
fi

# Check Docker
echo "Checking Docker installation..."
if command -v docker &> /dev/null; then
    DOCKER_VERSION=$(docker --version)
    echo "✓ Docker found: $DOCKER_VERSION"
else
    echo "✗ Docker not found. Please install Docker"
    exit 1
fi

# Create virtual environment
echo ""
echo "Creating Python virtual environment..."
python3 -m venv venv
echo "✓ Virtual environment created"

# Activate virtual environment and install dependencies
echo ""
echo "Installing Python dependencies..."
source venv/bin/activate
# Run pip through the activated interpreter so the venv's pip is the one used.
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
echo "✓ Dependencies installed"

# Create necessary directories
echo ""
echo "Creating directory structure..."
# -p creates parents and is a no-op for directories that already exist.
mkdir -p data/gdc
mkdir -p data/boinc
mkdir -p data/processed/fastq
mkdir -p data/processed/blast
mkdir -p data/processed/variants
mkdir -p data/cache
mkdir -p logs
echo "✓ Directories created"

# Start Docker containers
echo ""
echo "Starting Neo4j database..."
docker-compose up -d
# Fixed grace period for the database container to come up.
sleep 10
echo "✓ Neo4j started"

echo ""
echo "=================================="
echo "Setup Complete!"
echo "=================================="
echo ""
echo "To start the application:"
echo "  1. Activate virtual environment: source venv/bin/activate"
echo "  2. Run the application: python run.py"
echo ""
echo "Access points:"
echo "  - Application: http://localhost:5000"
echo "  - Neo4j Browser: http://localhost:7474 (neo4j/cancer123)"
echo "  - API Docs: http://localhost:5000/docs"
echo ""
|
test_cancer_at_home.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test Suite for Cancer@Home v2
|
| 3 |
+
Run with: pytest test_cancer_at_home.py
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestConfiguration:
    """Check that the top-level configuration files are present.

    Paths are resolved against the directory containing this test file rather
    than the current working directory, so the result does not depend on
    where pytest is launched from.
    """

    # Assumes this test file lives in the project root next to config.yml and
    # requirements.txt (as "pytest test_cancer_at_home.py" implies).
    ROOT = Path(__file__).resolve().parent

    def test_config_exists(self):
        """config.yml is present in the project root."""
        assert (self.ROOT / "config.yml").exists()

    def test_requirements_exists(self):
        """requirements.txt is present in the project root."""
        assert (self.ROOT / "requirements.txt").exists()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TestBOINC:
    """Smoke tests for the BOINC client package."""

    def test_boinc_client_import(self):
        """The BOINCClient name is importable from backend.boinc."""
        from backend.boinc import BOINCClient

        assert BOINCClient is not None

    def test_boinc_task_submission(self):
        """Submitting a task yields a "wu_" id whose status is "pending"."""
        from backend.boinc import BOINCClient

        boinc = BOINCClient()
        work_unit_id = boinc.submit_task("test_task", "test_input.txt")

        # The submission must hand back an identifier with the expected prefix.
        assert work_unit_id is not None
        assert work_unit_id.startswith("wu_")

        # The freshly submitted task must be retrievable and not yet started.
        submitted = boinc.get_task_status(work_unit_id)
        assert submitted is not None
        assert submitted.status == "pending"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class TestGDC:
    """Smoke tests for the GDC client package."""

    def test_gdc_client_import(self):
        """The GDCClient name is importable from backend.gdc."""
        from backend.gdc import GDCClient

        assert GDCClient is not None

    def test_gdc_client_initialization(self):
        """A default-constructed client points at the public GDC API URL."""
        from backend.gdc import GDCClient

        expected_url = "https://api.gdc.cancer.gov"
        gdc = GDCClient()
        assert gdc.api_url == expected_url
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class TestPipeline:
    """Import checks for the bioinformatics pipeline components."""

    def test_fastq_processor_import(self):
        """FASTQProcessor is importable from backend.pipeline."""
        from backend.pipeline import FASTQProcessor as processor_cls

        assert processor_cls is not None

    def test_blast_runner_import(self):
        """BLASTRunner is importable from backend.pipeline."""
        from backend.pipeline import BLASTRunner as runner_cls

        assert runner_cls is not None

    def test_variant_caller_import(self):
        """VariantCaller is importable from backend.pipeline."""
        from backend.pipeline import VariantCaller as caller_cls

        assert caller_cls is not None
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class TestNeo4j:
    """Import checks for the Neo4j integration package."""

    def test_db_manager_import(self):
        """DatabaseManager is importable from backend.neo4j."""
        from backend.neo4j import DatabaseManager as manager_cls

        assert manager_cls is not None

    def test_repositories_import(self):
        """All four repository classes are importable from backend.neo4j."""
        from backend.neo4j import GeneRepository, MutationRepository
        from backend.neo4j import PatientRepository, CancerTypeRepository

        # Checked in the same order as they are imported.
        for repository_cls in (
            GeneRepository,
            MutationRepository,
            PatientRepository,
            CancerTypeRepository,
        ):
            assert repository_cls is not None
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class TestAPI:
    """Checks on the application object exported by backend.api."""

    def test_api_import(self):
        """The app object is importable from backend.api."""
        from backend.api import app as application

        assert application is not None

    def test_api_title(self):
        """The app advertises the expected title and version."""
        from backend.api import app as application

        expected_title = "Cancer@Home v2"
        expected_version = "2.0.0"
        assert application.title == expected_title
        assert application.version == expected_version
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class TestDirectoryStructure:
    """Check that the expected packages, frontend and docs are present.

    Paths are resolved against the directory containing this test file rather
    than the current working directory, so the result does not depend on
    where pytest is launched from. Multi-entry checks report every missing
    path at once instead of stopping at the first one.
    """

    # Assumes this test file lives in the project root next to backend/ and
    # frontend/ (as "pytest test_cancer_at_home.py" implies).
    ROOT = Path(__file__).resolve().parent

    def test_backend_exists(self):
        """The backend package directory and its __init__.py exist."""
        assert (self.ROOT / "backend").exists()
        assert (self.ROOT / "backend" / "__init__.py").exists()

    def test_modules_exist(self):
        """Every backend sub-package exists and has an __init__.py."""
        modules = [
            "backend/api",
            "backend/boinc",
            "backend/gdc",
            "backend/neo4j",
            "backend/pipeline",
        ]

        # Each module needs both the directory and its package marker.
        required = [
            relative
            for module in modules
            for relative in (module, f"{module}/__init__.py")
        ]
        missing = [
            relative for relative in required
            if not (self.ROOT / relative).exists()
        ]
        assert not missing, f"missing backend paths: {missing}"

    def test_frontend_exists(self):
        """The frontend directory and its index.html exist."""
        assert (self.ROOT / "frontend").exists()
        assert (self.ROOT / "frontend" / "index.html").exists()

    def test_documentation_exists(self):
        """All expected documentation files exist in the project root."""
        docs = [
            "README.md",
            "QUICKSTART.md",
            "USER_GUIDE.md",
            "GRAPHQL_EXAMPLES.md",
            "PROJECT_SUMMARY.md",
        ]

        missing = [doc for doc in docs if not (self.ROOT / doc).exists()]
        assert not missing, f"missing documentation files: {missing}"
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class TestSetupScripts:
    """Check that the setup, run and Docker files are present.

    Paths are resolved against the directory containing this test file rather
    than the current working directory, so the result does not depend on
    where pytest is launched from.
    """

    # Assumes this test file lives in the project root next to the scripts
    # (as "pytest test_cancer_at_home.py" implies).
    ROOT = Path(__file__).resolve().parent

    def test_setup_scripts_exist(self):
        """Both the PowerShell and the shell setup scripts exist."""
        assert (self.ROOT / "setup.ps1").exists()
        assert (self.ROOT / "setup.sh").exists()

    def test_run_script_exists(self):
        """The run.py launcher exists."""
        assert (self.ROOT / "run.py").exists()

    def test_docker_compose_exists(self):
        """The docker-compose.yml file exists."""
        assert (self.ROOT / "docker-compose.yml").exists()
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly ends
    # with a non-zero status when any test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
|