SpatialAI_MCP / src /mcp_server /documentation_generator_simple.py
avaliev's picture
Demo Deployment - 0.0.1 version
c75526e verified
#!/usr/bin/env python3
"""
Simple Documentation Generator for OpenProblems MCP Server
Generates curated documentation for:
- Nextflow best practices
- Viash components
- OpenProblems guidelines
- Docker patterns
- Spatial workflow templates
"""
import asyncio
import json
from pathlib import Path
from typing import Dict
class DocumentationGenerator:
def __init__(self, cache_dir: str = "data/docs_cache"):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
async def generate_all_documentation(self) -> Dict[str, str]:
"""Generate comprehensive curated documentation."""
print("πŸ“š Generating curated documentation for OpenProblems MCP Server...")
documentation = {
"nextflow": self._generate_nextflow_docs(),
"viash": self._generate_viash_docs(),
"openproblems": self._generate_openproblems_docs(),
"docker": self._generate_docker_docs(),
"spatial_templates": self._generate_spatial_templates()
}
# Save to cache
print("πŸ”„ Saving documentation to cache...")
await self._save_documentation_cache(documentation)
return documentation
def _generate_nextflow_docs(self) -> str:
"""Generate Nextflow documentation."""
return """# Nextflow DSL2 Best Practices Guide
## Overview
Nextflow enables scalable and reproducible scientific workflows using software containers.
## Essential DSL2 Patterns
### Basic Pipeline Structure
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
params.input = './data/*.h5ad'
params.output_dir = './results'
workflow {
input_ch = Channel.fromPath(params.input)
PROCESS_NAME(input_ch)
}
```
### Process Definition
```nextflow
process SPATIAL_ANALYSIS {
tag "$sample_id"
label 'process_medium'
container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
publishDir "${params.output_dir}/analysis", mode: 'copy'
input:
tuple val(sample_id), path(spatial_data)
output:
tuple val(sample_id), path("${sample_id}_analyzed.h5ad"), emit: analyzed
path "${sample_id}_metrics.json", emit: metrics
script:
\"\"\"
#!/usr/bin/env python
import scanpy as sc
import json
adata = sc.read_h5ad('${spatial_data}')
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.write('${sample_id}_analyzed.h5ad')
metrics = {'n_cells': adata.n_obs, 'n_genes': adata.n_vars}
with open('${sample_id}_metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
\"\"\"
}
```
## Resource Management
```nextflow
process {
withLabel: 'process_low' {
cpus = 2
memory = '4.GB'
time = '1.h'
}
withLabel: 'process_medium' {
cpus = 4
memory = '8.GB'
time = '2.h'
}
withLabel: 'process_high' {
cpus = 8
memory = '16.GB'
time = '4.h'
}
}
docker {
enabled = true
runOptions = '-u $(id -u):$(id -g)'
}
```
## Error Handling
```nextflow
process ROBUST_PROCESS {
errorStrategy 'retry'
maxRetries 3
script:
\"\"\"
set -euo pipefail
# Your analysis code here
\"\"\"
}
```
## Common Issues and Solutions
1. **Out of Memory**: Increase memory allocation
2. **File Not Found**: Check file paths and staging
3. **Container Issues**: Verify container accessibility
4. **Process Hanging**: Check resource requirements
"""
def _generate_viash_docs(self) -> str:
"""Generate Viash documentation."""
return """# Viash Component Architecture Guide
## Overview
Viash enables building reusable, portable components across Docker, native, and Nextflow platforms.
## Component Structure
### Configuration File (config.vsh.yaml)
```yaml
name: "spatial_qc"
description: "Spatial transcriptomics quality control component"
argument_groups:
- name: "Input/Output"
arguments:
- name: "--input"
type: "file"
description: "Input spatial data (h5ad format)"
required: true
- name: "--output"
type: "file"
direction: "output"
description: "Output filtered data"
required: true
- name: "Parameters"
arguments:
- name: "--min_genes"
type: "integer"
description: "Minimum genes per cell"
default: 200
resources:
- type: "python_script"
path: "script.py"
platforms:
- type: "docker"
image: "quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0"
- type: "nextflow"
```
### Script Implementation
```python
import argparse
import scanpy as sc
import json
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--min_genes', type=int, default=200)
args = parser.parse_args()
adata = sc.read_h5ad(args.input)
sc.pp.filter_cells(adata, min_genes=args.min_genes)
adata.write(args.output)
```
## Development Workflow
```bash
# Build component
viash build config.vsh.yaml -p docker
# Test component
viash test config.vsh.yaml
# Build for Nextflow
viash build config.vsh.yaml -p nextflow -o target/nextflow/
```
## Best Practices
1. **Single Responsibility**: Each component should do one thing well
2. **Clear Interfaces**: Well-defined inputs and outputs
3. **Comprehensive Testing**: Unit tests for all functionality
4. **Documentation**: Clear descriptions and examples
"""
def _generate_openproblems_docs(self) -> str:
"""Generate OpenProblems documentation."""
return """# OpenProblems Framework Guide
## Overview
OpenProblems is a community effort to benchmark single-cell and spatial transcriptomics methods.
## Project Architecture
### Repository Structure
```
src/
β”œβ”€β”€ tasks/ # Benchmark tasks
β”‚ β”œβ”€β”€ spatial_decomposition/
β”‚ β”‚ β”œβ”€β”€ methods/ # Benchmark methods
β”‚ β”‚ β”œβ”€β”€ metrics/ # Evaluation metrics
β”‚ β”‚ └── datasets/ # Task datasets
β”‚ └── other_tasks/
β”œβ”€β”€ common/ # Shared components
└── workflows/ # Nextflow workflows
```
### Component Types
#### Dataset Components
Load benchmark datasets with standardized formats.
#### Method Components
Implement spatial analysis methods following OpenProblems standards.
#### Metric Components
Evaluate method performance with standardized metrics.
## Data Formats
### AnnData Structure
```python
import anndata as ad
# Spatial data structure
adata_spatial = ad.read_h5ad('spatial_data.h5ad')
# adata_spatial.X: expression matrix
# adata_spatial.obs: spot metadata
# adata_spatial.var: gene metadata
# adata_spatial.obsm['spatial']: spatial coordinates
# Reference single-cell data
adata_reference = ad.read_h5ad('reference_data.h5ad')
# adata_reference.obs['cell_type']: cell type annotations
```
### Standard Metadata Fields
- **Cell types**: obs['cell_type']
- **Spatial coordinates**: obsm['spatial']
- **Batch information**: obs['batch']
## Best Practices
- Follow OpenProblems naming conventions
- Use standard data formats (AnnData h5ad)
- Include comprehensive documentation
- Ensure reproducibility across platforms
"""
def _generate_docker_docs(self) -> str:
"""Generate Docker documentation."""
return """# Docker Best Practices for Bioinformatics
## Multi-stage Builds
### Optimized Python Environment
```dockerfile
# Build stage
FROM python:3.9-slim as builder
WORKDIR /build
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt
# Production stage
FROM python:3.9-slim
COPY --from=builder /root/.local /root/.local
RUN apt-get update && apt-get install -y procps
WORKDIR /app
```
### Bioinformatics Stack
```dockerfile
FROM python:3.9-slim
RUN apt-get update && apt-get install -y --no-install-recommends \\
libhdf5-dev \\
libblas-dev \\
liblapack-dev \\
&& rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir \\
scanpy>=1.9.0 \\
anndata>=0.8.0 \\
pandas>=1.5.0 \\
numpy>=1.21.0
WORKDIR /app
```
### OpenProblems Compatible Container
```dockerfile
FROM python:3.9-slim
RUN apt-get update && apt-get install -y procps
RUN pip install --no-cache-dir scanpy anndata pandas numpy
# Create non-root user for Nextflow
RUN groupadd -g 1000 nextflow && \\
useradd -u 1000 -g nextflow nextflow
USER nextflow
WORKDIR /app
ENTRYPOINT ["python"]
```
## Best Practices
- Use specific versions for reproducibility
- Use minimal base images
- Create non-root users
- Combine RUN commands to reduce layers
- Use health checks for services
- Set appropriate resource limits
"""
def _generate_spatial_templates(self) -> str:
"""Generate spatial workflow templates."""
return """# Spatial Transcriptomics Pipeline Templates
## 1. Quality Control Workflow
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
params.input_pattern = "*.h5ad"
params.output_dir = "./results"
params.min_genes_per_cell = 200
process SPATIAL_QC {
tag "$sample_id"
label 'process_medium'
container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
publishDir "${params.output_dir}/qc", mode: 'copy'
input:
tuple val(sample_id), path(spatial_data)
output:
tuple val(sample_id), path("${sample_id}_qc.h5ad"), emit: filtered_data
path "${sample_id}_metrics.json", emit: metrics
script:
\"\"\"
#!/usr/bin/env python
import scanpy as sc
import json
adata = sc.read_h5ad('${spatial_data}')
# QC metrics
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
# Filter cells and genes
sc.pp.filter_cells(adata, min_genes=${params.min_genes_per_cell})
sc.pp.filter_genes(adata, min_cells=3)
adata.write('${sample_id}_qc.h5ad')
metrics = {
'sample_id': '${sample_id}',
'n_cells': int(adata.n_obs),
'n_genes': int(adata.n_vars)
}
with open('${sample_id}_metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
\"\"\"
}
workflow {
input_ch = Channel.fromPath(params.input_pattern)
.map { file -> [file.baseName, file] }
SPATIAL_QC(input_ch)
}
```
## 2. Spatial Decomposition Pipeline
```nextflow
process SPATIAL_DECOMPOSITION {
tag "$sample_id"
label 'process_high'
container 'openproblems/spatial-decomposition:latest'
input:
tuple val(sample_id), path(spatial_data), path(reference_data)
output:
tuple val(sample_id), path("${sample_id}_decomposition.h5ad"), emit: results
path "${sample_id}_proportions.csv", emit: proportions
script:
\"\"\"
#!/usr/bin/env python
import anndata as ad
import pandas as pd
import numpy as np
# Load data
adata_spatial = ad.read_h5ad('${spatial_data}')
adata_reference = ad.read_h5ad('${reference_data}')
# Find common genes
common_genes = adata_spatial.var_names.intersection(adata_reference.var_names)
adata_spatial = adata_spatial[:, common_genes].copy()
adata_reference = adata_reference[:, common_genes].copy()
# Get cell types
cell_types = adata_reference.obs['cell_type'].unique()
# Placeholder decomposition (replace with actual method)
n_spots = adata_spatial.n_obs
n_cell_types = len(cell_types)
proportions_matrix = np.random.dirichlet(np.ones(n_cell_types), size=n_spots)
# Create proportions DataFrame
proportions_df = pd.DataFrame(
proportions_matrix,
columns=cell_types,
index=adata_spatial.obs_names
)
proportions_df.to_csv('${sample_id}_proportions.csv')
# Add proportions to spatial data
for cell_type in cell_types:
adata_spatial.obs[f'prop_{cell_type}'] = proportions_df[cell_type].values
adata_spatial.write('${sample_id}_decomposition.h5ad')
\"\"\"
}
```
## 3. Configuration Template
```nextflow
// nextflow.config
params {
input_dir = './data'
output_dir = './results'
reference_data = './reference/atlas.h5ad'
}
process {
withLabel: 'process_medium' {
cpus = 4
memory = '8.GB'
time = '2.h'
}
withLabel: 'process_high' {
cpus = 8
memory = '16.GB'
time = '4.h'
}
}
docker {
enabled = true
runOptions = '-u $(id -u):$(id -g)'
}
```
This provides:
1. **Production-ready QC pipeline** with filtering and reporting
2. **Spatial decomposition workflow** with evaluation metrics
3. **Flexible configuration** for different environments
4. **Comprehensive monitoring** and resource tracking
"""
async def _save_documentation_cache(self, documentation: Dict[str, str]):
"""Save documentation to cache files."""
for source, content in documentation.items():
cache_file = self.cache_dir / f"{source}_docs.md"
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(content)
print(f" πŸ’Ύ Cached {source} documentation ({len(content):,} chars)")
async def load_cached_documentation(self) -> Dict[str, str]:
"""Load documentation from cache if available."""
documentation = {}
for source in ["nextflow", "viash", "openproblems", "docker", "spatial_templates"]:
cache_file = self.cache_dir / f"{source}_docs.md"
if cache_file.exists():
with open(cache_file, 'r', encoding='utf-8') as f:
documentation[source] = f.read()
return documentation
async def main():
"""Main function to generate and cache documentation."""
print("πŸ“š OpenProblems Documentation Generator")
print("=" * 50)
generator = DocumentationGenerator()
print("πŸ”„ Generating curated documentation...")
documentation = await generator.generate_all_documentation()
print(f"\nπŸ“Š Documentation generation complete!")
total_chars = 0
for source, content in documentation.items():
chars = len(content)
total_chars += chars
print(f" βœ… {source}: {chars:,} characters")
print(f"\nπŸŽ‰ Total: {total_chars:,} characters of documentation cached!")
print(" πŸ’Ύ Documentation saved to: data/docs_cache/")
print(" πŸ”— Now available via MCP Resources in your server")
return documentation
if __name__ == "__main__":
asyncio.run(main())