#!/usr/bin/env python3 """ Simple Documentation Generator for OpenProblems MCP Server Generates curated documentation for: - Nextflow best practices - Viash components - OpenProblems guidelines - Docker patterns - Spatial workflow templates """ import asyncio import json from pathlib import Path from typing import Dict class DocumentationGenerator: def __init__(self, cache_dir: str = "data/docs_cache"): self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(parents=True, exist_ok=True) async def generate_all_documentation(self) -> Dict[str, str]: """Generate comprehensive curated documentation.""" print("šŸ“š Generating curated documentation for OpenProblems MCP Server...") documentation = { "nextflow": self._generate_nextflow_docs(), "viash": self._generate_viash_docs(), "openproblems": self._generate_openproblems_docs(), "docker": self._generate_docker_docs(), "spatial_templates": self._generate_spatial_templates() } # Save to cache print("šŸ”„ Saving documentation to cache...") await self._save_documentation_cache(documentation) return documentation def _generate_nextflow_docs(self) -> str: """Generate Nextflow documentation.""" return """# Nextflow DSL2 Best Practices Guide ## Overview Nextflow enables scalable and reproducible scientific workflows using software containers. ## Essential DSL2 Patterns ### Basic Pipeline Structure ```nextflow #!/usr/bin/env nextflow nextflow.enable.dsl=2 params.input = './data/*.h5ad' params.output_dir = './results' workflow { input_ch = Channel.fromPath(params.input) PROCESS_NAME(input_ch) } ``` ### Process Definition ```nextflow process SPATIAL_ANALYSIS { tag "$sample_id" label 'process_medium' container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0' publishDir "${params.output_dir}/analysis", mode: 'copy' input: tuple val(sample_id), path(spatial_data) output: tuple val(sample_id), path("${sample_id}_analyzed.h5ad"), emit: analyzed path "${sample_id}_metrics.json", emit: metrics script: \"\"\" #!/usr/bin/env python import scanpy as sc import json adata = sc.read_h5ad('${spatial_data}') sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) adata.write('${sample_id}_analyzed.h5ad') metrics = {'n_cells': adata.n_obs, 'n_genes': adata.n_vars} with open('${sample_id}_metrics.json', 'w') as f: json.dump(metrics, f, indent=2) \"\"\" } ``` ## Resource Management ```nextflow process { withLabel: 'process_low' { cpus = 2 memory = '4.GB' time = '1.h' } withLabel: 'process_medium' { cpus = 4 memory = '8.GB' time = '2.h' } withLabel: 'process_high' { cpus = 8 memory = '16.GB' time = '4.h' } } docker { enabled = true runOptions = '-u $(id -u):$(id -g)' } ``` ## Error Handling ```nextflow process ROBUST_PROCESS { errorStrategy 'retry' maxRetries 3 script: \"\"\" set -euo pipefail # Your analysis code here \"\"\" } ``` ## Common Issues and Solutions 1. **Out of Memory**: Increase memory allocation 2. **File Not Found**: Check file paths and staging 3. **Container Issues**: Verify container accessibility 4. **Process Hanging**: Check resource requirements """ def _generate_viash_docs(self) -> str: """Generate Viash documentation.""" return """# Viash Component Architecture Guide ## Overview Viash enables building reusable, portable components across Docker, native, and Nextflow platforms. ## Component Structure ### Configuration File (config.vsh.yaml) ```yaml name: "spatial_qc" description: "Spatial transcriptomics quality control component" argument_groups: - name: "Input/Output" arguments: - name: "--input" type: "file" description: "Input spatial data (h5ad format)" required: true - name: "--output" type: "file" direction: "output" description: "Output filtered data" required: true - name: "Parameters" arguments: - name: "--min_genes" type: "integer" description: "Minimum genes per cell" default: 200 resources: - type: "python_script" path: "script.py" platforms: - type: "docker" image: "quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0" - type: "nextflow" ``` ### Script Implementation ```python import argparse import scanpy as sc import json parser = argparse.ArgumentParser() parser.add_argument('--input', required=True) parser.add_argument('--output', required=True) parser.add_argument('--min_genes', type=int, default=200) args = parser.parse_args() adata = sc.read_h5ad(args.input) sc.pp.filter_cells(adata, min_genes=args.min_genes) adata.write(args.output) ``` ## Development Workflow ```bash # Build component viash build config.vsh.yaml -p docker # Test component viash test config.vsh.yaml # Build for Nextflow viash build config.vsh.yaml -p nextflow -o target/nextflow/ ``` ## Best Practices 1. **Single Responsibility**: Each component should do one thing well 2. **Clear Interfaces**: Well-defined inputs and outputs 3. **Comprehensive Testing**: Unit tests for all functionality 4. **Documentation**: Clear descriptions and examples """ def _generate_openproblems_docs(self) -> str: """Generate OpenProblems documentation.""" return """# OpenProblems Framework Guide ## Overview OpenProblems is a community effort to benchmark single-cell and spatial transcriptomics methods. ## Project Architecture ### Repository Structure ``` src/ ā”œā”€ā”€ tasks/ # Benchmark tasks │ ā”œā”€ā”€ spatial_decomposition/ │ │ ā”œā”€ā”€ methods/ # Benchmark methods │ │ ā”œā”€ā”€ metrics/ # Evaluation metrics │ │ └── datasets/ # Task datasets │ └── other_tasks/ ā”œā”€ā”€ common/ # Shared components └── workflows/ # Nextflow workflows ``` ### Component Types #### Dataset Components Load benchmark datasets with standardized formats. #### Method Components Implement spatial analysis methods following OpenProblems standards. #### Metric Components Evaluate method performance with standardized metrics. ## Data Formats ### AnnData Structure ```python import anndata as ad # Spatial data structure adata_spatial = ad.read_h5ad('spatial_data.h5ad') # adata_spatial.X: expression matrix # adata_spatial.obs: spot metadata # adata_spatial.var: gene metadata # adata_spatial.obsm['spatial']: spatial coordinates # Reference single-cell data adata_reference = ad.read_h5ad('reference_data.h5ad') # adata_reference.obs['cell_type']: cell type annotations ``` ### Standard Metadata Fields - **Cell types**: obs['cell_type'] - **Spatial coordinates**: obsm['spatial'] - **Batch information**: obs['batch'] ## Best Practices - Follow OpenProblems naming conventions - Use standard data formats (AnnData h5ad) - Include comprehensive documentation - Ensure reproducibility across platforms """ def _generate_docker_docs(self) -> str: """Generate Docker documentation.""" return """# Docker Best Practices for Bioinformatics ## Multi-stage Builds ### Optimized Python Environment ```dockerfile # Build stage FROM python:3.9-slim as builder WORKDIR /build COPY requirements.txt . RUN pip install --no-cache-dir --user -r requirements.txt # Production stage FROM python:3.9-slim COPY --from=builder /root/.local /root/.local RUN apt-get update && apt-get install -y procps WORKDIR /app ``` ### Bioinformatics Stack ```dockerfile FROM python:3.9-slim RUN apt-get update && apt-get install -y --no-install-recommends \\ libhdf5-dev \\ libblas-dev \\ liblapack-dev \\ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \\ scanpy>=1.9.0 \\ anndata>=0.8.0 \\ pandas>=1.5.0 \\ numpy>=1.21.0 WORKDIR /app ``` ### OpenProblems Compatible Container ```dockerfile FROM python:3.9-slim RUN apt-get update && apt-get install -y procps RUN pip install --no-cache-dir scanpy anndata pandas numpy # Create non-root user for Nextflow RUN groupadd -g 1000 nextflow && \\ useradd -u 1000 -g nextflow nextflow USER nextflow WORKDIR /app ENTRYPOINT ["python"] ``` ## Best Practices - Use specific versions for reproducibility - Use minimal base images - Create non-root users - Combine RUN commands to reduce layers - Use health checks for services - Set appropriate resource limits """ def _generate_spatial_templates(self) -> str: """Generate spatial workflow templates.""" return """# Spatial Transcriptomics Pipeline Templates ## 1. Quality Control Workflow ```nextflow #!/usr/bin/env nextflow nextflow.enable.dsl=2 params.input_pattern = "*.h5ad" params.output_dir = "./results" params.min_genes_per_cell = 200 process SPATIAL_QC { tag "$sample_id" label 'process_medium' container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0' publishDir "${params.output_dir}/qc", mode: 'copy' input: tuple val(sample_id), path(spatial_data) output: tuple val(sample_id), path("${sample_id}_qc.h5ad"), emit: filtered_data path "${sample_id}_metrics.json", emit: metrics script: \"\"\" #!/usr/bin/env python import scanpy as sc import json adata = sc.read_h5ad('${spatial_data}') # QC metrics adata.var['mt'] = adata.var_names.str.startswith('MT-') sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True) # Filter cells and genes sc.pp.filter_cells(adata, min_genes=${params.min_genes_per_cell}) sc.pp.filter_genes(adata, min_cells=3) adata.write('${sample_id}_qc.h5ad') metrics = { 'sample_id': '${sample_id}', 'n_cells': int(adata.n_obs), 'n_genes': int(adata.n_vars) } with open('${sample_id}_metrics.json', 'w') as f: json.dump(metrics, f, indent=2) \"\"\" } workflow { input_ch = Channel.fromPath(params.input_pattern) .map { file -> [file.baseName, file] } SPATIAL_QC(input_ch) } ``` ## 2. Spatial Decomposition Pipeline ```nextflow process SPATIAL_DECOMPOSITION { tag "$sample_id" label 'process_high' container 'openproblems/spatial-decomposition:latest' input: tuple val(sample_id), path(spatial_data), path(reference_data) output: tuple val(sample_id), path("${sample_id}_decomposition.h5ad"), emit: results path "${sample_id}_proportions.csv", emit: proportions script: \"\"\" #!/usr/bin/env python import anndata as ad import pandas as pd import numpy as np # Load data adata_spatial = ad.read_h5ad('${spatial_data}') adata_reference = ad.read_h5ad('${reference_data}') # Find common genes common_genes = adata_spatial.var_names.intersection(adata_reference.var_names) adata_spatial = adata_spatial[:, common_genes].copy() adata_reference = adata_reference[:, common_genes].copy() # Get cell types cell_types = adata_reference.obs['cell_type'].unique() # Placeholder decomposition (replace with actual method) n_spots = adata_spatial.n_obs n_cell_types = len(cell_types) proportions_matrix = np.random.dirichlet(np.ones(n_cell_types), size=n_spots) # Create proportions DataFrame proportions_df = pd.DataFrame( proportions_matrix, columns=cell_types, index=adata_spatial.obs_names ) proportions_df.to_csv('${sample_id}_proportions.csv') # Add proportions to spatial data for cell_type in cell_types: adata_spatial.obs[f'prop_{cell_type}'] = proportions_df[cell_type].values adata_spatial.write('${sample_id}_decomposition.h5ad') \"\"\" } ``` ## 3. Configuration Template ```nextflow // nextflow.config params { input_dir = './data' output_dir = './results' reference_data = './reference/atlas.h5ad' } process { withLabel: 'process_medium' { cpus = 4 memory = '8.GB' time = '2.h' } withLabel: 'process_high' { cpus = 8 memory = '16.GB' time = '4.h' } } docker { enabled = true runOptions = '-u $(id -u):$(id -g)' } ``` This provides: 1. **Production-ready QC pipeline** with filtering and reporting 2. **Spatial decomposition workflow** with evaluation metrics 3. **Flexible configuration** for different environments 4. **Comprehensive monitoring** and resource tracking """ async def _save_documentation_cache(self, documentation: Dict[str, str]): """Save documentation to cache files.""" for source, content in documentation.items(): cache_file = self.cache_dir / f"{source}_docs.md" with open(cache_file, 'w', encoding='utf-8') as f: f.write(content) print(f" šŸ’¾ Cached {source} documentation ({len(content):,} chars)") async def load_cached_documentation(self) -> Dict[str, str]: """Load documentation from cache if available.""" documentation = {} for source in ["nextflow", "viash", "openproblems", "docker", "spatial_templates"]: cache_file = self.cache_dir / f"{source}_docs.md" if cache_file.exists(): with open(cache_file, 'r', encoding='utf-8') as f: documentation[source] = f.read() return documentation async def main(): """Main function to generate and cache documentation.""" print("šŸ“š OpenProblems Documentation Generator") print("=" * 50) generator = DocumentationGenerator() print("šŸ”„ Generating curated documentation...") documentation = await generator.generate_all_documentation() print(f"\nšŸ“Š Documentation generation complete!") total_chars = 0 for source, content in documentation.items(): chars = len(content) total_chars += chars print(f" āœ… {source}: {chars:,} characters") print(f"\nšŸŽ‰ Total: {total_chars:,} characters of documentation cached!") print(" šŸ’¾ Documentation saved to: data/docs_cache/") print(" šŸ”— Now available via MCP Resources in your server") return documentation if __name__ == "__main__": asyncio.run(main())