Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Simple Documentation Generator for OpenProblems MCP Server | |
| Generates curated documentation for: | |
| - Nextflow best practices | |
| - Viash components | |
| - OpenProblems guidelines | |
| - Docker patterns | |
| - Spatial workflow templates | |
| """ | |
| import asyncio | |
| import json | |
| from pathlib import Path | |
| from typing import Dict | |
class DocumentationGenerator:
    """Build curated Markdown guides and cache them as files on disk.

    Each guide is a static Markdown string produced by one of the private
    ``_generate_*`` methods.  ``generate_all_documentation`` renders every
    guide and writes each one to ``<cache_dir>/<source>_docs.md``;
    ``load_cached_documentation`` reads those files back.

    The public coroutines perform ordinary blocking file I/O; they are
    ``async`` only so callers can ``await`` them.

    NOTE(review): several status messages start with glyphs such as ``π``
    that look like mis-decoded emoji.  They are kept as found because the
    intended characters cannot be recovered from this text; confirm against
    the upstream file.
    """

    # Single registry of documentation sources: cache key -> name of the
    # method that renders it.  Generation and cache loading both iterate this
    # mapping so the two can never disagree about which sources exist.
    _GENERATORS: Dict[str, str] = {
        "nextflow": "_generate_nextflow_docs",
        "viash": "_generate_viash_docs",
        "openproblems": "_generate_openproblems_docs",
        "docker": "_generate_docker_docs",
        "spatial_templates": "_generate_spatial_templates",
    }

    def __init__(self, cache_dir: str = "data/docs_cache"):
        """Remember the cache directory and create it if it is missing.

        Args:
            cache_dir: Directory that will hold the ``*_docs.md`` files.
                Missing parent directories are created as well.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _cache_path(self, source: str) -> Path:
        """Return the cache file path used for ``source``."""
        return self.cache_dir / f"{source}_docs.md"

    async def generate_all_documentation(self) -> Dict[str, str]:
        """Generate comprehensive curated documentation.

        Returns:
            Mapping of source name to Markdown text, in registry order.  The
            same content is written to the cache directory before returning.
        """
        print("π Generating curated documentation for OpenProblems MCP Server...")
        documentation = {
            source: getattr(self, method_name)()
            for source, method_name in self._GENERATORS.items()
        }

        # Save to cache
        print("π Saving documentation to cache...")
        await self._save_documentation_cache(documentation)
        return documentation

    def _generate_nextflow_docs(self) -> str:
        """Generate Nextflow documentation."""
        return """# Nextflow DSL2 Best Practices Guide

## Overview
Nextflow enables scalable and reproducible scientific workflows using software containers.

## Essential DSL2 Patterns

### Basic Pipeline Structure
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

params.input = './data/*.h5ad'
params.output_dir = './results'

workflow {
    input_ch = Channel.fromPath(params.input)
    PROCESS_NAME(input_ch)
}
```

### Process Definition
```nextflow
process SPATIAL_ANALYSIS {
    tag "$sample_id"
    label 'process_medium'
    container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
    publishDir "${params.output_dir}/analysis", mode: 'copy'

    input:
    tuple val(sample_id), path(spatial_data)

    output:
    tuple val(sample_id), path("${sample_id}_analyzed.h5ad"), emit: analyzed
    path "${sample_id}_metrics.json", emit: metrics

    script:
    \"\"\"
    #!/usr/bin/env python
    import scanpy as sc
    import json

    adata = sc.read_h5ad('${spatial_data}')
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    adata.write('${sample_id}_analyzed.h5ad')

    metrics = {'n_cells': adata.n_obs, 'n_genes': adata.n_vars}
    with open('${sample_id}_metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    \"\"\"
}
```

## Resource Management
```nextflow
process {
    withLabel: 'process_low' {
        cpus = 2
        memory = '4.GB'
        time = '1.h'
    }
    withLabel: 'process_medium' {
        cpus = 4
        memory = '8.GB'
        time = '2.h'
    }
    withLabel: 'process_high' {
        cpus = 8
        memory = '16.GB'
        time = '4.h'
    }
}

docker {
    enabled = true
    runOptions = '-u $(id -u):$(id -g)'
}
```

## Error Handling
```nextflow
process ROBUST_PROCESS {
    errorStrategy 'retry'
    maxRetries 3

    script:
    \"\"\"
    set -euo pipefail
    # Your analysis code here
    \"\"\"
}
```

## Common Issues and Solutions
1. **Out of Memory**: Increase memory allocation
2. **File Not Found**: Check file paths and staging
3. **Container Issues**: Verify container accessibility
4. **Process Hanging**: Check resource requirements
"""

    def _generate_viash_docs(self) -> str:
        """Generate Viash documentation."""
        return """# Viash Component Architecture Guide

## Overview
Viash enables building reusable, portable components across Docker, native, and Nextflow platforms.

## Component Structure

### Configuration File (config.vsh.yaml)
```yaml
name: "spatial_qc"
description: "Spatial transcriptomics quality control component"

argument_groups:
  - name: "Input/Output"
    arguments:
      - name: "--input"
        type: "file"
        description: "Input spatial data (h5ad format)"
        required: true
      - name: "--output"
        type: "file"
        direction: "output"
        description: "Output filtered data"
        required: true
  - name: "Parameters"
    arguments:
      - name: "--min_genes"
        type: "integer"
        description: "Minimum genes per cell"
        default: 200

resources:
  - type: "python_script"
    path: "script.py"

platforms:
  - type: "docker"
    image: "quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0"
  - type: "nextflow"
```

### Script Implementation
```python
import argparse
import scanpy as sc
import json

parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--min_genes', type=int, default=200)
args = parser.parse_args()

adata = sc.read_h5ad(args.input)
sc.pp.filter_cells(adata, min_genes=args.min_genes)
adata.write(args.output)
```

## Development Workflow
```bash
# Build component
viash build config.vsh.yaml -p docker

# Test component
viash test config.vsh.yaml

# Build for Nextflow
viash build config.vsh.yaml -p nextflow -o target/nextflow/
```

## Best Practices
1. **Single Responsibility**: Each component should do one thing well
2. **Clear Interfaces**: Well-defined inputs and outputs
3. **Comprehensive Testing**: Unit tests for all functionality
4. **Documentation**: Clear descriptions and examples
"""

    def _generate_openproblems_docs(self) -> str:
        """Generate OpenProblems documentation."""
        return """# OpenProblems Framework Guide

## Overview
OpenProblems is a community effort to benchmark single-cell and spatial transcriptomics methods.

## Project Architecture

### Repository Structure
```
src/
├── tasks/                      # Benchmark tasks
│   ├── spatial_decomposition/
│   │   ├── methods/            # Benchmark methods
│   │   ├── metrics/            # Evaluation metrics
│   │   └── datasets/           # Task datasets
│   └── other_tasks/
├── common/                     # Shared components
└── workflows/                  # Nextflow workflows
```

### Component Types

#### Dataset Components
Load benchmark datasets with standardized formats.

#### Method Components
Implement spatial analysis methods following OpenProblems standards.

#### Metric Components
Evaluate method performance with standardized metrics.

## Data Formats

### AnnData Structure
```python
import anndata as ad

# Spatial data structure
adata_spatial = ad.read_h5ad('spatial_data.h5ad')
# adata_spatial.X: expression matrix
# adata_spatial.obs: spot metadata
# adata_spatial.var: gene metadata
# adata_spatial.obsm['spatial']: spatial coordinates

# Reference single-cell data
adata_reference = ad.read_h5ad('reference_data.h5ad')
# adata_reference.obs['cell_type']: cell type annotations
```

### Standard Metadata Fields
- **Cell types**: obs['cell_type']
- **Spatial coordinates**: obsm['spatial']
- **Batch information**: obs['batch']

## Best Practices
- Follow OpenProblems naming conventions
- Use standard data formats (AnnData h5ad)
- Include comprehensive documentation
- Ensure reproducibility across platforms
"""

    def _generate_docker_docs(self) -> str:
        """Generate Docker documentation."""
        # The pip requirement specifiers below are quoted on purpose: in a
        # shell-form RUN an unquoted `pkg>=1.0` is parsed as a redirection to
        # a file named `=1.0`, which silently drops the version constraint.
        return """# Docker Best Practices for Bioinformatics

## Multi-stage Builds

### Optimized Python Environment
```dockerfile
# Build stage
FROM python:3.9-slim as builder
WORKDIR /build
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# Production stage
FROM python:3.9-slim
COPY --from=builder /root/.local /root/.local
RUN apt-get update && apt-get install -y procps
WORKDIR /app
```

### Bioinformatics Stack
```dockerfile
FROM python:3.9-slim

RUN apt-get update && apt-get install -y --no-install-recommends \\
    libhdf5-dev \\
    libblas-dev \\
    liblapack-dev \\
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir \\
    "scanpy>=1.9.0" \\
    "anndata>=0.8.0" \\
    "pandas>=1.5.0" \\
    "numpy>=1.21.0"

WORKDIR /app
```

### OpenProblems Compatible Container
```dockerfile
FROM python:3.9-slim

RUN apt-get update && apt-get install -y procps
RUN pip install --no-cache-dir scanpy anndata pandas numpy

# Create non-root user for Nextflow
RUN groupadd -g 1000 nextflow && \\
    useradd -u 1000 -g nextflow nextflow

USER nextflow
WORKDIR /app
ENTRYPOINT ["python"]
```

## Best Practices
- Use specific versions for reproducibility
- Use minimal base images
- Create non-root users
- Combine RUN commands to reduce layers
- Use health checks for services
- Set appropriate resource limits
"""

    def _generate_spatial_templates(self) -> str:
        """Generate spatial workflow templates."""
        return """# Spatial Transcriptomics Pipeline Templates

## 1. Quality Control Workflow
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

params.input_pattern = "*.h5ad"
params.output_dir = "./results"
params.min_genes_per_cell = 200

process SPATIAL_QC {
    tag "$sample_id"
    label 'process_medium'
    container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
    publishDir "${params.output_dir}/qc", mode: 'copy'

    input:
    tuple val(sample_id), path(spatial_data)

    output:
    tuple val(sample_id), path("${sample_id}_qc.h5ad"), emit: filtered_data
    path "${sample_id}_metrics.json", emit: metrics

    script:
    \"\"\"
    #!/usr/bin/env python
    import scanpy as sc
    import json

    adata = sc.read_h5ad('${spatial_data}')

    # QC metrics
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

    # Filter cells and genes
    sc.pp.filter_cells(adata, min_genes=${params.min_genes_per_cell})
    sc.pp.filter_genes(adata, min_cells=3)

    adata.write('${sample_id}_qc.h5ad')

    metrics = {
        'sample_id': '${sample_id}',
        'n_cells': int(adata.n_obs),
        'n_genes': int(adata.n_vars)
    }
    with open('${sample_id}_metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    \"\"\"
}

workflow {
    input_ch = Channel.fromPath(params.input_pattern)
        .map { file -> [file.baseName, file] }
    SPATIAL_QC(input_ch)
}
```

## 2. Spatial Decomposition Pipeline
```nextflow
process SPATIAL_DECOMPOSITION {
    tag "$sample_id"
    label 'process_high'
    container 'openproblems/spatial-decomposition:latest'

    input:
    tuple val(sample_id), path(spatial_data), path(reference_data)

    output:
    tuple val(sample_id), path("${sample_id}_decomposition.h5ad"), emit: results
    path "${sample_id}_proportions.csv", emit: proportions

    script:
    \"\"\"
    #!/usr/bin/env python
    import anndata as ad
    import pandas as pd
    import numpy as np

    # Load data
    adata_spatial = ad.read_h5ad('${spatial_data}')
    adata_reference = ad.read_h5ad('${reference_data}')

    # Find common genes
    common_genes = adata_spatial.var_names.intersection(adata_reference.var_names)
    adata_spatial = adata_spatial[:, common_genes].copy()
    adata_reference = adata_reference[:, common_genes].copy()

    # Get cell types
    cell_types = adata_reference.obs['cell_type'].unique()

    # Placeholder decomposition (replace with actual method)
    n_spots = adata_spatial.n_obs
    n_cell_types = len(cell_types)
    proportions_matrix = np.random.dirichlet(np.ones(n_cell_types), size=n_spots)

    # Create proportions DataFrame
    proportions_df = pd.DataFrame(
        proportions_matrix,
        columns=cell_types,
        index=adata_spatial.obs_names
    )
    proportions_df.to_csv('${sample_id}_proportions.csv')

    # Add proportions to spatial data
    for cell_type in cell_types:
        adata_spatial.obs[f'prop_{cell_type}'] = proportions_df[cell_type].values

    adata_spatial.write('${sample_id}_decomposition.h5ad')
    \"\"\"
}
```

## 3. Configuration Template
```nextflow
// nextflow.config
params {
    input_dir = './data'
    output_dir = './results'
    reference_data = './reference/atlas.h5ad'
}

process {
    withLabel: 'process_medium' {
        cpus = 4
        memory = '8.GB'
        time = '2.h'
    }
    withLabel: 'process_high' {
        cpus = 8
        memory = '16.GB'
        time = '4.h'
    }
}

docker {
    enabled = true
    runOptions = '-u $(id -u):$(id -g)'
}
```

This provides:
1. **Production-ready QC pipeline** with filtering and reporting
2. **Spatial decomposition workflow** with evaluation metrics
3. **Flexible configuration** for different environments
4. **Comprehensive monitoring** and resource tracking
"""

    async def _save_documentation_cache(self, documentation: Dict[str, str]):
        """Save documentation to cache files.

        Args:
            documentation: Mapping of source name to Markdown text.  Each
                entry is written as UTF-8 to ``<cache_dir>/<source>_docs.md``,
                overwriting any existing file.
        """
        for source, content in documentation.items():
            self._cache_path(source).write_text(content, encoding="utf-8")
            print(f" 💾 Cached {source} documentation ({len(content):,} chars)")

    async def load_cached_documentation(self) -> Dict[str, str]:
        """Load documentation from cache if available.

        Returns:
            Mapping of source name to cached Markdown text.  Sources without
            a cache file are omitted, so the result is empty when nothing has
            been generated yet.
        """
        documentation: Dict[str, str] = {}
        for source in self._GENERATORS:
            try:
                documentation[source] = self._cache_path(source).read_text(
                    encoding="utf-8"
                )
            except FileNotFoundError:
                # Not cached yet (or removed in the meantime): skip it.
                continue
        return documentation
async def main() -> Dict[str, str]:
    """Generate all documentation, cache it, and print a size summary.

    Returns:
        The mapping of source name to Markdown text produced by
        ``DocumentationGenerator.generate_all_documentation``.
    """
    # NOTE(review): the leading glyphs in several messages below ("π", "β")
    # look like mis-decoded emoji; they are kept as found because the intended
    # characters cannot be recovered here. Confirm against the upstream file.
    print("π OpenProblems Documentation Generator")
    print("=" * 50)

    generator = DocumentationGenerator()

    print("π Generating curated documentation...")
    documentation = await generator.generate_all_documentation()

    print("\nπ Documentation generation complete!")
    total_chars = 0
    for source, content in documentation.items():
        chars = len(content)
        total_chars += chars
        print(f" β {source}: {chars:,} characters")

    print(f"\nπ Total: {total_chars:,} characters of documentation cached!")
    # Report the directory the generator actually wrote to instead of
    # repeating its default as a literal; as_posix() keeps forward slashes.
    print(f" 💾 Documentation saved to: {generator.cache_dir.as_posix()}/")
    print(" π Now available via MCP Resources in your server")
    return documentation


if __name__ == "__main__":
    asyncio.run(main())