Spaces:
Sleeping
Sleeping
Upload 328 files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +31 -0
- .env.template +64 -0
- .gitattributes +35 -0
- .gitignore +231 -0
- .gitmodules +9 -0
- .pre-commit-config.yaml +22 -0
- .python-version +1 -0
- .scider/SCIDER.md +11 -0
- .scider/skills/content-refinement-agent/SKILL.md +256 -0
- .scider/skills/content-refinement-agent/references/halt-rules.md +125 -0
- .scider/skills/content-refinement-agent/references/prompt.md +136 -0
- .scider/skills/content-refinement-agent/references/reviewer-rubric.md +131 -0
- .scider/skills/content-refinement-agent/references/safe-revision-rules.md +129 -0
- .scider/skills/content-refinement-agent/scripts/apply_worklog.py +94 -0
- .scider/skills/content-refinement-agent/scripts/score_delta.py +164 -0
- .scider/skills/content-refinement-agent/scripts/snapshot.py +47 -0
- .scider/skills/exploratory-data-analysis/SKILL.md +442 -0
- .scider/skills/exploratory-data-analysis/assets/report_template.md +196 -0
- .scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md +664 -0
- .scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md +664 -0
- .scider/skills/exploratory-data-analysis/references/general_scientific_formats.md +518 -0
- .scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md +620 -0
- .scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md +517 -0
- .scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md +633 -0
- .scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py +548 -0
- .scider/skills/literature-review-agent/SKILL.md +357 -0
- .scider/skills/literature-review-agent/references/citation-density-rule.md +71 -0
- .scider/skills/literature-review-agent/references/discovery-pipeline.md +151 -0
- .scider/skills/literature-review-agent/references/exa-search-cookbook.md +245 -0
- .scider/skills/literature-review-agent/references/prompt.md +77 -0
- .scider/skills/literature-review-agent/references/s2-api-cookbook.md +138 -0
- .scider/skills/literature-review-agent/references/verification-rules.md +100 -0
- .scider/skills/literature-review-agent/scripts/bibtex_format.py +211 -0
- .scider/skills/literature-review-agent/scripts/check_cutoff.py +63 -0
- .scider/skills/literature-review-agent/scripts/citation_coverage.py +104 -0
- .scider/skills/literature-review-agent/scripts/dedupe_by_id.py +98 -0
- .scider/skills/literature-review-agent/scripts/exa_search.py +169 -0
- .scider/skills/literature-review-agent/scripts/levenshtein_match.py +73 -0
- .scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py +156 -0
- .scider/skills/literature-review-agent/scripts/s2_cache.py +113 -0
- .scider/skills/literature-review-agent/scripts/s2_search.py +208 -0
- .scider/skills/literature-review-agent/scripts/sync_keys.py +119 -0
- .scider/skills/literature-review-agent/scripts/validate_pool.py +145 -0
- .scider/skills/matplotlib/SKILL.md +356 -0
- .scider/skills/matplotlib/references/api_reference.md +412 -0
- .scider/skills/matplotlib/references/common_issues.md +563 -0
- .scider/skills/matplotlib/references/plot_types.md +476 -0
- .scider/skills/matplotlib/references/styling_guide.md +589 -0
- .scider/skills/matplotlib/scripts/plot_template.py +446 -0
- .scider/skills/matplotlib/scripts/style_configurator.py +413 -0
.dockerignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.so
|
| 7 |
+
*.egg
|
| 8 |
+
*.egg-info/
|
| 9 |
+
dist/
|
| 10 |
+
build/
|
| 11 |
+
.env
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
ENV/
|
| 15 |
+
env/
|
| 16 |
+
*.log
|
| 17 |
+
workspace/
|
| 18 |
+
.pytest_cache/
|
| 19 |
+
.coverage
|
| 20 |
+
htmlcov/
|
| 21 |
+
.DS_Store
|
| 22 |
+
*.swp
|
| 23 |
+
*.swo
|
| 24 |
+
*~
|
| 25 |
+
.git/
|
| 26 |
+
.github/
|
| 27 |
+
.claude/
|
| 28 |
+
benchmarks/
|
| 29 |
+
tmp_*
|
| 30 |
+
rsync_tmp_*
|
| 31 |
+
*.ipynb
|
.env.template
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- SciDER ---
|
| 2 |
+
|
| 3 |
+
# Provide any combination of provider keys. SciDER's unified model catalog
|
| 4 |
+
# (model_settings/catalog.yaml) lets you mix-and-match providers per role —
|
| 5 |
+
# e.g. ideation on Gemini, experiment_coding on GPT-5. Models whose key is
|
| 6 |
+
# missing are simply marked unavailable in the frontend.
|
| 7 |
+
OPENAI_API_KEY=...
|
| 8 |
+
GEMINI_API_KEY=...
|
| 9 |
+
ANTHROPIC_API_KEY=...
|
| 10 |
+
# Optional: Semantic Scholar API key for better rate limits (https://www.semanticscholar.org/product/api)
|
| 11 |
+
# S2_API_KEY=...
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
## User Approval
|
| 16 |
+
# Set to true to enable interactive user approval at critical agent steps
|
| 17 |
+
USER_APPROVAL_ENABLED=true
|
| 18 |
+
|
| 19 |
+
## HuggingFace Dataset Download
|
| 20 |
+
# Set to true to allow using HuggingFace repo names as data paths
|
| 21 |
+
HF_DATASET_DOWNLOAD_ENABLED=false
|
| 22 |
+
# HF_DATASET_CACHE_DIR=tmp_hf_datasets
|
| 23 |
+
# Maximum dataset size in MB (default 100)
|
| 24 |
+
# HF_DATASET_MAX_SIZE_MB=100
|
| 25 |
+
|
| 26 |
+
## Logging
|
| 27 |
+
# LOGURU_LEVEL=INFO
|
| 28 |
+
LOGURU_LEVEL=DEBUG
|
| 29 |
+
LOG_SYSTEM_PROMPT=false
|
| 30 |
+
|
| 31 |
+
## Coding Agent Switch
|
| 32 |
+
# choice: claude_sdk (default), native, openhands (requires SCIDER_ENABLE_OPENHANDS=1)
|
| 33 |
+
# - claude_sdk: Claude Agent SDK (requires ANTHROPIC_API_KEY)
|
| 34 |
+
# - native: SciDER's built-in coding agent (uses experiment_coding model, any LiteLLM provider)
|
| 35 |
+
# - openhands: OpenHands sandbox (requires SCIDER_ENABLE_OPENHANDS=1)
|
| 36 |
+
# legacy aliases: v3 = claude_sdk, v2 = openhands
|
| 37 |
+
CODING_AGENT_VERSION=claude_sdk
|
| 38 |
+
# choice: See https://platform.claude.com/docs/en/about-claude/models/overview
|
| 39 |
+
CLAUDE_SDK_MODEL=claude-haiku-4-5
|
| 40 |
+
|
| 41 |
+
## OpenHands
|
| 42 |
+
SCIDER_ENABLE_OPENHANDS=false
|
| 43 |
+
OPENHANDS_MODEL=gemini/gemini-2.5-flash
|
| 44 |
+
OPENHANDS_API_KEY=...
|
| 45 |
+
|
| 46 |
+
## Context Compression Pipeline (runs in query() before each LLM call)
|
| 47 |
+
# Level 1: Persist oversized tool results to disk
|
| 48 |
+
COMPACT_TOOL_RESULT_MAX_CHARS=50000
|
| 49 |
+
# COMPACT_TOOL_RESULT_PREVIEW_CHARS=2000
|
| 50 |
+
# Level 2: Snip old tool results (keep N most recent)
|
| 51 |
+
COMPACT_SNIP_KEEP_RECENT=5
|
| 52 |
+
# Level 3: LLM-based autocompact (trigger threshold in tokens)
|
| 53 |
+
COMPACT_AUTOCOMPACT_TOKEN_THRESHOLD=256000
|
| 54 |
+
COMPACT_AUTOCOMPACT_MODEL=history
|
| 55 |
+
# COMPACT_AUTOCOMPACT_KEEP_RATIO=0.4
|
| 56 |
+
# COMPACT_AUTOCOMPACT_KEEP_FIRST_N=4
|
| 57 |
+
|
| 58 |
+
## Permissions
|
| 59 |
+
# Path to tool permission overrides (JSON file)
|
| 60 |
+
# SCIDER_PERMISSIONS_FILE=.claude/permissions.json
|
| 61 |
+
|
| 62 |
+
## Memory System (file-based cross-session memory in .scider/memory/)
|
| 63 |
+
# SCIDER_MEMORY_READ=true # Load memory index into agent context (default: true)
|
| 64 |
+
# SCIDER_MEMORY_WRITE=true # Allow agents to write new memories (default: true)
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# temporary files
|
| 210 |
+
tmp_*
|
| 211 |
+
rsync_tmp_*
|
| 212 |
+
.aider*
|
| 213 |
+
data_analysis.md
|
| 214 |
+
software-agent-sdk
|
| 215 |
+
env
|
| 216 |
+
|
| 217 |
+
streamlit-client/case-study-memory/
|
| 218 |
+
saved_chats/
|
| 219 |
+
|
| 220 |
+
# vibe coding
|
| 221 |
+
.claude/
|
| 222 |
+
.agents/
|
| 223 |
+
.windsurf/
|
| 224 |
+
|
| 225 |
+
# Ignore .scider/ contents but allow specific entries to be tracked
|
| 226 |
+
.scider/*
|
| 227 |
+
!.scider/skills/
|
| 228 |
+
!.scider/rules/
|
| 229 |
+
!.scider/SCIDER.md
|
| 230 |
+
|
| 231 |
+
workspace/
|
.gitmodules
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[submodule "benchmarks/mlebench/mle-bench"]
|
| 2 |
+
path = benchmarks/mlebench/mle-bench
|
| 3 |
+
url = git@github.com:leonardodalinky/mle-bench.git
|
| 4 |
+
[submodule "benchmarks/scicodebench/SciCode"]
|
| 5 |
+
path = benchmarks/scicodebench/SciCode
|
| 6 |
+
url = git@github.com:leonardodalinky/SciCode.git
|
| 7 |
+
[submodule "benchmarks/aiideabench/AI_Idea_Bench"]
|
| 8 |
+
path = benchmarks/aiideabench/AI_Idea_Bench
|
| 9 |
+
url = git@github.com:leonardodalinky/AI_Idea_Bench_2025.git
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 3 |
+
rev: v6.0.0
|
| 4 |
+
hooks:
|
| 5 |
+
- id: trailing-whitespace
|
| 6 |
+
- id: end-of-file-fixer
|
| 7 |
+
- id: name-tests-test
|
| 8 |
+
- id: requirements-txt-fixer
|
| 9 |
+
- repo: https://github.com/pycqa/isort
|
| 10 |
+
rev: 5.13.2
|
| 11 |
+
hooks:
|
| 12 |
+
- id: isort
|
| 13 |
+
args: ["--profile", "black", "--line-length=100", "--python-version=310"]
|
| 14 |
+
- repo: https://github.com/psf/black
|
| 15 |
+
rev: 25.1.0
|
| 16 |
+
hooks:
|
| 17 |
+
- id: black
|
| 18 |
+
args: ["--line-length=100", "--target-version=py310"]
|
| 19 |
+
- repo: https://github.com/kynan/nbstripout
|
| 20 |
+
rev: 0.8.2
|
| 21 |
+
hooks:
|
| 22 |
+
- id: nbstripout
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.13
|
.scider/SCIDER.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SCIDER.md
|
| 2 |
+
|
| 3 |
+
## Approach
|
| 4 |
+
- Think before acting. Read existing files before writing code.
|
| 5 |
+
- Be concise in output but thorough in reasoning.
|
| 6 |
+
- Prefer editing over rewriting whole files.
|
| 7 |
+
- Do not re-read files you have already read unless the file may have changed.
|
| 8 |
+
- Test your code before declaring done.
|
| 9 |
+
- No sycophantic openers or closing fluff.
|
| 10 |
+
- Keep solutions simple and direct.
|
| 11 |
+
- User instructions always override this file.
|
.scider/skills/content-refinement-agent/SKILL.md
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: content-refinement-agent
|
| 3 |
+
description: Step 5 of the PaperOrchestra pipeline (arXiv:2604.05018). Iteratively refine drafts/paper.tex by simulating peer review and applying targeted revisions, with strict accept/revert halt rules. Maintains a worklog and snapshots each iteration so revert is real, not symbolic. TRIGGER when the orchestrator delegates Step 5 or when the user asks to "refine the draft", "iterate on the paper", or "run peer review on this paper".
|
| 4 |
+
allowed_agents: [writing]
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Content Refinement Agent (Step 5)
|
| 8 |
+
|
| 9 |
+
Faithful implementation of the Content Refinement Agent from PaperOrchestra
|
| 10 |
+
(Song et al., 2026, arXiv:2604.05018, §4 Step 5, App. F.1 pp. 49–51).
|
| 11 |
+
|
| 12 |
+
**Cost: ~5–7 LLM calls** (App. B), typically ~3 refinement iterations, each
|
| 13 |
+
consisting of one reviewer call and one revision call.
|
| 14 |
+
|
| 15 |
+
The paper highlights this step as one of the largest contributors to overall
|
| 16 |
+
quality: refinement alone accounts for +19% (CVPR) and +22% (ICLR) absolute
|
| 17 |
+
acceptance-rate improvement (Fig. 4). Get this step right.
|
| 18 |
+
|
| 19 |
+
## Inputs
|
| 20 |
+
|
| 21 |
+
- `workspace/drafts/paper.tex` — output of Step 4
|
| 22 |
+
- `workspace/inputs/conference_guidelines.md`
|
| 23 |
+
- `workspace/inputs/experimental_log.md` — used as ground truth for the
|
| 24 |
+
hallucination check
|
| 25 |
+
- `workspace/citation_pool.json` / `workspace/refs.bib` — the allowed
|
| 26 |
+
bibliography
|
| 27 |
+
|
| 28 |
+
## Outputs
|
| 29 |
+
|
| 30 |
+
- `workspace/refinement/iter1/`, `iter2/`, `iter3/` — per-iteration snapshots
|
| 31 |
+
containing `paper.tex`, `paper.pdf`, `review.json`, `score.json`
|
| 32 |
+
- `workspace/refinement/worklog.json` — append-only history of decisions
|
| 33 |
+
- `workspace/final/paper.tex` and `workspace/final/paper.pdf` — copy of the
|
| 34 |
+
best accepted snapshot
|
| 35 |
+
|
| 36 |
+
## The refinement loop
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
prev_score = score(paper.tex) # baseline from initial draft
|
| 40 |
+
snapshot iter0/
|
| 41 |
+
|
| 42 |
+
for iter in 1..ITER_CAP (default 3):
|
| 43 |
+
1. simulate_review(paper.tex) → review.json
|
| 44 |
+
(uses `references/reviewer-rubric.md` rubric)
|
| 45 |
+
|
| 46 |
+
2. apply_revision(paper.tex, review.json) → new_paper.tex
|
| 47 |
+
(uses verbatim Refinement Agent prompt at `references/prompt.md`)
|
| 48 |
+
|
| 49 |
+
3. snapshot iter<N>/ with new_paper.tex, review.json
|
| 50 |
+
latexmk -pdf new_paper.tex → iter<N>/paper.pdf
|
| 51 |
+
|
| 52 |
+
4. score(new_paper.tex) → curr_score
|
| 53 |
+
|
| 54 |
+
5. decide via score_delta.py:
|
| 55 |
+
- if curr.overall > prev.overall: ACCEPT
|
| 56 |
+
- elif curr.overall == prev.overall and net_subaxis ≥ 0: ACCEPT
|
| 57 |
+
- else: REVERT
|
| 58 |
+
|
| 59 |
+
6. apply_worklog.py to append the decision
|
| 60 |
+
|
| 61 |
+
7. if REVERT or HALT_PLATEAU or no actionable weaknesses or iter == ITER_CAP: HALT
|
| 62 |
+
|
| 63 |
+
paper.tex ← new_paper.tex (only on ACCEPT)
|
| 64 |
+
prev_score ← curr_score
|
| 65 |
+
|
| 66 |
+
cp <best iter>/paper.tex → workspace/final/paper.tex
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
The "best" snapshot at HALT is the one with the highest accepted overall
|
| 70 |
+
score. On a REVERT halt, the best is the iteration immediately before the
|
| 71 |
+
revert.
|
| 72 |
+
|
| 73 |
+
## Step-by-step
|
| 74 |
+
|
| 75 |
+
### 0. Snapshot the initial draft
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
python skills/content-refinement-agent/scripts/snapshot.py \
|
| 79 |
+
--src workspace/drafts/paper.tex \
|
| 80 |
+
--dst workspace/refinement/iter0/
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
This creates `iter0/paper.tex`. Then compile to `iter0/paper.pdf`:
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
cd workspace/refinement/iter0/ && latexmk -pdf -interaction=nonstopmode paper.tex
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Score it (see Steps 1–2 below) → `iter0/score.json`.
|
| 90 |
+
|
| 91 |
+
### 1. Simulate peer review
|
| 92 |
+
|
| 93 |
+
For each iteration N starting from 1:
|
| 94 |
+
|
| 95 |
+
Load `references/reviewer-rubric.md` as the system prompt for the simulated
|
| 96 |
+
reviewer call. The reviewer reads `iter<N-1>/paper.pdf` (or `paper.tex` if
|
| 97 |
+
your host LLM lacks PDF input) and produces a JSON of strengths,
|
| 98 |
+
weaknesses, questions, and per-axis scores.
|
| 99 |
+
|
| 100 |
+
The rubric is structured to mimic AgentReview (Jin et al., 2024) — the
|
| 101 |
+
paper's chosen evaluator. We ship a faithful rubric in the references
|
| 102 |
+
directory; the host agent's LLM does the actual reviewing.
|
| 103 |
+
|
| 104 |
+
Save to `workspace/refinement/iter<N>/review.json`.
|
| 105 |
+
|
| 106 |
+
### 2. Score the draft
|
| 107 |
+
|
| 108 |
+
The reviewer call produces both qualitative feedback and a per-axis score:
|
| 109 |
+
|
| 110 |
+
```json
|
| 111 |
+
{
|
| 112 |
+
"axis_scores": {
|
| 113 |
+
"scientific_depth": {"score": 65, "justification": "..."},
|
| 114 |
+
"technical_execution": {"score": 70, "justification": "..."},
|
| 115 |
+
"logical_flow": {"score": 60, "justification": "..."},
|
| 116 |
+
"writing_clarity": {"score": 55, "justification": "..."},
|
| 117 |
+
"evidence_presentation":{"score": 72, "justification": "..."},
|
| 118 |
+
"academic_style": {"score": 68, "justification": "..."}
|
| 119 |
+
},
|
| 120 |
+
"overall_score": 64.5,
|
| 121 |
+
"strengths": [...],
|
| 122 |
+
"weaknesses": [...],
|
| 123 |
+
"questions": [...]
|
| 124 |
+
}
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
Save to `iter<N>/score.json`. (Combined with `review.json` if your host
|
| 128 |
+
emits one document; the schemas overlap.)
|
| 129 |
+
|
| 130 |
+
### 3. Apply revision
|
| 131 |
+
|
| 132 |
+
Load the **verbatim Content Refinement Agent prompt** at `references/prompt.md`.
|
| 133 |
+
Prepend the Anti-Leakage Prompt. Inputs:
|
| 134 |
+
|
| 135 |
+
- `paper.tex` — current draft
|
| 136 |
+
- `paper.pdf` — compiled PDF (multimodal context if available)
|
| 137 |
+
- `conference_guidelines.md`
|
| 138 |
+
- `experimental_log.md` — ground truth for numeric claims
|
| 139 |
+
- `worklog.json` — history of previous changes
|
| 140 |
+
- `citation_pool.json` — the allowed bibliography
|
| 141 |
+
- `reviewer_feedback` — the JSON from Step 1
|
| 142 |
+
|
| 143 |
+
The prompt instructs the model to address weaknesses, integrate question
|
| 144 |
+
answers, and emit two output blocks:
|
| 145 |
+
|
| 146 |
+
1. A worklog JSON `{addressed_weaknesses[], integrated_answers[], actions_taken[]}`
|
| 147 |
+
2. The full revised LaTeX code
|
| 148 |
+
|
| 149 |
+
Save the revised LaTeX as `iter<N>/paper.tex`. Append the worklog JSON to
|
| 150 |
+
`workspace/refinement/worklog.json` via `apply_worklog.py`.
|
| 151 |
+
|
| 152 |
+
### 4. Compile and re-score
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
Then re-run the simulated review on the new draft → updated `score.json`
|
| 159 |
+
for the new iteration. (This is the "re-score after revision" call.)
|
| 160 |
+
|
| 161 |
+
### 5. Apply the accept/revert decision
|
| 162 |
+
|
| 163 |
+
The calling loop must track `CONSECUTIVE_SMALL` (starts at 0) and pass it
|
| 164 |
+
on each call so `score_delta.py` can detect the plateau:
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
python skills/content-refinement-agent/scripts/score_delta.py \
|
| 168 |
+
--prev workspace/refinement/iter<N-1>/score.json \
|
| 169 |
+
--curr workspace/refinement/iter<N>/score.json \
|
| 170 |
+
--plateau-threshold 1.0 \
|
| 171 |
+
--plateau-streak 3 \
|
| 172 |
+
--consecutive-small $CONSECUTIVE_SMALL \
|
| 173 |
+
> workspace/refinement/iter<N>/delta.json
|
| 174 |
+
|
| 175 |
+
EXIT=$?
|
| 176 |
+
# Update streak for next iteration:
|
| 177 |
+
CONSECUTIVE_SMALL=$(python3 -c "
|
| 178 |
+
import json
|
| 179 |
+
d = json.load(open('workspace/refinement/iter<N>/delta.json'))
|
| 180 |
+
print(d['consecutive_small'])
|
| 181 |
+
")
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
Exit codes:
|
| 185 |
+
- `0` — ACCEPT (overall improved or tied with non-negative net sub-axis, no plateau)
|
| 186 |
+
- `1` — REVERT (overall decreased)
|
| 187 |
+
- `2` — REVERT (tied overall, but net sub-axis change negative)
|
| 188 |
+
- `4` — HALT_PLATEAU (accepted but N consecutive iterations below threshold — stop early)
|
| 189 |
+
|
| 190 |
+
Behavior:
|
| 191 |
+
|
| 192 |
+
- **ACCEPT (exit 0)**: keep `iter<N>/paper.tex` as the new best. Continue to iter N+1.
|
| 193 |
+
- **REVERT (exit 1 or 2)**: copy `iter<N-1>/paper.tex` back as canonical, halt.
|
| 194 |
+
- **HALT_PLATEAU (exit 4)**: keep current (it was accepted), but stop — further
|
| 195 |
+
iterations are unlikely to yield meaningful gains. In practice ~85% of
|
| 196 |
+
refinement gain comes in iteration 1; the plateau fires when subsequent
|
| 197 |
+
iterations improve by less than 1 point for 3 consecutive rounds.
|
| 198 |
+
|
| 199 |
+
Always log the decision via `apply_worklog.py --decision ...`.
|
| 200 |
+
|
| 201 |
+
### 6. Halt rules
|
| 202 |
+
|
| 203 |
+
Halt the loop when ANY of these is true:
|
| 204 |
+
|
| 205 |
+
1. Iteration count reaches `ITER_CAP` (default 3).
|
| 206 |
+
2. `score_delta.py` returned exit code 1 or 2 (REVERT).
|
| 207 |
+
3. The simulated reviewer's `weaknesses` list is empty (no actionable
|
| 208 |
+
feedback to apply).
|
| 209 |
+
4. `score_delta.py` returned exit code 4 (HALT_PLATEAU — plateau early-stop).
|
| 210 |
+
|
| 211 |
+
### 7. Promote the best snapshot
|
| 212 |
+
|
| 213 |
+
Identify the iteration with the highest accepted `overall_score` (this may
|
| 214 |
+
be the latest accepted iteration, OR an earlier one if a later iteration
|
| 215 |
+
was reverted). Copy:
|
| 216 |
+
|
| 217 |
+
```bash
|
| 218 |
+
cp workspace/refinement/iter<best>/paper.tex workspace/final/paper.tex
|
| 219 |
+
cp workspace/refinement/iter<best>/paper.pdf workspace/final/paper.pdf
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
Then in the final report, tell the user:
|
| 223 |
+
- How many iterations were run
|
| 224 |
+
- The final overall score
|
| 225 |
+
- The score trajectory (e.g., "iter0 64.5 → iter1 67.3 (accept) → iter2 69.1 (accept) → iter3 68.9 (revert, halt)")
|
| 226 |
+
- Which iteration was promoted
|
| 227 |
+
|
| 228 |
+
## Critical safety constraints (App. F.1 page 50–51)
|
| 229 |
+
|
| 230 |
+
The paper explicitly notes that early versions of the Refinement Agent
|
| 231 |
+
"exploited the automated reviewer's scoring function by superficially
|
| 232 |
+
listing missing baselines as limitations to artificially inflate
|
| 233 |
+
acceptance scores." The verbatim prompt forbids this. **You must honor it:**
|
| 234 |
+
|
| 235 |
+
- **Ignore reviewer requests for new experiments, ablations, or baselines.**
|
| 236 |
+
The Refinement Agent's job is presentation, not new science. If the
|
| 237 |
+
reviewer asks for missing data, simply skip those points — do NOT add
|
| 238 |
+
fabricated experiments, do NOT add a "future work" item promising them.
|
| 239 |
+
- **Never explicitly state a limitation.** The phrase "we acknowledge as a
|
| 240 |
+
limitation that..." is forbidden. The model can address weaknesses
|
| 241 |
+
through clearer explanation, but must not game the evaluator by listing
|
| 242 |
+
them defensively.
|
| 243 |
+
- **All numeric claims MUST be verified against `experimental_log.md`.**
|
| 244 |
+
The agent cannot introduce new numbers, only re-present existing ones.
|
| 245 |
+
|
| 246 |
+
These rules prevent reward hacking and keep the refinement loop honest.
|
| 247 |
+
|
| 248 |
+
## Resources
|
| 249 |
+
|
| 250 |
+
- `references/prompt.md` — verbatim Content Refinement Agent prompt from App. F.1
|
| 251 |
+
- `references/reviewer-rubric.md` — AgentReview-style scoring rubric (6 axes)
|
| 252 |
+
- `references/halt-rules.md` — accept/revert/halt logic in formal pseudocode
|
| 253 |
+
- `references/safe-revision-rules.md` — anti-reward-hack constraints
|
| 254 |
+
- `scripts/score_delta.py` — accept/revert decision from two score JSONs
|
| 255 |
+
- `scripts/apply_worklog.py` — append iteration entries to worklog.json
|
| 256 |
+
- `scripts/snapshot.py` — copy paper.tex/paper.pdf into iter<N>/ for rollback
|
.scider/skills/content-refinement-agent/references/halt-rules.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Halt Rules
|
| 2 |
+
|
| 3 |
+
Source: arXiv:2604.05018, §4 Step 5 ("Iterative Content Refinement"):
|
| 4 |
+
|
| 5 |
+
> After modifying the LaTeX source to address weaknesses, revisions are
|
| 6 |
+
> accepted if the overall score increases, or if it ties when net sub-axis
|
| 7 |
+
> gains are non-negative. The agent immediately reverts to the previous
|
| 8 |
+
> version and halts upon any overall score decrease, negative tie-breaker,
|
| 9 |
+
> or reaching the iteration limit.
|
| 10 |
+
|
| 11 |
+
Encoded as deterministic logic in `scripts/score_delta.py`. This file is the
|
| 12 |
+
human-readable specification.
|
| 13 |
+
|
| 14 |
+
## Definitions
|
| 15 |
+
|
| 16 |
+
Let:
|
| 17 |
+
|
| 18 |
+
- `prev` = score JSON from the previous accepted iteration
|
| 19 |
+
- `curr` = score JSON from the just-completed iteration
|
| 20 |
+
- `prev.overall` = `prev.overall_score`
|
| 21 |
+
- `curr.overall` = `curr.overall_score`
|
| 22 |
+
- `subaxis_delta(axis)` = `curr.axis_scores[axis].score - prev.axis_scores[axis].score`
|
| 23 |
+
- `net_subaxis_delta` = `sum(subaxis_delta(a) for a in 6 axes)`
|
| 24 |
+
|
| 25 |
+
## Decision rules (in order)
|
| 26 |
+
|
| 27 |
+
```
|
| 28 |
+
if curr.overall > prev.overall:
|
| 29 |
+
DECISION = ACCEPT_IMPROVED
|
| 30 |
+
|
| 31 |
+
elif curr.overall == prev.overall:
|
| 32 |
+
if net_subaxis_delta >= 0:
|
| 33 |
+
DECISION = ACCEPT_TIED_NON_NEGATIVE
|
| 34 |
+
else:
|
| 35 |
+
DECISION = REVERT_TIED_NEGATIVE_SUBAXIS
|
| 36 |
+
|
| 37 |
+
else: # curr.overall < prev.overall
|
| 38 |
+
DECISION = REVERT_OVERALL_DECREASED
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
The script exits with:
|
| 42 |
+
|
| 43 |
+
| Exit code | Meaning | Loop action |
|
| 44 |
+
|---|---|---|
|
| 45 |
+
| 0 | ACCEPT_IMPROVED | keep new draft, continue loop |
|
| 46 |
+
| 0 | ACCEPT_TIED_NON_NEGATIVE | keep new draft, continue loop |
|
| 47 |
+
| 1 | REVERT_OVERALL_DECREASED | rollback to prev, halt loop |
|
| 48 |
+
| 2 | REVERT_TIED_NEGATIVE_SUBAXIS | rollback to prev, halt loop |
| 4 | HALT_PLATEAU | keep new draft, halt loop (plateau early-stop; see below) |
|
| 49 |
+
|
| 50 |
+
The script also prints a one-line decision string and a JSON object on
|
| 51 |
+
stdout for the host agent to log.
|
| 52 |
+
|
| 53 |
+
## Loop-level halt conditions
|
| 54 |
+
|
| 55 |
+
In addition to the per-iteration accept/revert decision, the loop halts
|
| 56 |
+
when ANY of these is true:
|
| 57 |
+
|
| 58 |
+
1. **Iteration cap reached.** Default 3 (configurable via env var
|
| 59 |
+
`PO_REFINE_MAX_ITER`). Per the paper Table 7, the typical
|
| 60 |
+
refinement count is "3× content refinement loop".
|
| 61 |
+
2. **REVERT decision** from `score_delta.py` (exit code 1 or 2).
|
| 62 |
+
3. **Empty weaknesses list.** If the simulated reviewer's `weaknesses`
|
| 63 |
+
array is empty, there is nothing to fix — halt.
|
| 64 |
+
4. **Plateau early-stop (exit code 4).** `score_delta.py` returns
|
| 65 |
+
`HALT_PLATEAU` when `N` consecutive accepted iterations each have
|
| 66 |
+
`overall_delta < threshold`. Default: threshold=1.0 points, N=3.
|
| 67 |
+
Configurable via `--plateau-threshold` and `--plateau-streak`.
|
| 68 |
+
|
| 69 |
+
The calling loop must pass `--consecutive-small <count>` to
|
| 70 |
+
`score_delta.py` to track the streak across iterations:
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
CONSECUTIVE_SMALL=0
|
| 74 |
+
for iter in 1 2 3 ...; do
|
| 75 |
+
# ... run refinement LLM call ...
|
| 76 |
+
python score_delta.py \
|
| 77 |
+
--prev iter$((iter-1))/score.json \
|
| 78 |
+
--curr iter${iter}/score.json \
|
| 79 |
+
--plateau-threshold 1.0 \
|
| 80 |
+
--plateau-streak 3 \
|
| 81 |
+
--consecutive-small $CONSECUTIVE_SMALL \
> iter${iter}/delta.json
|
| 82 |
+
EXIT=$?
|
| 83 |
+
# Update streak counter from script output
|
| 84 |
+
CONSECUTIVE_SMALL=$(python -c "import json,sys; \
|
| 85 |
+
d=json.loads(open('iter${iter}/delta.json').read()); \
|
| 86 |
+
print(d['consecutive_small'])")
|
| 87 |
+
if [ $EXIT -ne 0 ]; then break; fi
|
| 88 |
+
done
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
**Why this matters**: in practice, ~85% of the refinement gain comes
|
| 92 |
+
in the first iteration (scores jump 5-8 points). Subsequent iterations
|
| 93 |
+
typically improve by <1 point. Without early-stop, the loop runs 3 full
|
| 94 |
+
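LLM calls even when iterations 2 and 3 contribute near-zero value. Note that with the defaults (cap 3, streak 3) the plateau can only fire on the third iteration, which is also the iteration cap, so it saves no LLM calls; lower `--plateau-streak` or raise the cap for the early-stop to take effect sooner.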
|
| 95 |
+
|
| 96 |
+
## Promoting the best snapshot
|
| 97 |
+
|
| 98 |
+
After halt, identify the iteration with the highest `accepted` overall
|
| 99 |
+
score:
|
| 100 |
+
|
| 101 |
+
```python
|
| 102 |
+
accepted_iters = [it for it in worklog.iterations if it.decision.startswith("ACCEPT")]
|
| 103 |
+
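best = max(reversed(accepted_iters), key=lambda it: it.score.overall_score)  # reversed: max() keeps the first hit on a tie, so this prefers the latest accepted iteration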
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
If the loop halted on REVERT, `best` is the iteration immediately *before*
|
| 107 |
+
the reverted one. Copy its `paper.tex` and `paper.pdf` to
|
| 108 |
+
`workspace/final/`.
|
| 109 |
+
|
| 110 |
+
## Worked example
|
| 111 |
+
|
| 112 |
+
Suppose:
|
| 113 |
+
|
| 114 |
+
| iter | overall | depth | exec | flow | clarity | evidence | style | decision |
|
| 115 |
+
|---|---|---|---|---|---|---|---|---|
|
| 116 |
+
| 0 | 64.5 | 65 | 70 | 60 | 55 | 72 | 68 | (baseline) |
|
| 117 |
+
| 1 | 67.3 | 68 | 73 | 64 | 58 | 74 | 70 | ACCEPT_IMPROVED |
|
| 118 |
+
| 2 | 67.3 | 70 | 73 | 64 | 58 | 73 | 71 | ACCEPT_TIED_NON_NEGATIVE (Σdelta = +2) |
|
| 119 |
+
| 3 | 66.0 | 70 | 70 | 62 | 56 | 73 | 71 | REVERT_OVERALL_DECREASED, HALT |
|
| 120 |
+
|
| 121 |
+
Promoted: iter 2 (`final/paper.tex` ← `iter2/paper.tex`).
|
| 122 |
+
Score trajectory in the run report:
|
| 123 |
+
```
|
| 124 |
+
64.5 → 67.3 (accept) → 67.3 (accept tied) → 66.0 (revert, halt)
|
| 125 |
+
```
|
.scider/skills/content-refinement-agent/references/prompt.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Content Refinement Agent — verbatim prompt
|
| 2 |
+
|
| 3 |
+
**Source: arXiv:2604.05018, Appendix F.1, pages 49–51 (verbatim).**
|
| 4 |
+
|
| 5 |
+
This is the exact prompt used by the Content Refinement Agent in the paper.
|
| 6 |
+
Use it as your system message when applying a revision. The Anti-Leakage
|
| 7 |
+
Prompt (`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be
|
| 8 |
+
prepended.
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
Role: Senior AI Researcher.
|
| 14 |
+
|
| 15 |
+
Task: Revise and strengthen a LaTeX research paper by systematically
|
| 16 |
+
addressing peer review feedback.
|
| 17 |
+
|
| 18 |
+
You are the author responsible for the "Rebuttal via Revision" phase. You
|
| 19 |
+
will receive:
|
| 20 |
+
- paper.tex: The current LaTeX source code.
|
| 21 |
+
- paper.pdf: The compiled PDF context.
|
| 22 |
+
- conference_guidelines.md: The formatting and page limit rules.
|
| 23 |
+
- experimental_log.md: The Ground Truth for all data and metrics.
|
| 24 |
+
- worklog.json: History of previous changes.
|
| 25 |
+
- citation_map.json: The allowed bibliography.
|
| 26 |
+
- reviewer_feedback: A JSON object containing specific Strengths,
|
| 27 |
+
Weaknesses, Questions, and Decisions from an LLM reviewer.
|
| 28 |
+
|
| 29 |
+
Your Goal
|
| 30 |
+
|
| 31 |
+
1. Analyze Feedback: Deconstruct the reviewer_feedback into actionable
|
| 32 |
+
editing tasks.
|
| 33 |
+
2. Address Weaknesses: Rewrite sections to clarify logic, strengthen
|
| 34 |
+
arguments, or justify design choices pointed out as weak.
|
| 35 |
+
3. Integrate Answers: Incorporate answers to the reviewer's "Questions"
|
| 36 |
+
directly into the manuscript (e.g., adding training cost details to
|
| 37 |
+
the Implementation section).
|
| 38 |
+
4. Execution: Generate a JSON worklog of your editorial decisions and the
|
| 39 |
+
full, revised LaTeX source.
|
| 40 |
+
|
| 41 |
+
Critical Execution Standards
|
| 42 |
+
|
| 43 |
+
1. Content Revision Strategy
|
| 44 |
+
- Weakness Mitigation: If the reviewer flags "incremental novelty",
|
| 45 |
+
rewrite the Introduction and Related Work to explicitly contrast
|
| 46 |
+
your contribution against prior art. If they flag "unclear
|
| 47 |
+
methodology", restructure the relevant section for clarity.
|
| 48 |
+
- Answering Questions: Do NOT write a separate response letter. If the
|
| 49 |
+
reviewer asks "What is the inference latency?", you must find a
|
| 50 |
+
natural place in the paper (e.g., Experiments or Discussion) to
|
| 51 |
+
insert that information, ensuring it aligns with experimental_log.md.
|
| 52 |
+
- Preserve Strengths: Do not delete or heavily alter sections listed
|
| 53 |
+
under "Strengths" unless necessary for space or flow.
|
| 54 |
+
|
| 55 |
+
2. Data Integrity & Hallucination Check
|
| 56 |
+
- Ground Truth: All numerical claims (accuracy, parameter count,
|
| 57 |
+
training hours, latency) MUST be verified against
|
| 58 |
+
experimental_log.md.
|
| 59 |
+
- Missing Data: If the reviewer asks for new experiments, ablations, or
|
| 60 |
+
baselines that are NOT in experimental_log.md, simply ignore those
|
| 61 |
+
specific requests. Your job is purely presentation refinement of the
|
| 62 |
+
existing completed experiments, not adding or promising to add new
|
| 63 |
+
experiments.
|
| 64 |
+
|
| 65 |
+
3. Writing Style & Tone
|
| 66 |
+
- Academic Tone: Maintain a formal, objective, and precise tone. Avoid
|
| 67 |
+
defensive language.
|
| 68 |
+
- Conciseness: If the paper is near the page limit, prioritize density
|
| 69 |
+
of information over flowery prose.
|
| 70 |
+
- Flow: Ensure that new insertions (answers to questions) transition
|
| 71 |
+
smoothly with existing text.
|
| 72 |
+
|
| 73 |
+
4. LaTeX & Citation Integrity
|
| 74 |
+
- Structure: Do not break the LaTeX compilation. Keep packages and
|
| 75 |
+
environments stable. If using figure* for wide figures, ensure they
|
| 76 |
+
are closed with \end{{figure*}} (not \end{{figure}}). Check for
|
| 77 |
+
completeness.
|
| 78 |
+
- Citations: Use ONLY keys from citation_map.json.
|
| 79 |
+
|
| 80 |
+
Output Format (Strict)
|
| 81 |
+
|
| 82 |
+
You MUST return your response in two distinct code blocks in this exact
|
| 83 |
+
order:
|
| 84 |
+
|
| 85 |
+
1. Worklog for the current turn (JSON):
|
| 86 |
+
{{
|
| 87 |
+
"addressed_weaknesses": [
|
| 88 |
+
"Clarified contribution novelty in Intro (Reviewer point 2)",
|
| 89 |
+
"Added justification for two-stage training (Reviewer point 1)"
|
| 90 |
+
],
|
| 91 |
+
"integrated_answers": [
|
| 92 |
+
"Added training cost (45 GPU hours) to Implementation Details",
|
| 93 |
+
"Added epsilon hyperparameter explanation to Method section"
|
| 94 |
+
],
|
| 95 |
+
"actions_taken": [
|
| 96 |
+
"Rewrote Section 3.2 for clarity",
|
| 97 |
+
"Inserted new paragraph in Section 5.1 regarding latency"
|
| 98 |
+
]
|
| 99 |
+
}}
|
| 100 |
+
|
| 101 |
+
2. The FULL revised LaTeX code:
|
| 102 |
+
```latex
|
| 103 |
+
... Full revised LaTeX code here ...
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
Important Notes
|
| 107 |
+
|
| 108 |
+
- Completeness: Always provide the FULL LaTeX code. Do not return diffs
|
| 109 |
+
or partial snippets.
|
| 110 |
+
- Responsiveness: Every question in the reviewer_feedback must be
|
| 111 |
+
addressed by improving the presentation, EXCEPT for questions asking
|
| 112 |
+
for new experiments or data not in experimental_log.md (which should
|
| 113 |
+
be ignored). Never explicitly state a limitation.
|
| 114 |
+
- Safety: Do not remove the \documentclass or essential preamble.
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
---
|
| 118 |
+
|
| 119 |
+
## Why "never explicitly state a limitation" is a hard rule
|
| 120 |
+
|
| 121 |
+
From App. F.1 p.51, the paper explains:
|
| 122 |
+
|
| 123 |
+
> We explicitly instruct the Content Refinement Agent to ignore reviewer
|
| 124 |
+
> requests for additional experiments. This constraint is crucial to
|
| 125 |
+
> prevent the agent from generating fabricated results or making false
|
| 126 |
+
> promises within the paper... Furthermore, the directive to "never
|
| 127 |
+
> explicitly state a limitation" prevents reward hacking. During early
|
| 128 |
+
> testing, the agent exploited the automated reviewer's scoring function
|
| 129 |
+
> by superficially listing missing baselines as limitations to
|
| 130 |
+
> artificially inflate acceptance scores. Banning this behavior from the
|
| 131 |
+
> refinement loop forces the agent to genuinely improve the manuscript's
|
| 132 |
+
> presentation and clarity rather than gamifying the evaluation metric.
|
| 133 |
+
|
| 134 |
+
`safe-revision-rules.md` formalizes this as a deterministic gate the host
|
| 135 |
+
agent should run after each revision: grep the new draft for the substring
|
| 136 |
+
`limitation` (case-insensitive) and reject if found.
|
.scider/skills/content-refinement-agent/references/reviewer-rubric.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reviewer Rubric (AgentReview-style)
|
| 2 |
+
|
| 3 |
+
The Content Refinement Agent loop needs a simulated reviewer that produces
|
| 4 |
+
**structured, scoreable** feedback the host agent can compare iteration to
|
| 5 |
+
iteration. The paper uses AgentReview (Jin et al., 2024) as its evaluator
|
| 6 |
+
in §5 (App. F.1 references "AgentReview" by name and uses its output schema:
|
| 7 |
+
"strengths, weaknesses, questions, decisions").
|
| 8 |
+
|
| 9 |
+
This document defines a faithful AgentReview-style reviewer prompt to use
|
| 10 |
+
under any host LLM. Use it as the system message for the simulated review
|
| 11 |
+
call before each refinement iteration.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## System prompt for the simulated reviewer
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
You are an expert academic peer reviewer for a top-tier machine learning
|
| 19 |
+
conference (CVPR, ICLR, NeurIPS, ICML). Read the provided LaTeX paper or
|
| 20 |
+
PDF and produce a rigorous, structured review.
|
| 21 |
+
|
| 22 |
+
Your review must be CONSERVATIVE. High scores are rare and must be
|
| 23 |
+
explicitly justified with concrete evidence from the paper. Assume most
|
| 24 |
+
drafts are not publication-ready.
|
| 25 |
+
|
| 26 |
+
You MUST score the paper on six axes (0-100 each):
|
| 27 |
+
|
| 28 |
+
1. Scientific Depth & Soundness
|
| 29 |
+
- Are the theoretical foundations and experimental setups rigorous?
|
| 30 |
+
- Are claims justified and free of unsupported leaps?
|
| 31 |
+
|
| 32 |
+
2. Technical Execution
|
| 33 |
+
- Within the bounds of the described idea, is the methodology
|
| 34 |
+
implemented innovatively and effectively?
|
| 35 |
+
- Are the design choices justified by the experimental results?
|
| 36 |
+
|
| 37 |
+
3. Logical Flow
|
| 38 |
+
- Do sections transition smoothly from Abstract through Conclusion?
|
| 39 |
+
- Are subsections structured logically with clear signposting?
|
| 40 |
+
|
| 41 |
+
4. Writing Clarity
|
| 42 |
+
- Is the prose precise, concise, and free of repetitive phrasing?
|
| 43 |
+
- Are technical terms defined before use?
|
| 44 |
+
|
| 45 |
+
5. Evidence Presentation
|
| 46 |
+
- Are figures, tables, and results integrated and referenced cleanly?
|
| 47 |
+
- Do visuals support the text claims directly?
|
| 48 |
+
|
| 49 |
+
6. Academic Style
|
| 50 |
+
- Polished, professional academic tone?
|
| 51 |
+
- Consistent terminology throughout?
|
| 52 |
+
|
| 53 |
+
For each axis, provide a score AND a 2-5 sentence evidence-based
|
| 54 |
+
justification quoting concrete passages or pointing to specific failings.
|
| 55 |
+
|
| 56 |
+
Then identify:
|
| 57 |
+
|
| 58 |
+
- Strengths: 3-5 bullet points naming things the paper does well.
|
| 59 |
+
- Weaknesses: 3-5 bullet points naming concrete, fixable issues.
|
| 60 |
+
- Questions: 2-4 specific questions the paper should answer for a
|
| 61 |
+
reader to be convinced.
|
| 62 |
+
- Decision: one of "Strong Accept", "Accept", "Borderline", "Reject",
|
| 63 |
+
"Strong Reject".
|
| 64 |
+
- Overall Score: weighted average 0-100. Use:
|
| 65 |
+
overall = 0.20*depth + 0.20*execution + 0.15*flow
|
| 66 |
+
+ 0.15*clarity + 0.20*evidence + 0.10*style
|
| 67 |
+
|
| 68 |
+
Output STRICT JSON only. No prose outside the JSON.
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Output JSON schema
|
| 72 |
+
|
| 73 |
+
```json
|
| 74 |
+
{
|
| 75 |
+
"axis_scores": {
|
| 76 |
+
"scientific_depth": {
|
| 77 |
+
"score": 65,
|
| 78 |
+
"justification": "Loss formulation is grounded in the cited prior work but the ablation on the audio-visual fusion layer is small (n=3 seeds) and the variance bands overlap, making the claim of necessity weak. Section 3.2 introduces the cached memory without proving its necessity vs. simple pooling."
|
| 79 |
+
},
|
| 80 |
+
"technical_execution": { "score": 70, "justification": "..." },
|
| 81 |
+
"logical_flow": { "score": 60, "justification": "..." },
|
| 82 |
+
"writing_clarity": { "score": 55, "justification": "..." },
|
| 83 |
+
"evidence_presentation": { "score": 72, "justification": "..." },
|
| 84 |
+
"academic_style": { "score": 68, "justification": "..." }
|
| 85 |
+
},
|
| 86 |
+
"strengths": [
|
| 87 |
+
"Clear problem statement in the Introduction with three concrete failure cases of prior SAM-based methods.",
|
| 88 |
+
"Well-organized Related Work that contrasts the three competing paradigms.",
|
| 89 |
+
"..."
|
| 90 |
+
],
|
| 91 |
+
"weaknesses": [
|
| 92 |
+
"The ablation in Table 2 lacks confidence intervals; 0.4 J-index gaps may not be significant.",
|
| 93 |
+
"Section 3.4 introduces the IoU loss term λ without justifying λ=1.0 vs other values.",
|
| 94 |
+
"Figure 3 is referenced once and never discussed in the prose.",
|
| 95 |
+
"..."
|
| 96 |
+
],
|
| 97 |
+
"questions": [
|
| 98 |
+
"What is the inference latency on a single A100?",
|
| 99 |
+
"How does the temporal branch behave on videos longer than the training distribution?"
|
| 100 |
+
],
|
| 101 |
+
"decision": "Borderline",
|
| 102 |
+
"overall_score": 64.5
|
| 103 |
+
}
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
## How the loop uses this output
|
| 107 |
+
|
| 108 |
+
The `score_delta.py` script reads two consecutive score JSONs and applies
|
| 109 |
+
the halt rules. The `apply_worklog.py` script appends a timestamped entry
|
| 110 |
+
to `workspace/refinement/worklog.json`. The Content Refinement Agent's
|
| 111 |
+
revision call takes the full `review.json` as `reviewer_feedback` input.
|
| 112 |
+
|
| 113 |
+
## Anti-inflation guardrails
|
| 114 |
+
|
| 115 |
+
To prevent the simulated reviewer from being gameable, the rubric has hard
|
| 116 |
+
caps drawn from the paper's Literature Review Quality autorater
|
| 117 |
+
(App. F.3 — see also `paper-autoraters/references/litreview-quality-prompt.md`):
|
| 118 |
+
|
| 119 |
+
| Axis | Hard cap |
|
| 120 |
+
|---|---|
|
| 121 |
+
| Scientific Depth | ≤60 if claims are unsupported by experiments |
|
| 122 |
+
| Technical Execution | ≤55 if methodology section omits key implementation details |
|
| 123 |
+
| Logical Flow | ≤60 if sections don't reference the figures/tables they need |
|
| 124 |
+
| Writing Clarity | ≤60 if repetitive phrasing or undefined acronyms |
|
| 125 |
+
| Evidence Presentation | ≤55 if any figure is unreferenced from the text |
|
| 126 |
+
| Academic Style | ≤55 if defensive language is present |
|
| 127 |
+
|
| 128 |
+
These caps are not part of the system prompt text above; append them to the rubric prompt to keep the reviewer honest.
|
| 129 |
+
The Content Refinement Agent's "never explicitly state a limitation" rule
|
| 130 |
+
combined with these caps closes the reward-hacking loop the paper observed
|
| 131 |
+
in early testing (App. F.1 p.51).
|
.scider/skills/content-refinement-agent/references/safe-revision-rules.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Safe Revision Rules
|
| 2 |
+
|
| 3 |
+
The Content Refinement Agent prompt (App. F.1 p.50–51) imposes two
|
| 4 |
+
anti-reward-hacking constraints. Both must be enforced not just by the
|
| 5 |
+
prompt but by deterministic post-revision gates, because LLMs occasionally
|
| 6 |
+
forget instructions buried in long prompts.
|
| 7 |
+
|
| 8 |
+
## Rule 1 — Ignore reviewer requests for new experiments
|
| 9 |
+
|
| 10 |
+
The simulated reviewer will sometimes ask:
|
| 11 |
+
|
| 12 |
+
- "What if you ablated the temperature parameter?"
|
| 13 |
+
- "How does this compare to baseline X?"
|
| 14 |
+
- "Have you tried this on dataset Y?"
|
| 15 |
+
|
| 16 |
+
The Refinement Agent must **not** fabricate answers to these. The paper:
|
| 17 |
+
|
| 18 |
+
> If the reviewer asks for new experiments, ablations, or baselines that
|
| 19 |
+
> are NOT in experimental_log.md, simply ignore those specific requests.
|
| 20 |
+
> Your job is purely presentation refinement of the existing completed
|
| 21 |
+
> experiments, not adding or promising to add new experiments.
|
| 22 |
+
|
| 23 |
+
### Enforcement
|
| 24 |
+
|
| 25 |
+
There is no fully deterministic way to grep for "fabricated experiments" —
|
| 26 |
+
it requires reading the new content and cross-checking against
|
| 27 |
+
`experimental_log.md`. The pragmatic check:
|
| 28 |
+
|
| 29 |
+
1. Run the orphan-citation gate from `section-writing-agent/scripts/orphan_cite_gate.py`.
|
| 30 |
+
New numeric claims often come bundled with new (orphan) citations.
|
| 31 |
+
2. Run a numeric-claim grep: extract every `\d+\.\d+%?` from the new draft,
|
| 32 |
+
then subtract the `\d+\.\d+%?` matches found in `experimental_log.md`. New numbers in the
|
| 33 |
+
draft that aren't in the log are suspicious. (False positives possible
|
| 34 |
+
for parameter counts and dates; review manually.)
|
| 35 |
+
|
| 36 |
+
The orchestrator should re-prompt the refinement step if either gate fires
|
| 37 |
+
with new fabricated claims.
|
| 38 |
+
|
| 39 |
+
## Rule 2 — Never explicitly state a limitation
|
| 40 |
+
|
| 41 |
+
The paper:
|
| 42 |
+
|
| 43 |
+
> The directive to "never explicitly state a limitation" prevents reward
|
| 44 |
+
> hacking. During early testing, the agent exploited the automated
|
| 45 |
+
> reviewer's scoring function by superficially listing missing baselines
|
| 46 |
+
> as limitations to artificially inflate acceptance scores.
|
| 47 |
+
|
| 48 |
+
### Enforcement (deterministic)
|
| 49 |
+
|
| 50 |
+
Grep the revised draft for the substring `limitation` (case-insensitive),
|
| 51 |
+
excluding LaTeX comments. If found anywhere in the body, reject the
|
| 52 |
+
revision and re-prompt:
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
# pseudocode — implement inline in the host agent
|
| 56 |
+
grep -in -E '\blimitation' workspace/refinement/iter<N>/paper.tex \
|
| 57 |
+
| grep -v -E '^[0-9]+:[[:space:]]*%'
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
Allowed contexts (these are NOT violations):
|
| 61 |
+
|
| 62 |
+
- LaTeX comments: `% address the limitation of ...`
|
| 63 |
+
- Citation context: a paper title containing "limitation" cited in
|
| 64 |
+
`\cite{...}`. The grep should ignore the inside of `\cite{...}` braces.
|
| 65 |
+
- Quoted prior-work descriptions: "Smith et al. acknowledge the
|
| 66 |
+
limitation..." — context-dependent. The simplest rule is "no instances
|
| 67 |
+
of the word 'limitation' in the running prose at all", and let the host
|
| 68 |
+
agent handle edge cases by re-prompting if a legitimate use is needed.
|
| 69 |
+
|
| 70 |
+
This is a strict rule. The Refinement Agent should rewrite "we acknowledge
|
| 71 |
+
the limitation that our method..." as "our method assumes..." or "the
|
| 72 |
+
proposed approach is most effective when...". Reframing, not listing.
|
| 73 |
+
|
| 74 |
+
## Rule 3 — Numeric ground truth
|
| 75 |
+
|
| 76 |
+
> All numerical claims (accuracy, parameter count, training hours,
|
| 77 |
+
> latency) MUST be verified against experimental_log.md.
|
| 78 |
+
|
| 79 |
+
The grep heuristic above catches this partially. The host agent should
|
| 80 |
+
also instruct the refinement step explicitly: "any numeric value you cite
|
| 81 |
+
in your revision must already exist in experimental_log.md or
|
| 82 |
+
metrics.json."
|
| 83 |
+
|
| 84 |
+
## Rule 4 — Citation integrity
|
| 85 |
+
|
| 86 |
+
The orphan-citation gate from
|
| 87 |
+
`section-writing-agent/scripts/orphan_cite_gate.py` must pass after every
|
| 88 |
+
refinement iteration. Re-run it as part of the post-revision checks:
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python skills/section-writing-agent/scripts/orphan_cite_gate.py \
|
| 92 |
+
workspace/refinement/iter<N>/paper.tex \
|
| 93 |
+
workspace/refs.bib
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
If the refinement step introduced a new `\cite{KEY}` not in `refs.bib`,
|
| 97 |
+
revert the iteration and re-prompt with an explicit instruction to use
|
| 98 |
+
only existing keys.
|
| 99 |
+
|
| 100 |
+
## Rule 5 — LaTeX integrity
|
| 101 |
+
|
| 102 |
+
Re-run `latex_sanity.py` and `latexmk -pdf` after every revision. If the
|
| 103 |
+
revision broke the build, revert.
|
| 104 |
+
|
| 105 |
+
## Summary checklist for each refinement iteration
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
# 1. apply revision → iter<N>/paper.tex
|
| 109 |
+
# 2. compile
|
| 110 |
+
cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
|
| 111 |
+
|
| 112 |
+
# 3. structural sanity
|
| 113 |
+
python skills/section-writing-agent/scripts/latex_sanity.py paper.tex || REVERT
|
| 114 |
+
python skills/section-writing-agent/scripts/orphan_cite_gate.py paper.tex ../../refs.bib || REVERT
|
| 115 |
+
|
| 116 |
+
# 4. anti-leakage
|
| 117 |
+
python skills/paper-orchestra/scripts/anti_leakage_check.py paper.tex || REVERT
|
| 118 |
+
|
| 119 |
+
# 5. limitation grep (Rule 2)
|
| 120 |
+
grep -in -E '\blimitation' paper.tex | grep -v -E '^[0-9]+:[[:space:]]*%' && REVERT
|
| 121 |
+
|
| 122 |
+
# 6. score and decide
|
| 123 |
+
python skills/content-refinement-agent/scripts/score_delta.py \
|
| 124 |
+
--prev ../iter<N-1>/score.json --curr score.json
|
| 125 |
+
# exit 0 → keep, exit 1/2 → revert, exit 4 → keep but halt (plateau)
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
If all gates pass and `score_delta.py` returns 0 (or 4, which also halts the loop), the iteration is
|
| 129 |
+
accepted.
|
.scider/skills/content-refinement-agent/scripts/apply_worklog.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
apply_worklog.py — Append a timestamped iteration entry to worklog.json.
|
| 4 |
+
|
| 5 |
+
The worklog is the canonical history of the refinement loop: every
|
| 6 |
+
iteration's review, score, decision, and actions taken. The orchestrator
|
| 7 |
+
reads it at the end to identify the best snapshot to promote.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python apply_worklog.py \\
|
| 11 |
+
--worklog workspace/refinement/worklog.json \\
|
| 12 |
+
--iter 2 \\
|
| 13 |
+
--review iter2/review.json \\
|
| 14 |
+
--score iter2/score.json \\
|
| 15 |
+
--decision ACCEPT_IMPROVED \\
|
| 16 |
+
--actions iter2/worklog_entry.json # the agent's emitted worklog block
|
| 17 |
+
|
| 18 |
+
The script creates worklog.json if it doesn't exist.
|
| 19 |
+
"""
|
| 20 |
+
import argparse
|
| 21 |
+
import datetime as dt
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import sys
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_json(path: str | None) -> dict | list | None:
|
| 28 |
+
if not path or not os.path.exists(path):
|
| 29 |
+
return None
|
| 30 |
+
with open(path) as f:
|
| 31 |
+
return json.load(f)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _overall_score(iteration: dict) -> float:
    """Return an iteration's ``overall_score`` as a float, or 0.0 when unusable."""
    score = iteration.get("score")
    value = score.get("overall_score", 0) if isinstance(score, dict) else 0
    try:
        return float(value)
    except (TypeError, ValueError):
        # null / non-numeric score: rank it lowest instead of crashing max().
        return 0.0


def main() -> int:
    """Append one iteration entry to the worklog and refresh ``best_iter``.

    Parses the command line, loads the worklog (or starts a new one), appends
    an entry holding the decision plus the optional review/score/actions JSON
    payloads, records ``halted_because`` when given, recomputes ``best_iter``
    as the accepted iteration with the highest ``overall_score``, and writes
    the worklog back to disk as UTF-8 JSON.

    Returns:
        0 on success; 1 when an existing worklog file cannot be read or does
        not have the expected shape.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--worklog", required=True, help="path to worklog.json")
    p.add_argument("--iter", type=int, required=True, help="iteration number (0-indexed)")
    p.add_argument("--review", help="path to review.json for this iteration")
    p.add_argument("--score", help="path to score.json for this iteration")
    p.add_argument(
        "--decision",
        required=True,
        help="ACCEPT_IMPROVED / ACCEPT_TIED_NON_NEGATIVE / "
        "REVERT_OVERALL_DECREASED / REVERT_TIED_NEGATIVE_SUBAXIS",
    )
    p.add_argument(
        "--actions",
        help="path to the agent's worklog block JSON "
        "(addressed_weaknesses, integrated_answers, actions_taken)",
    )
    p.add_argument("--halted-because", help="reason if this iteration triggers a halt")
    args = p.parse_args()

    if os.path.exists(args.worklog):
        try:
            # The file is written with ensure_ascii=False below, so it must be
            # read back as UTF-8 rather than the platform default encoding.
            with open(args.worklog, encoding="utf-8") as f:
                wl = json.load(f)
        except (OSError, ValueError) as e:  # ValueError covers JSON and Unicode decode errors
            print(f"ERROR: cannot read worklog {args.worklog}: {e}", file=sys.stderr)
            return 1
        if not isinstance(wl, dict):
            print(f"ERROR: {args.worklog} is not a JSON object", file=sys.stderr)
            return 1
    else:
        wl = {}

    # Fill in any missing top-level key so a partial worklog does not raise
    # KeyError further down; existing values are left untouched.
    wl.setdefault("iterations", [])
    wl.setdefault("halted_because", None)
    wl.setdefault("best_iter", None)
    if not isinstance(wl["iterations"], list):
        print(f"ERROR: 'iterations' in {args.worklog} is not a list", file=sys.stderr)
        return 1

    entry = {
        "iter": args.iter,
        "timestamp": dt.datetime.now(dt.timezone.utc).isoformat(),
        "decision": args.decision,
        "review": load_json(args.review),
        "score": load_json(args.score),
        "actions": load_json(args.actions),
    }
    wl["iterations"].append(entry)

    if args.halted_because:
        wl["halted_because"] = args.halted_because

    # Re-compute best_iter: highest accepted overall_score. Entries without a
    # score are skipped; on ties max() keeps the earliest iteration.
    accepted = [
        it
        for it in wl["iterations"]
        if isinstance(it, dict)
        and str(it.get("decision") or "").startswith("ACCEPT")
        and it.get("score")
    ]
    if accepted:
        best = max(accepted, key=_overall_score)
        wl["best_iter"] = best.get("iter")

    os.makedirs(os.path.dirname(os.path.abspath(args.worklog)) or ".", exist_ok=True)
    with open(args.worklog, "w", encoding="utf-8") as f:
        json.dump(wl, f, indent=2, ensure_ascii=False)

    print(f"OK: appended iter {args.iter} ({args.decision}) to {args.worklog}")
    if wl["best_iter"] is not None:
        print(f" current best_iter: {wl['best_iter']}")
    return 0
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
.scider/skills/content-refinement-agent/scripts/score_delta.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
score_delta.py — Apply the PaperOrchestra refinement halt rules from two
|
| 4 |
+
score JSONs.
|
| 5 |
+
|
| 6 |
+
Encodes the halt rules from arXiv:2604.05018 §4 Step 5:
|
| 7 |
+
|
| 8 |
+
- ACCEPT if curr.overall > prev.overall
|
| 9 |
+
- ACCEPT if curr.overall == prev.overall AND net sub-axis delta >= 0
|
| 10 |
+
- REVERT (overall_decreased) if curr.overall < prev.overall
|
| 11 |
+
- REVERT (tied_negative_subaxis) if curr.overall == prev.overall AND
|
| 12 |
+
net sub-axis delta < 0
|
| 13 |
+
|
| 14 |
+
Additionally encodes the plateau early-stop rule (not in the original paper
|
| 15 |
+
but added to match its cost budget of ~5-7 LLM calls):
|
| 16 |
+
|
| 17 |
+
- HALT_PLATEAU if the improvement is accepted but overall_delta is below
|
| 18 |
+
--plateau-threshold for --plateau-streak or more consecutive iterations.
|
| 19 |
+
Exit code 4. The loop should stop — further iterations are unlikely to
|
| 20 |
+
yield meaningful gains.
|
| 21 |
+
|
| 22 |
+
Exit codes:
|
| 23 |
+
0 ACCEPT (improved or tied non-negative, and no plateau)
|
| 24 |
+
1 REVERT (overall decreased)
|
| 25 |
+
2 REVERT (tied with negative sub-axis delta)
|
| 26 |
+
3 argument or input error
|
| 27 |
+
4 HALT_PLATEAU (accepted but diminishing returns detected)
|
| 28 |
+
|
| 29 |
+
Score JSON shape (see references/reviewer-rubric.md):
|
| 30 |
+
{
|
| 31 |
+
"axis_scores": {
|
| 32 |
+
"scientific_depth": {"score": 65, ...},
|
| 33 |
+
"technical_execution": {"score": 70, ...},
|
| 34 |
+
"logical_flow": {"score": 60, ...},
|
| 35 |
+
"writing_clarity": {"score": 55, ...},
|
| 36 |
+
"evidence_presentation":{"score": 72, ...},
|
| 37 |
+
"academic_style": {"score": 68, ...}
|
| 38 |
+
},
|
| 39 |
+
"overall_score": 64.5,
|
| 40 |
+
...
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
Usage:
|
| 44 |
+
python score_delta.py --prev iter0/score.json --curr iter1/score.json
|
| 45 |
+
python score_delta.py --prev iter2/score.json --curr iter3/score.json \\
|
| 46 |
+
--plateau-threshold 1.0 --plateau-streak 2 --consecutive-small 2
|
| 47 |
+
"""
|
| 48 |
+
import argparse
|
| 49 |
+
import json
|
| 50 |
+
import sys
|
| 51 |
+
|
| 52 |
+
# Rubric axes whose per-axis "score" values are compared between iterations.
AXES = [
    "scientific_depth", "technical_execution", "logical_flow",
    "writing_clarity", "evidence_presentation", "academic_style",
]

# Plateau early-stop defaults (overridable on the command line).
# An accepted iteration whose overall gain is below this many points is "small".
DEFAULT_PLATEAU_THRESHOLD = 1.0
# This many consecutive small iterations trigger HALT_PLATEAU.
DEFAULT_PLATEAU_STREAK = 3
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def load(path: str) -> dict:
    """Read and decode the score JSON stored at ``path``.

    Args:
        path: Location of a score JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII text in the score file reads the same on every OS.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _extract_scores(doc: object, label: str) -> tuple[float, dict[str, float]]:
    """Pull ``overall_score`` and the per-axis scores out of one score document.

    Missing keys default to 0; values of the wrong shape or type raise
    ``ValueError`` so the caller can report an input error.
    """
    if not isinstance(doc, dict):
        raise ValueError(f"{label} score JSON must be an object, got {type(doc).__name__}")
    try:
        overall = float(doc.get("overall_score", 0))
    except (TypeError, ValueError):
        raise ValueError(
            f"{label} overall_score is not numeric: {doc.get('overall_score')!r}"
        ) from None

    axes = doc.get("axis_scores") or {}
    if not isinstance(axes, dict):
        raise ValueError(f"{label} axis_scores must be an object")
    per_axis: dict[str, float] = {}
    for ax in AXES:
        axis_entry = axes.get(ax) or {}
        if not isinstance(axis_entry, dict):
            raise ValueError(f"{label} axis_scores.{ax} must be an object with a 'score'")
        try:
            per_axis[ax] = float(axis_entry.get("score", 0))
        except (TypeError, ValueError):
            raise ValueError(
                f"{label} axis_scores.{ax}.score is not numeric: {axis_entry.get('score')!r}"
            ) from None
    return overall, per_axis


def main() -> int:
    """Compare two score JSONs and print the accept/revert/halt decision.

    Loads the ``--prev`` and ``--curr`` score files, computes the overall
    delta and the per-axis deltas over ``AXES``, applies the accept/revert
    rules, then the plateau early-stop rule, and prints the result as JSON.

    Returns:
        The process exit code: 0 accept, 1 revert (overall decreased),
        2 revert (tied with negative net sub-axis delta), 3 unreadable or
        malformed input, 4 accepted but plateau reached.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--prev", required=True, help="Score JSON from previous accepted iteration")
    p.add_argument("--curr", required=True, help="Score JSON from just-completed iteration")
    p.add_argument(
        "--plateau-threshold",
        type=float,
        default=DEFAULT_PLATEAU_THRESHOLD,
        metavar="POINTS",
        help=f"Minimum overall_delta to not count as a 'small' improvement "
        f"(default: {DEFAULT_PLATEAU_THRESHOLD})",
    )
    p.add_argument(
        "--plateau-streak",
        type=int,
        default=DEFAULT_PLATEAU_STREAK,
        metavar="N",
        help=f"Number of consecutive small improvements before HALT_PLATEAU "
        f"(default: {DEFAULT_PLATEAU_STREAK})",
    )
    p.add_argument(
        "--consecutive-small",
        type=int,
        default=0,
        metavar="N",
        help="Number of consecutive small-delta accepted iterations so far "
        "(maintained by the calling loop; default: 0)",
    )
    args = p.parse_args()

    try:
        prev = load(args.prev)
        curr = load(args.curr)
    except (OSError, ValueError) as e:  # ValueError covers JSON and Unicode decode errors
        print(f"ERROR: failed to load score JSONs: {e}", file=sys.stderr)
        return 3

    # A structurally wrong score file is an input error (exit 3), not a crash:
    # callers branch on the exit code, and a traceback would exit with 1,
    # which they would misread as "REVERT (overall decreased)".
    try:
        p_overall, p_scores = _extract_scores(prev, "prev")
        c_overall, c_scores = _extract_scores(curr, "curr")
    except ValueError as e:
        print(f"ERROR: malformed score JSON: {e}", file=sys.stderr)
        return 3

    overall_delta = c_overall - p_overall
    deltas: dict[str, float] = {ax: c_scores[ax] - p_scores[ax] for ax in AXES}
    net_subaxis = sum(deltas.values())

    # --- Primary accept/revert decision ---
    if c_overall > p_overall:
        decision = "ACCEPT_IMPROVED"
        exit_code = 0
    elif c_overall == p_overall:
        if net_subaxis >= 0:
            decision = "ACCEPT_TIED_NON_NEGATIVE"
            exit_code = 0
        else:
            decision = "REVERT_TIED_NEGATIVE_SUBAXIS"
            exit_code = 2
    else:
        decision = "REVERT_OVERALL_DECREASED"
        exit_code = 1

    # --- Plateau early-stop (only applies to accepted iterations) ---
    # NOTE(review): the counter below is also advanced for reverted iterations
    # (a negative delta is always below a positive threshold); only the halt
    # itself is gated on acceptance. Confirm the calling loop ignores
    # "consecutive_small" when the decision is a REVERT.
    is_small_delta = overall_delta < args.plateau_threshold
    new_consecutive_small = (args.consecutive_small + 1) if is_small_delta else 0
    plateau_triggered = False

    if exit_code == 0 and new_consecutive_small >= args.plateau_streak:
        decision = "HALT_PLATEAU"
        exit_code = 4
        plateau_triggered = True

    out = {
        "decision": decision,
        "exit_code": exit_code,
        "overall_prev": p_overall,
        "overall_curr": c_overall,
        "overall_delta": overall_delta,
        "subaxis_deltas": deltas,
        "net_subaxis": net_subaxis,
        "is_small_delta": is_small_delta,
        "consecutive_small": new_consecutive_small,
        "plateau_threshold": args.plateau_threshold,
        "plateau_streak": args.plateau_streak,
        "plateau_triggered": plateau_triggered,
    }
    print(json.dumps(out, indent=2))
    return exit_code
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
    # main()'s return value (0-4) becomes the process exit status.
    raise SystemExit(main())
|
.scider/skills/content-refinement-agent/scripts/snapshot.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
snapshot.py — Copy a paper.tex (and optionally paper.pdf) into a refinement
|
| 4 |
+
iteration directory, so reverts are real, not symbolic.
|
| 5 |
+
|
| 6 |
+
The PaperOrchestra refinement halt rules require the loop to roll back to
|
| 7 |
+
the previous iteration on overall-score decrease or tied negative sub-axis
|
| 8 |
+
delta. To do that physically, every iteration's draft must be preserved.
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python snapshot.py --src paper.tex --dst iter2/
|
| 12 |
+
python snapshot.py --src paper.tex --src-pdf paper.pdf --dst iter2/
|
| 13 |
+
"""
|
| 14 |
+
import argparse
|
| 15 |
+
import os
|
| 16 |
+
import shutil
|
| 17 |
+
import sys
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> int:
    """Copy ``paper.tex`` (and optionally ``paper.pdf``) into an iteration directory.

    The destination directory is created when missing. The files are always
    stored under the fixed names ``paper.tex`` / ``paper.pdf`` regardless of
    the source file names.

    Returns:
        0 on success (a missing optional PDF only produces a warning);
        1 when the source .tex file is missing or a copy fails.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--src", required=True, help="source paper.tex path")
    p.add_argument("--src-pdf", help="optional source paper.pdf path")
    p.add_argument("--dst", required=True, help="destination iteration directory")
    args = p.parse_args()

    if not os.path.isfile(args.src):
        print(f"ERROR: {args.src} not found", file=sys.stderr)
        return 1

    dst_tex = os.path.join(args.dst, "paper.tex")
    try:
        os.makedirs(args.dst, exist_ok=True)
        shutil.copy2(args.src, dst_tex)
    except OSError as e:
        # Covers --dst naming an existing file, source == destination
        # (shutil.SameFileError), permission problems, full disk, ...
        print(f"ERROR: could not snapshot {args.src} → {dst_tex}: {e}", file=sys.stderr)
        return 1
    print(f"OK: snapshot {args.src} → {dst_tex}")

    if args.src_pdf:
        if not os.path.isfile(args.src_pdf):
            print(f"WARN: {args.src_pdf} not found, skipping PDF snapshot", file=sys.stderr)
        else:
            dst_pdf = os.path.join(args.dst, "paper.pdf")
            try:
                shutil.copy2(args.src_pdf, dst_pdf)
            except OSError as e:
                print(
                    f"ERROR: could not snapshot {args.src_pdf} → {dst_pdf}: {e}",
                    file=sys.stderr,
                )
                return 1
            print(f"OK: snapshot {args.src_pdf} → {dst_pdf}")
    return 0
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
    # Exit with main()'s status: 0 on success, 1 on failure.
    raise SystemExit(main())
|
.scider/skills/exploratory-data-analysis/SKILL.md
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: exploratory-data-analysis
|
| 3 |
+
description: Comprehensive EDA on scientific data files — structure, content, quality, and characteristics analysis across 200+ formats. Use when analyzing any data file to understand its structure, quality, and downstream analysis recommendations.
|
| 4 |
+
allowed_agents: [data]
|
| 5 |
+
preload_for: [data]
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Exploratory Data Analysis
|
| 9 |
+
|
| 10 |
+
## Overview
|
| 11 |
+
|
| 12 |
+
Perform comprehensive exploratory data analysis (EDA) on scientific data files across multiple domains. This skill provides automated file type detection, format-specific analysis, data quality assessment, and generates detailed markdown reports suitable for documentation and downstream analysis planning.
|
| 13 |
+
|
| 14 |
+
**Key Capabilities:**
|
| 15 |
+
- Automatic detection and analysis of 200+ scientific file formats
|
| 16 |
+
- Comprehensive format-specific metadata extraction
|
| 17 |
+
- Data quality and integrity assessment
|
| 18 |
+
- Statistical summaries and distributions
|
| 19 |
+
- Visualization recommendations
|
| 20 |
+
- Downstream analysis suggestions
|
| 21 |
+
- Markdown report generation
|
| 22 |
+
|
| 23 |
+
## When to Use This Skill
|
| 24 |
+
|
| 25 |
+
Use this skill when:
|
| 26 |
+
- User provides a path to a scientific data file for analysis
|
| 27 |
+
- User asks to "explore", "analyze", or "summarize" a data file
|
| 28 |
+
- User wants to understand the structure and content of scientific data
|
| 29 |
+
- User needs a comprehensive report of a dataset before analysis
|
| 30 |
+
- User wants to assess data quality or completeness
|
| 31 |
+
- User asks what type of analysis is appropriate for a file
|
| 32 |
+
|
| 33 |
+
## Supported File Categories
|
| 34 |
+
|
| 35 |
+
The skill has comprehensive coverage of scientific file formats organized into six major categories:
|
| 36 |
+
|
| 37 |
+
### 1. Chemistry and Molecular Formats (60+ extensions)
|
| 38 |
+
Structure files, computational chemistry outputs, molecular dynamics trajectories, and chemical databases.
|
| 39 |
+
|
| 40 |
+
**File types include:** `.pdb`, `.cif`, `.mol`, `.mol2`, `.sdf`, `.xyz`, `.smi`, `.gro`, `.log`, `.fchk`, `.cube`, `.dcd`, `.xtc`, `.trr`, `.prmtop`, `.psf`, and more.
|
| 41 |
+
|
| 42 |
+
**Reference file:** `references/chemistry_molecular_formats.md`
|
| 43 |
+
|
| 44 |
+
### 2. Bioinformatics and Genomics Formats (50+ extensions)
|
| 45 |
+
Sequence data, alignments, annotations, variants, and expression data.
|
| 46 |
+
|
| 47 |
+
**File types include:** `.fasta`, `.fastq`, `.sam`, `.bam`, `.vcf`, `.bed`, `.gff`, `.gtf`, `.bigwig`, `.h5ad`, `.loom`, `.counts`, `.mtx`, and more.
|
| 48 |
+
|
| 49 |
+
**Reference file:** `references/bioinformatics_genomics_formats.md`
|
| 50 |
+
|
| 51 |
+
### 3. Microscopy and Imaging Formats (45+ extensions)
|
| 52 |
+
Microscopy images, medical imaging, whole slide imaging, and electron microscopy.
|
| 53 |
+
|
| 54 |
+
**File types include:** `.tif`, `.nd2`, `.lif`, `.czi`, `.ims`, `.dcm`, `.nii`, `.mrc`, `.dm3`, `.vsi`, `.svs`, `.ome.tiff`, and more.
|
| 55 |
+
|
| 56 |
+
**Reference file:** `references/microscopy_imaging_formats.md`
|
| 57 |
+
|
| 58 |
+
### 4. Spectroscopy and Analytical Chemistry Formats (35+ extensions)
|
| 59 |
+
NMR, mass spectrometry, IR/Raman, UV-Vis, X-ray, chromatography, and other analytical techniques.
|
| 60 |
+
|
| 61 |
+
**File types include:** `.fid`, `.mzML`, `.mzXML`, `.raw`, `.mgf`, `.spc`, `.jdx`, `.xy`, `.cif` (crystallography), `.wdf`, and more.
|
| 62 |
+
|
| 63 |
+
**Reference file:** `references/spectroscopy_analytical_formats.md`
|
| 64 |
+
|
| 65 |
+
### 5. Proteomics and Metabolomics Formats (30+ extensions)
|
| 66 |
+
Mass spec proteomics, metabolomics, lipidomics, and multi-omics data.
|
| 67 |
+
|
| 68 |
+
**File types include:** `.mzML`, `.pepXML`, `.protXML`, `.mzid`, `.mzTab`, `.sky`, `.mgf`, `.msp`, `.h5ad`, and more.
|
| 69 |
+
|
| 70 |
+
**Reference file:** `references/proteomics_metabolomics_formats.md`
|
| 71 |
+
|
| 72 |
+
### 6. General Scientific Data Formats (30+ extensions)
|
| 73 |
+
Arrays, tables, hierarchical data, compressed archives, and common scientific formats.
|
| 74 |
+
|
| 75 |
+
**File types include:** `.npy`, `.npz`, `.csv`, `.xlsx`, `.json`, `.hdf5`, `.zarr`, `.parquet`, `.mat`, `.fits`, `.nc`, `.xml`, and more.
|
| 76 |
+
|
| 77 |
+
**Reference file:** `references/general_scientific_formats.md`
|
| 78 |
+
|
| 79 |
+
## Workflow
|
| 80 |
+
|
| 81 |
+
### Step 1: File Type Detection
|
| 82 |
+
|
| 83 |
+
When a user provides a file path, first identify the file type:
|
| 84 |
+
|
| 85 |
+
1. Extract the file extension
|
| 86 |
+
2. Look up the extension in the appropriate reference file
|
| 87 |
+
3. Identify the file category and format description
|
| 88 |
+
4. Load format-specific information
|
| 89 |
+
|
| 90 |
+
**Example:**
|
| 91 |
+
```
|
| 92 |
+
User: "Analyze data.fastq"
|
| 93 |
+
→ Extension: .fastq
|
| 94 |
+
→ Category: bioinformatics_genomics
|
| 95 |
+
→ Format: FASTQ Format (sequence data with quality scores)
|
| 96 |
+
→ Reference: references/bioinformatics_genomics_formats.md
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### Step 2: Load Format-Specific Information
|
| 100 |
+
|
| 101 |
+
Based on the file type, read the corresponding reference file to understand:
|
| 102 |
+
- **Typical Data:** What kind of data this format contains
|
| 103 |
+
- **Use Cases:** Common applications for this format
|
| 104 |
+
- **Python Libraries:** How to read the file in Python
|
| 105 |
+
- **EDA Approach:** What analyses are appropriate for this data type
|
| 106 |
+
|
| 107 |
+
Search the reference file for the specific extension (e.g., search for "### .fastq" in `bioinformatics_genomics_formats.md`).
|
| 108 |
+
|
| 109 |
+
### Step 3: Perform Data Analysis
|
| 110 |
+
|
| 111 |
+
Use the `scripts/eda_analyzer.py` script OR implement custom analysis:
|
| 112 |
+
|
| 113 |
+
**Option A: Use the analyzer script**
|
| 114 |
+
```bash
|
| 115 |
+
# The script automatically:
|
| 116 |
+
# 1. Detects file type
|
| 117 |
+
# 2. Loads reference information
|
| 118 |
+
# 3. Performs format-specific analysis
|
| 119 |
+
# 4. Generates markdown report
|
| 120 |
+
|
| 121 |
+
python scripts/eda_analyzer.py <filepath> [output.md]
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
**Option B: Custom analysis in the conversation**
|
| 125 |
+
Based on the format information from the reference file, perform appropriate analysis:
|
| 126 |
+
|
| 127 |
+
For tabular data (CSV, TSV, Excel):
|
| 128 |
+
- Load with pandas
|
| 129 |
+
- Check dimensions, data types
|
| 130 |
+
- Analyze missing values
|
| 131 |
+
- Calculate summary statistics
|
| 132 |
+
- Identify outliers
|
| 133 |
+
- Check for duplicates
|
| 134 |
+
|
| 135 |
+
For sequence data (FASTA, FASTQ):
|
| 136 |
+
- Count sequences
|
| 137 |
+
- Analyze length distributions
|
| 138 |
+
- Calculate GC content
|
| 139 |
+
- Assess quality scores (FASTQ)
|
| 140 |
+
|
| 141 |
+
For images (TIFF, ND2, CZI):
|
| 142 |
+
- Check dimensions (X, Y, Z, C, T)
|
| 143 |
+
- Analyze bit depth and value range
|
| 144 |
+
- Extract metadata (channels, timestamps, spatial calibration)
|
| 145 |
+
- Calculate intensity statistics
|
| 146 |
+
|
| 147 |
+
For arrays (NPY, HDF5):
|
| 148 |
+
- Check shape and dimensions
|
| 149 |
+
- Analyze data type
|
| 150 |
+
- Calculate statistical summaries
|
| 151 |
+
- Check for missing/invalid values
|
| 152 |
+
|
| 153 |
+
### Step 4: Generate Comprehensive Report
|
| 154 |
+
|
| 155 |
+
Create a markdown report with the following sections:
|
| 156 |
+
|
| 157 |
+
#### Required Sections:
|
| 158 |
+
1. **Title and Metadata**
|
| 159 |
+
- Filename and timestamp
|
| 160 |
+
- File size and location
|
| 161 |
+
|
| 162 |
+
2. **Basic Information**
|
| 163 |
+
- File properties
|
| 164 |
+
- Format identification
|
| 165 |
+
|
| 166 |
+
3. **File Type Details**
|
| 167 |
+
- Format description from reference
|
| 168 |
+
- Typical data content
|
| 169 |
+
- Common use cases
|
| 170 |
+
- Python libraries for reading
|
| 171 |
+
|
| 172 |
+
4. **Data Analysis**
|
| 173 |
+
- Structure and dimensions
|
| 174 |
+
- Statistical summaries
|
| 175 |
+
- Quality assessment
|
| 176 |
+
- Data characteristics
|
| 177 |
+
|
| 178 |
+
5. **Key Findings**
|
| 179 |
+
- Notable patterns
|
| 180 |
+
- Potential issues
|
| 181 |
+
- Quality metrics
|
| 182 |
+
|
| 183 |
+
6. **Recommendations**
|
| 184 |
+
- Preprocessing steps
|
| 185 |
+
- Appropriate analyses
|
| 186 |
+
- Tools and methods
|
| 187 |
+
- Visualization approaches
|
| 188 |
+
|
| 189 |
+
#### Template Location
|
| 190 |
+
Use `assets/report_template.md` as a guide for report structure.
|
| 191 |
+
|
| 192 |
+
### Step 5: Save Report
|
| 193 |
+
|
| 194 |
+
Save the markdown report with a descriptive filename:
|
| 195 |
+
- Pattern: `{original_filename_without_extension}_eda_report.md`
|
| 196 |
+
- Example: `experiment_data.fastq` → `experiment_data_eda_report.md`
|
| 197 |
+
|
| 198 |
+
## Detailed Format References
|
| 199 |
+
|
| 200 |
+
Each reference file contains comprehensive information for dozens of file types. To find information about a specific format:
|
| 201 |
+
|
| 202 |
+
1. Identify the category from the extension
|
| 203 |
+
2. Read the appropriate reference file
|
| 204 |
+
3. Search for the section heading matching the extension (e.g., "### .pdb")
|
| 205 |
+
4. Extract the format information
|
| 206 |
+
|
| 207 |
+
### Reference File Structure
|
| 208 |
+
|
| 209 |
+
Each format entry includes:
|
| 210 |
+
- **Description:** What the format is
|
| 211 |
+
- **Typical Data:** What it contains
|
| 212 |
+
- **Use Cases:** Common applications
|
| 213 |
+
- **Python Libraries:** How to read it (with code examples)
|
| 214 |
+
- **EDA Approach:** Specific analyses to perform
|
| 215 |
+
|
| 216 |
+
**Example lookup:**
|
| 217 |
+
```markdown
|
| 218 |
+
### .pdb - Protein Data Bank
|
| 219 |
+
**Description:** Standard format for 3D structures of biological macromolecules
|
| 220 |
+
**Typical Data:** Atomic coordinates, residue information, secondary structure
|
| 221 |
+
**Use Cases:** Protein structure analysis, molecular visualization, docking
|
| 222 |
+
**Python Libraries:**
|
| 223 |
+
- `Biopython`: `Bio.PDB`
|
| 224 |
+
- `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
|
| 225 |
+
**EDA Approach:**
|
| 226 |
+
- Structure validation (bond lengths, angles)
|
| 227 |
+
- B-factor distribution
|
| 228 |
+
- Missing residues detection
|
| 229 |
+
- Ramachandran plots
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
## Best Practices
|
| 233 |
+
|
| 234 |
+
### Reading Reference Files
|
| 235 |
+
|
| 236 |
+
Reference files are large (10,000+ words each). To efficiently use them:
|
| 237 |
+
|
| 238 |
+
1. **Search by extension:** Use a regex search (or `grep`) to find the specific format
|
| 239 |
+
```python
|
| 240 |
+
import re
|
| 241 |
+
with open('references/chemistry_molecular_formats.md', 'r') as f:
|
| 242 |
+
content = f.read()
|
| 243 |
+
pattern = r'### \.pdb[^#]*?(?=###|\Z)'
|
| 244 |
+
match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
2. **Extract relevant sections:** Don't load entire reference files into context unnecessarily
|
| 248 |
+
|
| 249 |
+
3. **Cache format info:** If analyzing multiple files of the same type, reuse the format information
|
| 250 |
+
|
| 251 |
+
### Data Analysis
|
| 252 |
+
|
| 253 |
+
1. **Sample large files:** For files with millions of records, analyze a representative sample
|
| 254 |
+
2. **Handle errors gracefully:** Many scientific formats require specific libraries; provide clear installation instructions
|
| 255 |
+
3. **Validate metadata:** Cross-check metadata consistency (e.g., stated dimensions vs actual data)
|
| 256 |
+
4. **Consider data provenance:** Note instrument, software versions, processing steps
|
| 257 |
+
|
| 258 |
+
### Report Generation
|
| 259 |
+
|
| 260 |
+
1. **Be comprehensive:** Include all relevant information for downstream analysis
|
| 261 |
+
2. **Be specific:** Provide concrete recommendations based on the file type
|
| 262 |
+
3. **Be actionable:** Suggest specific next steps and tools
|
| 263 |
+
4. **Include code examples:** Show how to load and work with the data
|
| 264 |
+
|
| 265 |
+
## Examples
|
| 266 |
+
|
| 267 |
+
### Example 1: Analyzing a FASTQ file
|
| 268 |
+
|
| 269 |
+
```python
|
| 270 |
+
# User provides: "Analyze reads.fastq"
|
| 271 |
+
|
| 272 |
+
# 1. Detect file type
|
| 273 |
+
extension = '.fastq'
|
| 274 |
+
category = 'bioinformatics_genomics'
|
| 275 |
+
|
| 276 |
+
# 2. Read reference info
|
| 277 |
+
# Search references/bioinformatics_genomics_formats.md for "### .fastq"
|
| 278 |
+
|
| 279 |
+
# 3. Perform analysis
|
| 280 |
+
from Bio import SeqIO
|
| 281 |
+
sequences = list(SeqIO.parse('reads.fastq', 'fastq'))
|
| 282 |
+
# Calculate: read count, length distribution, quality scores, GC content
|
| 283 |
+
|
| 284 |
+
# 4. Generate report
|
| 285 |
+
# Include: format description, analysis results, QC recommendations
|
| 286 |
+
|
| 287 |
+
# 5. Save as: reads_eda_report.md
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
### Example 2: Analyzing a CSV dataset
|
| 291 |
+
|
| 292 |
+
```python
|
| 293 |
+
# User provides: "Explore experiment_results.csv"
|
| 294 |
+
|
| 295 |
+
# 1. Detect: .csv → general_scientific
|
| 296 |
+
|
| 297 |
+
# 2. Load reference for CSV format
|
| 298 |
+
|
| 299 |
+
# 3. Analyze
|
| 300 |
+
import pandas as pd
|
| 301 |
+
df = pd.read_csv('experiment_results.csv')
|
| 302 |
+
# Dimensions, dtypes, missing values, statistics, correlations
|
| 303 |
+
|
| 304 |
+
# 4. Generate report with:
|
| 305 |
+
# - Data structure
|
| 306 |
+
# - Missing value patterns
|
| 307 |
+
# - Statistical summaries
|
| 308 |
+
# - Correlation matrix
|
| 309 |
+
# - Outlier detection results
|
| 310 |
+
|
| 311 |
+
# 5. Save report
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
### Example 3: Analyzing microscopy data
|
| 315 |
+
|
| 316 |
+
```python
|
| 317 |
+
# User provides: "Analyze cells.nd2"
|
| 318 |
+
|
| 319 |
+
# 1. Detect: .nd2 → microscopy_imaging (Nikon format)
|
| 320 |
+
|
| 321 |
+
# 2. Read reference for ND2 format
|
| 322 |
+
# Learn: multi-dimensional (XYZCT), requires nd2reader
|
| 323 |
+
|
| 324 |
+
# 3. Analyze
|
| 325 |
+
from nd2reader import ND2Reader
|
| 326 |
+
with ND2Reader('cells.nd2') as images:
|
| 327 |
+
# Extract: dimensions, channels, timepoints, metadata
|
| 328 |
+
# Calculate: intensity statistics, frame info
|
| 329 |
+
|
| 330 |
+
# 4. Generate report with:
|
| 331 |
+
# - Image dimensions (XY, Z-stacks, time, channels)
|
| 332 |
+
# - Channel wavelengths
|
| 333 |
+
# - Pixel size and calibration
|
| 334 |
+
# - Recommendations for image analysis
|
| 335 |
+
|
| 336 |
+
# 5. Save report
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
## Troubleshooting
|
| 340 |
+
|
| 341 |
+
### Missing Libraries
|
| 342 |
+
|
| 343 |
+
Many scientific formats require specialized libraries:
|
| 344 |
+
|
| 345 |
+
**Problem:** Import error when trying to read a file
|
| 346 |
+
|
| 347 |
+
**Solution:** Provide clear installation instructions
|
| 348 |
+
```python
|
| 349 |
+
try:
|
| 350 |
+
    from Bio import SeqIO
|
| 351 |
+
except ImportError:
|
| 352 |
+
    print("Install Biopython: uv pip install biopython")
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
Common requirements by category:
|
| 356 |
+
- **Bioinformatics:** `biopython`, `pysam`, `pyBigWig`
|
| 357 |
+
- **Chemistry:** `rdkit`, `mdanalysis`, `cclib`
|
| 358 |
+
- **Microscopy:** `tifffile`, `nd2reader`, `aicsimageio`, `pydicom`
|
| 359 |
+
- **Spectroscopy:** `nmrglue`, `pymzml`, `pyteomics`
|
| 360 |
+
- **General:** `pandas`, `numpy`, `h5py`, `scipy`
|
| 361 |
+
|
| 362 |
+
### Unknown File Types
|
| 363 |
+
|
| 364 |
+
If a file extension is not in the references:
|
| 365 |
+
|
| 366 |
+
1. Ask the user about the file format
|
| 367 |
+
2. Check if it's a vendor-specific variant
|
| 368 |
+
3. Attempt generic analysis based on file structure (text vs binary)
|
| 369 |
+
4. Provide general recommendations
|
| 370 |
+
|
| 371 |
+
### Large Files
|
| 372 |
+
|
| 373 |
+
For very large files:
|
| 374 |
+
|
| 375 |
+
1. Use sampling strategies (first N records)
|
| 376 |
+
2. Use memory-mapped or lazy on-disk access (memory mapping for NPY, sliced dataset reads for HDF5)
|
| 377 |
+
3. Process in chunks (for CSV, FASTQ)
|
| 378 |
+
4. Provide estimates based on samples
|
| 379 |
+
|
| 380 |
+
## Script Usage
|
| 381 |
+
|
| 382 |
+
The `scripts/eda_analyzer.py` can be used directly:
|
| 383 |
+
|
| 384 |
+
```bash
|
| 385 |
+
# Basic usage
|
| 386 |
+
python scripts/eda_analyzer.py data.csv
|
| 387 |
+
|
| 388 |
+
# Specify output file
|
| 389 |
+
python scripts/eda_analyzer.py data.csv output_report.md
|
| 390 |
+
|
| 391 |
+
# The script will:
|
| 392 |
+
# 1. Auto-detect file type
|
| 393 |
+
# 2. Load format references
|
| 394 |
+
# 3. Perform appropriate analysis
|
| 395 |
+
# 4. Generate markdown report
|
| 396 |
+
```
|
| 397 |
+
|
| 398 |
+
The script supports automatic analysis for many common formats, but custom analysis in the conversation provides more flexibility and domain-specific insights.
|
| 399 |
+
|
| 400 |
+
## Advanced Usage
|
| 401 |
+
|
| 402 |
+
### Multi-File Analysis
|
| 403 |
+
|
| 404 |
+
When analyzing multiple related files:
|
| 405 |
+
1. Perform individual EDA on each file
|
| 406 |
+
2. Create a summary comparison report
|
| 407 |
+
3. Identify relationships and dependencies
|
| 408 |
+
4. Suggest integration strategies
|
| 409 |
+
|
| 410 |
+
### Quality Control
|
| 411 |
+
|
| 412 |
+
For data quality assessment:
|
| 413 |
+
1. Check format compliance
|
| 414 |
+
2. Validate metadata consistency
|
| 415 |
+
3. Assess completeness
|
| 416 |
+
4. Identify outliers and anomalies
|
| 417 |
+
5. Compare to expected ranges/distributions
|
| 418 |
+
|
| 419 |
+
### Preprocessing Recommendations
|
| 420 |
+
|
| 421 |
+
Based on data characteristics, recommend:
|
| 422 |
+
1. Normalization strategies
|
| 423 |
+
2. Missing value imputation
|
| 424 |
+
3. Outlier handling
|
| 425 |
+
4. Batch correction
|
| 426 |
+
5. Format conversions
|
| 427 |
+
|
| 428 |
+
## Resources
|
| 429 |
+
|
| 430 |
+
### scripts/
|
| 431 |
+
- `eda_analyzer.py`: Comprehensive analysis script that can be run directly or imported
|
| 432 |
+
|
| 433 |
+
### references/
|
| 434 |
+
- `chemistry_molecular_formats.md`: 60+ chemistry/molecular file formats
|
| 435 |
+
- `bioinformatics_genomics_formats.md`: 50+ bioinformatics formats
|
| 436 |
+
- `microscopy_imaging_formats.md`: 45+ imaging formats
|
| 437 |
+
- `spectroscopy_analytical_formats.md`: 35+ spectroscopy formats
|
| 438 |
+
- `proteomics_metabolomics_formats.md`: 30+ omics formats
|
| 439 |
+
- `general_scientific_formats.md`: 30+ general formats
|
| 440 |
+
|
| 441 |
+
### assets/
|
| 442 |
+
- `report_template.md`: Comprehensive markdown template for EDA reports
|
.scider/skills/exploratory-data-analysis/assets/report_template.md
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Exploratory Data Analysis Report: {FILENAME}
|
| 2 |
+
|
| 3 |
+
**Generated:** {TIMESTAMP}
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Executive Summary
|
| 8 |
+
|
| 9 |
+
This report provides a comprehensive exploratory data analysis of the file `{FILENAME}`. The analysis includes file type identification, format-specific metadata extraction, data quality assessment, and recommendations for downstream analysis.
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Basic Information
|
| 14 |
+
|
| 15 |
+
- **Filename:** `{FILENAME}`
|
| 16 |
+
- **Full Path:** `{FILEPATH}`
|
| 17 |
+
- **File Size:** {FILE_SIZE_HUMAN} ({FILE_SIZE_BYTES} bytes)
|
| 18 |
+
- **Last Modified:** {MODIFIED_DATE}
|
| 19 |
+
- **Extension:** `.{EXTENSION}`
|
| 20 |
+
- **Format Category:** {CATEGORY}
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## File Type Details
|
| 25 |
+
|
| 26 |
+
### Format Description
|
| 27 |
+
{FORMAT_DESCRIPTION}
|
| 28 |
+
|
| 29 |
+
### Typical Data Content
|
| 30 |
+
{TYPICAL_DATA}
|
| 31 |
+
|
| 32 |
+
### Common Use Cases
|
| 33 |
+
{USE_CASES}
|
| 34 |
+
|
| 35 |
+
### Python Libraries for Reading
|
| 36 |
+
{PYTHON_LIBRARIES}
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## Data Structure Analysis
|
| 41 |
+
|
| 42 |
+
### Overview
|
| 43 |
+
{DATA_STRUCTURE_OVERVIEW}
|
| 44 |
+
|
| 45 |
+
### Dimensions
|
| 46 |
+
{DIMENSIONS}
|
| 47 |
+
|
| 48 |
+
### Data Types
|
| 49 |
+
{DATA_TYPES}
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## Quality Assessment
|
| 54 |
+
|
| 55 |
+
### Completeness
|
| 56 |
+
- **Missing Values:** {MISSING_VALUES}
|
| 57 |
+
- **Data Coverage:** {COVERAGE}
|
| 58 |
+
|
| 59 |
+
### Validity
|
| 60 |
+
- **Range Check:** {RANGE_CHECK}
|
| 61 |
+
- **Format Compliance:** {FORMAT_COMPLIANCE}
|
| 62 |
+
- **Consistency:** {CONSISTENCY}
|
| 63 |
+
|
| 64 |
+
### Integrity
|
| 65 |
+
- **Checksum/Validation:** {VALIDATION}
|
| 66 |
+
- **File Corruption Check:** {CORRUPTION_CHECK}
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Statistical Summary
|
| 71 |
+
|
| 72 |
+
### Numerical Variables
|
| 73 |
+
{NUMERICAL_STATS}
|
| 74 |
+
|
| 75 |
+
### Categorical Variables
|
| 76 |
+
{CATEGORICAL_STATS}
|
| 77 |
+
|
| 78 |
+
### Distributions
|
| 79 |
+
{DISTRIBUTIONS}
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
## Data Characteristics
|
| 84 |
+
|
| 85 |
+
### Temporal Properties (if applicable)
|
| 86 |
+
- **Time Range:** {TIME_RANGE}
|
| 87 |
+
- **Sampling Rate:** {SAMPLING_RATE}
|
| 88 |
+
- **Missing Time Points:** {MISSING_TIMEPOINTS}
|
| 89 |
+
|
| 90 |
+
### Spatial Properties (if applicable)
|
| 91 |
+
- **Dimensions:** {SPATIAL_DIMENSIONS}
|
| 92 |
+
- **Resolution:** {SPATIAL_RESOLUTION}
|
| 93 |
+
- **Coordinate System:** {COORDINATE_SYSTEM}
|
| 94 |
+
|
| 95 |
+
### Experimental Metadata (if applicable)
|
| 96 |
+
- **Instrument:** {INSTRUMENT}
|
| 97 |
+
- **Method:** {METHOD}
|
| 98 |
+
- **Sample Info:** {SAMPLE_INFO}
|
| 99 |
+
|
| 100 |
+
---
|
| 101 |
+
|
| 102 |
+
## Key Findings
|
| 103 |
+
|
| 104 |
+
1. **Data Volume:** {DATA_VOLUME_FINDING}
|
| 105 |
+
2. **Data Quality:** {DATA_QUALITY_FINDING}
|
| 106 |
+
3. **Notable Patterns:** {PATTERNS_FINDING}
|
| 107 |
+
4. **Potential Issues:** {ISSUES_FINDING}
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## Visualizations
|
| 112 |
+
|
| 113 |
+
### Distribution Plots
|
| 114 |
+
{DISTRIBUTION_PLOTS}
|
| 115 |
+
|
| 116 |
+
### Correlation Analysis
|
| 117 |
+
{CORRELATION_PLOTS}
|
| 118 |
+
|
| 119 |
+
### Time Series (if applicable)
|
| 120 |
+
{TIMESERIES_PLOTS}
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## Recommendations for Further Analysis
|
| 125 |
+
|
| 126 |
+
### Immediate Actions
|
| 127 |
+
1. {RECOMMENDATION_1}
|
| 128 |
+
2. {RECOMMENDATION_2}
|
| 129 |
+
3. {RECOMMENDATION_3}
|
| 130 |
+
|
| 131 |
+
### Preprocessing Steps
|
| 132 |
+
- {PREPROCESSING_1}
|
| 133 |
+
- {PREPROCESSING_2}
|
| 134 |
+
- {PREPROCESSING_3}
|
| 135 |
+
|
| 136 |
+
### Analytical Approaches
|
| 137 |
+
{ANALYTICAL_APPROACHES}
|
| 138 |
+
|
| 139 |
+
### Tools and Methods
|
| 140 |
+
- **Recommended Software:** {RECOMMENDED_SOFTWARE}
|
| 141 |
+
- **Statistical Methods:** {STATISTICAL_METHODS}
|
| 142 |
+
- **Visualization Tools:** {VIZ_TOOLS}
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
## Data Processing Workflow
|
| 147 |
+
|
| 148 |
+
```
|
| 149 |
+
{WORKFLOW_DIAGRAM}
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## Potential Challenges
|
| 155 |
+
|
| 156 |
+
1. **Challenge:** {CHALLENGE_1}
|
| 157 |
+
- **Mitigation:** {MITIGATION_1}
|
| 158 |
+
|
| 159 |
+
2. **Challenge:** {CHALLENGE_2}
|
| 160 |
+
- **Mitigation:** {MITIGATION_2}
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
## References and Resources
|
| 165 |
+
|
| 166 |
+
### Format Specification
|
| 167 |
+
- {FORMAT_SPEC_LINK}
|
| 168 |
+
|
| 169 |
+
### Python Libraries Documentation
|
| 170 |
+
- {LIBRARY_DOCS}
|
| 171 |
+
|
| 172 |
+
### Related Analysis Examples
|
| 173 |
+
- {EXAMPLE_LINKS}
|
| 174 |
+
|
| 175 |
+
---
|
| 176 |
+
|
| 177 |
+
## Appendix
|
| 178 |
+
|
| 179 |
+
### Complete File Metadata
|
| 180 |
+
```json
|
| 181 |
+
{COMPLETE_METADATA}
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
### Analysis Parameters
|
| 185 |
+
```json
|
| 186 |
+
{ANALYSIS_PARAMETERS}
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
### Software Versions
|
| 190 |
+
- Python: {PYTHON_VERSION}
|
| 191 |
+
- Key Libraries: {LIBRARY_VERSIONS}
|
| 192 |
+
|
| 193 |
+
---
|
| 194 |
+
|
| 195 |
+
*This report was automatically generated by the exploratory-data-analysis skill.*
|
| 196 |
+
*For questions or issues, refer to the skill documentation.*
|
.scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md
ADDED
|
@@ -0,0 +1,664 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bioinformatics and Genomics File Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers file formats used in genomics, transcriptomics, sequence analysis, and related bioinformatics applications.
|
| 4 |
+
|
| 5 |
+
## Sequence Data Formats
|
| 6 |
+
|
| 7 |
+
### .fasta / .fa / .fna - FASTA Format
|
| 8 |
+
**Description:** Text-based format for nucleotide or protein sequences
|
| 9 |
+
**Typical Data:** DNA, RNA, or protein sequences with headers
|
| 10 |
+
**Use Cases:** Sequence storage, BLAST searches, alignments
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `Biopython`: `SeqIO.parse('file.fasta', 'fasta')`
|
| 13 |
+
- `pyfaidx`: Fast indexed FASTA access
|
| 14 |
+
- `screed`: Fast sequence parsing
|
| 15 |
+
**EDA Approach:**
|
| 16 |
+
- Sequence count and length distribution
|
| 17 |
+
- GC content analysis
|
| 18 |
+
- N content (ambiguous bases)
|
| 19 |
+
- Sequence ID parsing
|
| 20 |
+
- Duplicate detection
|
| 21 |
+
- Quality metrics for assemblies (N50, L50)
|
| 22 |
+
|
| 23 |
+
### .fastq / .fq - FASTQ Format
|
| 24 |
+
**Description:** Sequence data with base quality scores
|
| 25 |
+
**Typical Data:** Raw sequencing reads with Phred quality scores
|
| 26 |
+
**Use Cases:** NGS data, quality control, read mapping
|
| 27 |
+
**Python Libraries:**
|
| 28 |
+
- `Biopython`: `SeqIO.parse('file.fastq', 'fastq')`
|
| 29 |
+
- `pysam`: Fast FASTQ/BAM operations
|
| 30 |
+
- `HTSeq`: Sequencing data analysis
|
| 31 |
+
**EDA Approach:**
|
| 32 |
+
- Read count and length distribution
|
| 33 |
+
- Quality score distribution (per-base, per-read)
|
| 34 |
+
- GC content and bias
|
| 35 |
+
- Duplicate rate estimation
|
| 36 |
+
- Adapter contamination detection
|
| 37 |
+
- k-mer frequency analysis
|
| 38 |
+
- Encoding format validation (Phred33/64)
|
| 39 |
+
|
| 40 |
+
### .sam - Sequence Alignment/Map
|
| 41 |
+
**Description:** Tab-delimited text format for alignments
|
| 42 |
+
**Typical Data:** Aligned sequencing reads with mapping quality
|
| 43 |
+
**Use Cases:** Read alignment storage, variant calling
|
| 44 |
+
**Python Libraries:**
|
| 45 |
+
- `pysam`: `pysam.AlignmentFile('file.sam', 'r')`
|
| 46 |
+
- `HTSeq`: `HTSeq.SAM_Reader('file.sam')`
|
| 47 |
+
**EDA Approach:**
|
| 48 |
+
- Mapping rate and quality distribution
|
| 49 |
+
- Coverage analysis
|
| 50 |
+
- Insert size distribution (paired-end)
|
| 51 |
+
- Alignment flags distribution
|
| 52 |
+
- CIGAR string patterns
|
| 53 |
+
- Mismatch and indel rates
|
| 54 |
+
- Duplicate and supplementary alignment counts
|
| 55 |
+
|
| 56 |
+
### .bam - Binary Alignment/Map
|
| 57 |
+
**Description:** Compressed binary version of SAM
|
| 58 |
+
**Typical Data:** Aligned reads in compressed format
|
| 59 |
+
**Use Cases:** Efficient storage and processing of alignments
|
| 60 |
+
**Python Libraries:**
|
| 61 |
+
- `pysam`: Full BAM support with indexing
|
| 62 |
+
- `bamnostic`: Pure Python BAM reader
|
| 63 |
+
**EDA Approach:**
|
| 64 |
+
- Same as SAM plus:
|
| 65 |
+
- Compression ratio analysis
|
| 66 |
+
- Index file (.bai) validation
|
| 67 |
+
- Chromosome-wise statistics
|
| 68 |
+
- Strand bias detection
|
| 69 |
+
- Read group analysis
|
| 70 |
+
|
| 71 |
+
### .cram - CRAM Format
|
| 72 |
+
**Description:** Highly compressed alignment format
|
| 73 |
+
**Typical Data:** Reference-compressed aligned reads
|
| 74 |
+
**Use Cases:** Long-term storage, space-efficient archives
|
| 75 |
+
**Python Libraries:**
|
| 76 |
+
- `pysam`: CRAM support (requires reference)
|
| 77 |
+
- Note: the reference genome used for compression must be accessible when reading
|
| 78 |
+
**EDA Approach:**
|
| 79 |
+
- Compression efficiency vs BAM
|
| 80 |
+
- Reference dependency validation
|
| 81 |
+
- Lossy vs lossless compression assessment
|
| 82 |
+
- Decompression performance
|
| 83 |
+
- Similar alignment metrics as BAM
|
| 84 |
+
|
| 85 |
+
### .bed - Browser Extensible Data
|
| 86 |
+
**Description:** Tab-delimited format for genomic features
|
| 87 |
+
**Typical Data:** Genomic intervals (chr, start, end) with annotations
|
| 88 |
+
**Use Cases:** Peak calling, variant annotation, genome browsing
|
| 89 |
+
**Python Libraries:**
|
| 90 |
+
- `pybedtools`: `pybedtools.BedTool('file.bed')`
|
| 91 |
+
- `pyranges`: `pyranges.read_bed('file.bed')`
|
| 92 |
+
- `pandas`: Simple BED reading
|
| 93 |
+
**EDA Approach:**
|
| 94 |
+
- Feature count and size distribution
|
| 95 |
+
- Chromosome distribution
|
| 96 |
+
- Strand bias
|
| 97 |
+
- Score distribution (if present)
|
| 98 |
+
- Overlap and proximity analysis
|
| 99 |
+
- Coverage statistics
|
| 100 |
+
- Gap analysis between features
|
| 101 |
+
|
| 102 |
+
### .bedGraph - BED with Graph Data
|
| 103 |
+
**Description:** BED-like four-column format with one signal value per genomic interval
|
| 104 |
+
**Typical Data:** Continuous-valued genomic data (coverage, signals)
|
| 105 |
+
**Use Cases:** Coverage tracks, ChIP-seq signals, methylation
|
| 106 |
+
**Python Libraries:**
|
| 107 |
+
- `pyBigWig`: Can convert to bigWig
|
| 108 |
+
- `pybedtools`: BedGraph operations
|
| 109 |
+
**EDA Approach:**
|
| 110 |
+
- Signal distribution statistics
|
| 111 |
+
- Genome coverage percentage
|
| 112 |
+
- Signal dynamics (peaks, valleys)
|
| 113 |
+
- Chromosome-wise signal patterns
|
| 114 |
+
- Quantile analysis
|
| 115 |
+
- Zero-coverage regions
|
| 116 |
+
|
| 117 |
+
### .bigWig / .bw - Binary BigWig
|
| 118 |
+
**Description:** Indexed binary format for genome-wide signal data
|
| 119 |
+
**Typical Data:** Continuous genomic signals (compressed and indexed)
|
| 120 |
+
**Use Cases:** Efficient genome browser tracks, large-scale data
|
| 121 |
+
**Python Libraries:**
|
| 122 |
+
- `pyBigWig`: `pyBigWig.open('file.bw')`
|
| 123 |
+
- `pybbi`: BigWig/BigBed interface
|
| 124 |
+
**EDA Approach:**
|
| 125 |
+
- Signal statistics extraction
|
| 126 |
+
- Zoom level analysis
|
| 127 |
+
- Regional signal extraction
|
| 128 |
+
- Efficient genome-wide summaries
|
| 129 |
+
- Compression efficiency
|
| 130 |
+
- Index structure analysis
|
| 131 |
+
|
| 132 |
+
### .bigBed / .bb - Binary BigBed
|
| 133 |
+
**Description:** Indexed binary BED format
|
| 134 |
+
**Typical Data:** Genomic features (compressed and indexed)
|
| 135 |
+
**Use Cases:** Large feature sets, genome browsers
|
| 136 |
+
**Python Libraries:**
|
| 137 |
+
- `pybbi`: BigBed reading
|
| 138 |
+
- `pybigtools`: Modern BigBed interface
|
| 139 |
+
**EDA Approach:**
|
| 140 |
+
- Feature density analysis
|
| 141 |
+
- Efficient interval queries
|
| 142 |
+
- Zoom level validation
|
| 143 |
+
- Index performance metrics
|
| 144 |
+
- Feature size statistics
|
| 145 |
+
|
| 146 |
+
### .gff / .gff3 - General Feature Format
|
| 147 |
+
**Description:** Tab-delimited format for genomic annotations
|
| 148 |
+
**Typical Data:** Gene models, transcripts, exons, regulatory elements
|
| 149 |
+
**Use Cases:** Genome annotation, gene prediction
|
| 150 |
+
**Python Libraries:**
|
| 151 |
+
- `BCBio.GFF`: GFF parser from the `bcbio-gff` package (works with Biopython `SeqRecord` objects)
|
| 152 |
+
- `gffutils`: `gffutils.create_db('file.gff3', dbfn=':memory:')`
|
| 153 |
+
- `pyranges`: GFF support
|
| 154 |
+
**EDA Approach:**
|
| 155 |
+
- Feature type distribution (gene, exon, CDS, etc.)
|
| 156 |
+
- Gene structure validation
|
| 157 |
+
- Strand balance
|
| 158 |
+
- Hierarchical relationship validation
|
| 159 |
+
- Phase validation for CDS
|
| 160 |
+
- Attribute completeness
|
| 161 |
+
- Gene model statistics (introns, exons per gene)
|
| 162 |
+
|
| 163 |
+
### .gtf - Gene Transfer Format
|
| 164 |
+
**Description:** GFF2-based format for gene annotations
|
| 165 |
+
**Typical Data:** Gene and transcript annotations
|
| 166 |
+
**Use Cases:** RNA-seq analysis, gene quantification
|
| 167 |
+
**Python Libraries:**
|
| 168 |
+
- `pyranges`: `pyranges.read_gtf('file.gtf')`
|
| 169 |
+
- `gffutils`: GTF database creation
|
| 170 |
+
- `HTSeq`: GTF reading for counts
|
| 171 |
+
**EDA Approach:**
|
| 172 |
+
- Transcript isoform analysis
|
| 173 |
+
- Gene structure completeness
|
| 174 |
+
- Exon number distribution
|
| 175 |
+
- Transcript length distribution
|
| 176 |
+
- TSS and TES analysis
|
| 177 |
+
- Biotype distribution
|
| 178 |
+
- Overlapping gene detection
|
| 179 |
+
|
| 180 |
+
### .vcf - Variant Call Format
|
| 181 |
+
**Description:** Text format for genetic variants
|
| 182 |
+
**Typical Data:** SNPs, indels, structural variants with annotations
|
| 183 |
+
**Use Cases:** Variant calling, population genetics, GWAS
|
| 184 |
+
**Python Libraries:**
|
| 185 |
+
- `pysam`: `pysam.VariantFile('file.vcf')`
|
| 186 |
+
- `cyvcf2`: Fast VCF parsing
|
| 187 |
+
- `PyVCF`: Older but comprehensive
|
| 188 |
+
**EDA Approach:**
|
| 189 |
+
- Variant count by type (SNP, indel, SV)
|
| 190 |
+
- Quality score distribution
|
| 191 |
+
- Allele frequency spectrum
|
| 192 |
+
- Transition/transversion ratio
|
| 193 |
+
- Heterozygosity rates
|
| 194 |
+
- Missing genotype analysis
|
| 195 |
+
- Hardy-Weinberg equilibrium
|
| 196 |
+
- Annotation completeness (if annotated)
|
| 197 |
+
|
| 198 |
+
### .bcf - Binary VCF
|
| 199 |
+
**Description:** Compressed binary variant format
|
| 200 |
+
**Typical Data:** Same as VCF but binary
|
| 201 |
+
**Use Cases:** Efficient variant storage and processing
|
| 202 |
+
**Python Libraries:**
|
| 203 |
+
- `pysam`: Full BCF support
|
| 204 |
+
- `cyvcf2`: Optimized BCF reading
|
| 205 |
+
**EDA Approach:**
|
| 206 |
+
- Same as VCF plus:
|
| 207 |
+
- Compression efficiency
|
| 208 |
+
- Indexing validation
|
| 209 |
+
- Read performance metrics
|
| 210 |
+
|
| 211 |
+
### .gvcf - Genomic VCF
|
| 212 |
+
**Description:** VCF with reference confidence blocks
|
| 213 |
+
**Typical Data:** All positions (variant and non-variant)
|
| 214 |
+
**Use Cases:** Joint genotyping workflows, GATK
|
| 215 |
+
**Python Libraries:**
|
| 216 |
+
- `pysam`: GVCF support
|
| 217 |
+
- Standard VCF parsers
|
| 218 |
+
**EDA Approach:**
|
| 219 |
+
- Reference block analysis
|
| 220 |
+
- Coverage uniformity
|
| 221 |
+
- Variant density
|
| 222 |
+
- Genotype quality across genome
|
| 223 |
+
- Reference confidence distribution
|
| 224 |
+
|
| 225 |
+
## RNA-Seq and Expression Data
|
| 226 |
+
|
| 227 |
+
### .counts - Gene Count Matrix
|
| 228 |
+
**Description:** Tab-delimited gene expression counts
|
| 229 |
+
**Typical Data:** Gene IDs with read counts per sample
|
| 230 |
+
**Use Cases:** RNA-seq quantification, differential expression
|
| 231 |
+
**Python Libraries:**
|
| 232 |
+
- `pandas`: `pd.read_csv('file.counts', sep='\t')`
|
| 233 |
+
- `scanpy` (for single-cell): `sc.read_csv()`
|
| 234 |
+
**EDA Approach:**
|
| 235 |
+
- Library size distribution
|
| 236 |
+
- Detection rate (genes per sample)
|
| 237 |
+
- Zero-inflation analysis
|
| 238 |
+
- Count distribution (log scale)
|
| 239 |
+
- Outlier sample detection
|
| 240 |
+
- Correlation between replicates
|
| 241 |
+
- PCA for sample relationships
|
| 242 |
+
|
| 243 |
+
### .tpm / .fpkm - Normalized Expression
|
| 244 |
+
**Description:** Normalized gene expression values
|
| 245 |
+
**Typical Data:** TPM (transcripts per million) or FPKM values
|
| 246 |
+
**Use Cases:** Cross-sample comparison, visualization
|
| 247 |
+
**Python Libraries:**
|
| 248 |
+
- `pandas`: Standard CSV reading
|
| 249 |
+
- `anndata`: For integrated analysis
|
| 250 |
+
**EDA Approach:**
|
| 251 |
+
- Expression distribution
|
| 252 |
+
- Highly expressed gene identification
|
| 253 |
+
- Sample clustering
|
| 254 |
+
- Batch effect detection
|
| 255 |
+
- Coefficient of variation analysis
|
| 256 |
+
- Dynamic range assessment
|
| 257 |
+
|
| 258 |
+
### .mtx - Matrix Market Format
|
| 259 |
+
**Description:** Sparse matrix format (common in single-cell)
|
| 260 |
+
**Typical Data:** Sparse count matrices (cells × genes)
|
| 261 |
+
**Use Cases:** Single-cell RNA-seq, large sparse matrices
|
| 262 |
+
**Python Libraries:**
|
| 263 |
+
- `scipy.io`: `scipy.io.mmread('file.mtx')`
|
| 264 |
+
- `scanpy`: `sc.read_mtx('file.mtx')`
|
| 265 |
+
**EDA Approach:**
|
| 266 |
+
- Sparsity analysis
|
| 267 |
+
- Cell and gene filtering thresholds
|
| 268 |
+
- Doublet detection metrics
|
| 269 |
+
- Mitochondrial fraction
|
| 270 |
+
- UMI count distribution
|
| 271 |
+
- Gene detection per cell
|
| 272 |
+
|
| 273 |
+
### .h5ad - Anndata Format
|
| 274 |
+
**Description:** HDF5-based annotated data matrix
|
| 275 |
+
**Typical Data:** Expression matrix with metadata (cells, genes)
|
| 276 |
+
**Use Cases:** Single-cell RNA-seq analysis with Scanpy
|
| 277 |
+
**Python Libraries:**
|
| 278 |
+
- `scanpy`: `sc.read_h5ad('file.h5ad')`
|
| 279 |
+
- `anndata`: Direct AnnData manipulation
|
| 280 |
+
**EDA Approach:**
|
| 281 |
+
- Cell and gene counts
|
| 282 |
+
- Metadata completeness
|
| 283 |
+
- Layer availability (raw, normalized)
|
| 284 |
+
- Embedding presence (PCA, UMAP)
|
| 285 |
+
- QC metrics distribution
|
| 286 |
+
- Batch information
|
| 287 |
+
- Cell type annotation coverage
|
| 288 |
+
|
| 289 |
+
### .loom - Loom Format
|
| 290 |
+
**Description:** HDF5-based format for omics data
|
| 291 |
+
**Typical Data:** Expression matrices with metadata
|
| 292 |
+
**Use Cases:** Single-cell data, RNA velocity analysis
|
| 293 |
+
**Python Libraries:**
|
| 294 |
+
- `loompy`: `loompy.connect('file.loom')`
|
| 295 |
+
- `scanpy`: Can import loom files
|
| 296 |
+
**EDA Approach:**
|
| 297 |
+
- Layer analysis (spliced, unspliced)
|
| 298 |
+
- Row and column attribute exploration
|
| 299 |
+
- Graph connectivity analysis
|
| 300 |
+
- Cluster assignments
|
| 301 |
+
- Velocity-specific metrics
|
| 302 |
+
|
| 303 |
+
### .rds - R Data Serialization
|
| 304 |
+
**Description:** R object storage (often Seurat objects)
|
| 305 |
+
**Typical Data:** R analysis results, especially single-cell
|
| 306 |
+
**Use Cases:** R-Python data exchange
|
| 307 |
+
**Python Libraries:**
|
| 308 |
+
- `pyreadr`: `pyreadr.read_r('file.rds')` (data frames only; cannot read Seurat or other S4 objects)
|
| 309 |
+
- `rpy2`: For full R integration
|
| 310 |
+
- Conversion tools to AnnData
|
| 311 |
+
**EDA Approach:**
|
| 312 |
+
- Object type identification
|
| 313 |
+
- Data structure exploration
|
| 314 |
+
- Metadata extraction
|
| 315 |
+
- Conversion validation
|
| 316 |
+
|
| 317 |
+
## Alignment and Assembly Formats
|
| 318 |
+
|
| 319 |
+
### .maf - Multiple Alignment Format
|
| 320 |
+
**Description:** Text format for multiple sequence alignments
|
| 321 |
+
**Typical Data:** Genome-wide or local multiple alignments
|
| 322 |
+
**Use Cases:** Comparative genomics, conservation analysis
|
| 323 |
+
**Python Libraries:**
|
| 324 |
+
- `Biopython`: `AlignIO.parse('file.maf', 'maf')`
|
| 325 |
+
- `bx-python`: MAF-specific tools
|
| 326 |
+
**EDA Approach:**
|
| 327 |
+
- Alignment block statistics
|
| 328 |
+
- Species coverage
|
| 329 |
+
- Gap analysis
|
| 330 |
+
- Conservation scoring
|
| 331 |
+
- Alignment quality metrics
|
| 332 |
+
- Block length distribution
|
| 333 |
+
|
| 334 |
+
### .axt - Pairwise Alignment Format
|
| 335 |
+
**Description:** Pairwise alignment format (UCSC)
|
| 336 |
+
**Typical Data:** Pairwise genomic alignments
|
| 337 |
+
**Use Cases:** Genome comparison, synteny analysis
|
| 338 |
+
**Python Libraries:**
|
| 339 |
+
- Custom parsers (simple format)
|
| 340 |
+
- `bx-python`: AXT support
|
| 341 |
+
**EDA Approach:**
|
| 342 |
+
- Alignment score distribution
|
| 343 |
+
- Identity percentage
|
| 344 |
+
- Syntenic block identification
|
| 345 |
+
- Gap size analysis
|
| 346 |
+
- Coverage statistics
|
| 347 |
+
|
| 348 |
+
### .chain - Chain Alignment Format
|
| 349 |
+
**Description:** Genome coordinate mapping chains
|
| 350 |
+
**Typical Data:** Coordinate transformations between genome builds
|
| 351 |
+
**Use Cases:** Liftover, coordinate conversion
|
| 352 |
+
**Python Libraries:**
|
| 353 |
+
- `pyliftover`: Chain file usage
|
| 354 |
+
- Custom parsers for chain format
|
| 355 |
+
**EDA Approach:**
|
| 356 |
+
- Chain score distribution
|
| 357 |
+
- Coverage of source genome
|
| 358 |
+
- Gap analysis
|
| 359 |
+
- Inversion detection
|
| 360 |
+
- Mapping quality assessment
|
| 361 |
+
|
| 362 |
+
### .psl - Pattern Space Layout
|
| 363 |
+
**Description:** Tab-delimited alignment format produced by BLAT (the BLAST-Like Alignment Tool)
|
| 364 |
+
**Typical Data:** Alignment results from BLAT
|
| 365 |
+
**Use Cases:** Transcript mapping, similarity searches
|
| 366 |
+
**Python Libraries:**
|
| 367 |
+
- Custom parsers (tab-delimited)
|
| 368 |
+
- `pybedtools`: Can handle PSL
|
| 369 |
+
**EDA Approach:**
|
| 370 |
+
- Match percentage distribution
|
| 371 |
+
- Gap statistics
|
| 372 |
+
- Query coverage
|
| 373 |
+
- Multiple mapping analysis
|
| 374 |
+
- Alignment quality metrics
|
| 375 |
+
|
| 376 |
+
## Genome Assembly and Annotation
|
| 377 |
+
|
| 378 |
+
### .agp - Assembly Golden Path
|
| 379 |
+
**Description:** Assembly structure description
|
| 380 |
+
**Typical Data:** Scaffold composition, gap information
|
| 381 |
+
**Use Cases:** Genome assembly representation
|
| 382 |
+
**Python Libraries:**
|
| 383 |
+
- Custom parsers (simple tab-delimited)
|
| 384 |
+
- Assembly analysis tools
|
| 385 |
+
**EDA Approach:**
|
| 386 |
+
- Scaffold statistics (N50, L50)
|
| 387 |
+
- Gap type and size distribution
|
| 388 |
+
- Component length analysis
|
| 389 |
+
- Assembly contiguity metrics
|
| 390 |
+
- Unplaced contig analysis
|
| 391 |
+
|
| 392 |
+
### .scaffolds / .contigs - Assembly Sequences
|
| 393 |
+
**Description:** Assembled sequences (usually FASTA)
|
| 394 |
+
**Typical Data:** Assembled genomic sequences
|
| 395 |
+
**Use Cases:** Genome assembly output
|
| 396 |
+
**Python Libraries:**
|
| 397 |
+
- Same as FASTA format
|
| 398 |
+
- Assembly-specific tools (QUAST)
|
| 399 |
+
**EDA Approach:**
|
| 400 |
+
- Assembly statistics (N50, N90, etc.)
|
| 401 |
+
- Length distribution
|
| 402 |
+
- Coverage analysis
|
| 403 |
+
- Gap (N) content
|
| 404 |
+
- Duplication assessment
|
| 405 |
+
- BUSCO completeness (requires a matching lineage dataset)
|
| 406 |
+
|
| 407 |
+
### .2bit - Compressed Genome Format
|
| 408 |
+
**Description:** UCSC compact genome format
|
| 409 |
+
**Typical Data:** Reference genomes (highly compressed)
|
| 410 |
+
**Use Cases:** Efficient genome storage and access
|
| 411 |
+
**Python Libraries:**
|
| 412 |
+
- `py2bit`: `py2bit.open('file.2bit')`
|
| 413 |
+
- `twobitreader`: Alternative reader
|
| 414 |
+
**EDA Approach:**
|
| 415 |
+
- Compression efficiency
|
| 416 |
+
- Random access performance
|
| 417 |
+
- Sequence extraction validation
|
| 418 |
+
- Masked region analysis
|
| 419 |
+
- N content and distribution
|
| 420 |
+
|
| 421 |
+
### .sizes - Chromosome Sizes
|
| 422 |
+
**Description:** Simple format with chromosome lengths
|
| 423 |
+
**Typical Data:** Tab-delimited chromosome names and sizes
|
| 424 |
+
**Use Cases:** Genome browsers, coordinate validation
|
| 425 |
+
**Python Libraries:**
|
| 426 |
+
- Simple file reading with pandas
|
| 427 |
+
- Built into many genomic tools
|
| 428 |
+
**EDA Approach:**
|
| 429 |
+
- Genome size calculation
|
| 430 |
+
- Chromosome count
|
| 431 |
+
- Size distribution
|
| 432 |
+
- Karyotype validation
|
| 433 |
+
- Completeness check against reference
|
| 434 |
+
|
| 435 |
+
## Phylogenetics and Evolution
|
| 436 |
+
|
| 437 |
+
### .nwk / .newick - Newick Tree Format
|
| 438 |
+
**Description:** Parenthetical tree representation
|
| 439 |
+
**Typical Data:** Phylogenetic trees with branch lengths
|
| 440 |
+
**Use Cases:** Evolutionary analysis, tree visualization
|
| 441 |
+
**Python Libraries:**
|
| 442 |
+
- `Biopython`: `Phylo.read('file.nwk', 'newick')`
|
| 443 |
+
- `ete3`: `ete3.Tree('file.nwk')`
|
| 444 |
+
- `dendropy`: Phylogenetic computing
|
| 445 |
+
**EDA Approach:**
|
| 446 |
+
- Tree structure analysis (tips, internal nodes)
|
| 447 |
+
- Branch length distribution
|
| 448 |
+
- Tree balance metrics
|
| 449 |
+
- Ultrametricity check
|
| 450 |
+
- Bootstrap support analysis
|
| 451 |
+
- Topology validation
|
| 452 |
+
|
| 453 |
+
### .nexus - Nexus Format
|
| 454 |
+
**Description:** Rich format for phylogenetic data
|
| 455 |
+
**Typical Data:** Alignments, trees, character matrices
|
| 456 |
+
**Use Cases:** Phylogenetic software interchange
|
| 457 |
+
**Python Libraries:**
|
| 458 |
+
- `Biopython`: Nexus support
|
| 459 |
+
- `dendropy`: Comprehensive Nexus handling
|
| 460 |
+
**EDA Approach:**
|
| 461 |
+
- Data block analysis
|
| 462 |
+
- Character type distribution
|
| 463 |
+
- Tree block validation
|
| 464 |
+
- Taxa consistency
|
| 465 |
+
- Command block parsing
|
| 466 |
+
- Format compliance checking
|
| 467 |
+
|
| 468 |
+
### .phylip - PHYLIP Format
|
| 469 |
+
**Description:** Sequence alignment format (strict/relaxed)
|
| 470 |
+
**Typical Data:** Multiple sequence alignments
|
| 471 |
+
**Use Cases:** Phylogenetic analysis input
|
| 472 |
+
**Python Libraries:**
|
| 473 |
+
- `Biopython`: `AlignIO.read('file.phy', 'phylip')`
|
| 474 |
+
- `dendropy`: PHYLIP support
|
| 475 |
+
**EDA Approach:**
|
| 476 |
+
- Alignment dimensions
|
| 477 |
+
- Sequence length uniformity
|
| 478 |
+
- Gap position analysis
|
| 479 |
+
- Informative site calculation
|
| 480 |
+
- Format variant detection (strict vs relaxed)
|
| 481 |
+
|
| 482 |
+
### .paml - PAML Output
|
| 483 |
+
**Description:** Output from PAML phylogenetic software
|
| 484 |
+
**Typical Data:** Evolutionary model results, dN/dS ratios
|
| 485 |
+
**Use Cases:** Molecular evolution analysis
|
| 486 |
+
**Python Libraries:**
|
| 487 |
+
- Custom parsers for specific PAML programs
|
| 488 |
+
- `Biopython`: Basic PAML parsing
|
| 489 |
+
**EDA Approach:**
|
| 490 |
+
- Model parameter extraction
|
| 491 |
+
- Likelihood values
|
| 492 |
+
- dN/dS ratio distribution
|
| 493 |
+
- Branch-specific results
|
| 494 |
+
- Convergence assessment
|
| 495 |
+
|
| 496 |
+
## Protein and Structure Data
|
| 497 |
+
|
| 498 |
+
### .embl - EMBL Format
|
| 499 |
+
**Description:** Rich sequence annotation format
|
| 500 |
+
**Typical Data:** Sequences with extensive annotations
|
| 501 |
+
**Use Cases:** Sequence databases, genome records
|
| 502 |
+
**Python Libraries:**
|
| 503 |
+
- `Biopython`: `SeqIO.read('file.embl', 'embl')`
|
| 504 |
+
**EDA Approach:**
|
| 505 |
+
- Feature annotation completeness
|
| 506 |
+
- Sequence length and type
|
| 507 |
+
- Reference information
|
| 508 |
+
- Cross-reference validation
|
| 509 |
+
- Feature overlap analysis
|
| 510 |
+
|
| 511 |
+
### .genbank / .gb / .gbk - GenBank Format
|
| 512 |
+
**Description:** NCBI's sequence annotation format
|
| 513 |
+
**Typical Data:** Annotated sequences with features
|
| 514 |
+
**Use Cases:** Sequence databases, annotation transfer
|
| 515 |
+
**Python Libraries:**
|
| 516 |
+
- `Biopython`: `SeqIO.parse('file.gb', 'genbank')`
|
| 517 |
+
**EDA Approach:**
|
| 518 |
+
- Feature type distribution
|
| 519 |
+
- CDS analysis (start codons, stops)
|
| 520 |
+
- Translation validation
|
| 521 |
+
- Annotation completeness
|
| 522 |
+
- Source organism extraction
|
| 523 |
+
- Reference and publication info
|
| 524 |
+
- Locus tag consistency
|
| 525 |
+
|
| 526 |
+
### .sff - Standard Flowgram Format
|
| 527 |
+
**Description:** 454/Roche sequencing data format
|
| 528 |
+
**Typical Data:** Raw pyrosequencing flowgrams
|
| 529 |
+
**Use Cases:** Legacy 454 sequencing data
|
| 530 |
+
**Python Libraries:**
|
| 531 |
+
- `Biopython`: `SeqIO.parse('file.sff', 'sff')`
|
| 532 |
+
- Platform-specific tools
|
| 533 |
+
**EDA Approach:**
|
| 534 |
+
- Read count and length
|
| 535 |
+
- Flowgram signal quality
|
| 536 |
+
- Key sequence detection
|
| 537 |
+
- Adapter trimming validation
|
| 538 |
+
- Quality score distribution
|
| 539 |
+
|
| 540 |
+
### .hdf5 (Genomics Specific)
|
| 541 |
+
**Description:** HDF5 for genomics (10X, Hi-C, etc.)
|
| 542 |
+
**Typical Data:** High-throughput genomics data
|
| 543 |
+
**Use Cases:** 10X Genomics, spatial transcriptomics
|
| 544 |
+
**Python Libraries:**
|
| 545 |
+
- `h5py`: Low-level access
|
| 546 |
+
- `scanpy`: For 10X data
|
| 547 |
+
- `cooler`: For Hi-C data
|
| 548 |
+
**EDA Approach:**
|
| 549 |
+
- Dataset structure exploration
|
| 550 |
+
- Barcode statistics
|
| 551 |
+
- UMI counting
|
| 552 |
+
- Feature-barcode matrix analysis
|
| 553 |
+
- Spatial coordinates (if applicable)
|
| 554 |
+
|
| 555 |
+
### .cool / .mcool - Cooler Format
|
| 556 |
+
**Description:** HDF5-based Hi-C contact matrices
|
| 557 |
+
**Typical Data:** Chromatin interaction matrices
|
| 558 |
+
**Use Cases:** 3D genome analysis, Hi-C data
|
| 559 |
+
**Python Libraries:**
|
| 560 |
+
- `cooler`: `cooler.Cooler('file.cool')`
|
| 561 |
+
- `hicstraw`: For .hic format
|
| 562 |
+
**EDA Approach:**
|
| 563 |
+
- Resolution analysis
|
| 564 |
+
- Contact matrix statistics
|
| 565 |
+
- Distance decay curves
|
| 566 |
+
- Compartment analysis
|
| 567 |
+
- TAD boundary detection
|
| 568 |
+
- Balance factor validation
|
| 569 |
+
|
| 570 |
+
### .hic - Hi-C Binary Format
|
| 571 |
+
**Description:** Juicer binary Hi-C format
|
| 572 |
+
**Typical Data:** Multi-resolution Hi-C matrices
|
| 573 |
+
**Use Cases:** Hi-C analysis with Juicer tools
|
| 574 |
+
**Python Libraries:**
|
| 575 |
+
- `hicstraw`: `hicstraw.HiCFile('file.hic')`
|
| 576 |
+
- `straw`: C++ library with Python bindings
|
| 577 |
+
**EDA Approach:**
|
| 578 |
+
- Available resolutions
|
| 579 |
+
- Normalization methods
|
| 580 |
+
- Contact statistics
|
| 581 |
+
- Chromosomal interactions
|
| 582 |
+
- Quality metrics
|
| 583 |
+
|
| 584 |
+
### .bw (ChIP-seq / ATAC-seq specific)
|
| 585 |
+
**Description:** BigWig files for epigenomics
|
| 586 |
+
**Typical Data:** Coverage or enrichment signals
|
| 587 |
+
**Use Cases:** ChIP-seq, ATAC-seq, DNase-seq
|
| 588 |
+
**Python Libraries:**
|
| 589 |
+
- `pyBigWig`: Standard bigWig access
|
| 590 |
+
**EDA Approach:**
|
| 591 |
+
- Peak enrichment patterns
|
| 592 |
+
- Background signal analysis
|
| 593 |
+
- Sample correlation
|
| 594 |
+
- Signal-to-noise ratio
|
| 595 |
+
- Library complexity metrics
|
| 596 |
+
|
| 597 |
+
### .narrowPeak / .broadPeak - ENCODE Peak Formats
|
| 598 |
+
**Description:** BED-based formats for peaks
|
| 599 |
+
**Typical Data:** Peak calls with scores and p-values
|
| 600 |
+
**Use Cases:** ChIP-seq peak calling output
|
| 601 |
+
**Python Libraries:**
|
| 602 |
+
- `pybedtools`: BED-compatible
|
| 603 |
+
- Custom parsers for peak-specific fields
|
| 604 |
+
**EDA Approach:**
|
| 605 |
+
- Peak count and width distribution
|
| 606 |
+
- Signal value distribution
|
| 607 |
+
- Q-value and p-value analysis
|
| 608 |
+
- Peak summit analysis
|
| 609 |
+
- Overlap with known features
|
| 610 |
+
- Motif enrichment preparation
|
| 611 |
+
|
| 612 |
+
### .wig - Wiggle Format
|
| 613 |
+
**Description:** Dense continuous genomic data
|
| 614 |
+
**Typical Data:** Coverage or signal tracks
|
| 615 |
+
**Use Cases:** Genome browser visualization
|
| 616 |
+
**Python Libraries:**
|
| 617 |
+
- `pyBigWig`: Can convert to bigWig
|
| 618 |
+
- Custom parsers for wiggle format
|
| 619 |
+
**EDA Approach:**
|
| 620 |
+
- Signal statistics
|
| 621 |
+
- Coverage metrics
|
| 622 |
+
- Format variant (fixedStep vs variableStep)
|
| 623 |
+
- Span parameter analysis
|
| 624 |
+
- Conversion efficiency to bigWig
|
| 625 |
+
|
| 626 |
+
### .ab1 - Sanger Sequencing Trace
|
| 627 |
+
**Description:** Binary chromatogram format
|
| 628 |
+
**Typical Data:** Sanger sequencing traces
|
| 629 |
+
**Use Cases:** Capillary sequencing validation
|
| 630 |
+
**Python Libraries:**
|
| 631 |
+
- `Biopython`: `SeqIO.read('file.ab1', 'abi')`
|
| 632 |
+
- `tracy` tools: For quality assessment
|
| 633 |
+
**EDA Approach:**
|
| 634 |
+
- Base calling quality
|
| 635 |
+
- Trace quality scores
|
| 636 |
+
- Mixed base detection
|
| 637 |
+
- Primer and vector detection
|
| 638 |
+
- Read length and quality region
|
| 639 |
+
- Heterozygosity detection
|
| 640 |
+
|
| 641 |
+
### .scf - Standard Chromatogram Format
|
| 642 |
+
**Description:** Sanger sequencing chromatogram
|
| 643 |
+
**Typical Data:** Base calls and confidence values
|
| 644 |
+
**Use Cases:** Sequencing trace analysis
|
| 645 |
+
**Python Libraries:**
|
| 646 |
+
- `Biopython`: SCF format support
|
| 647 |
+
**EDA Approach:**
|
| 648 |
+
- Similar to AB1 format
|
| 649 |
+
- Quality score profiles
|
| 650 |
+
- Peak height ratios
|
| 651 |
+
- Signal-to-noise metrics
|
| 652 |
+
|
| 653 |
+
### .idx - Index Files (Generic)
|
| 654 |
+
**Description:** Index files for various formats
|
| 655 |
+
**Typical Data:** Fast random access indices
|
| 656 |
+
**Use Cases:** Efficient data access (BAM, VCF, etc.)
|
| 657 |
+
**Python Libraries:**
|
| 658 |
+
- Format-specific libraries handle indices
|
| 659 |
+
- `pysam`: Auto-handles BAI, CSI indices
|
| 660 |
+
**EDA Approach:**
|
| 661 |
+
- Index completeness validation
|
| 662 |
+
- Binning strategy analysis
|
| 663 |
+
- Access performance metrics
|
| 664 |
+
- Index size vs data size ratio
|
.scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md
ADDED
|
@@ -0,0 +1,664 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Chemistry and Molecular File Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers file formats commonly used in computational chemistry, cheminformatics, molecular modeling, and related fields.
|
| 4 |
+
|
| 5 |
+
## Structure File Formats
|
| 6 |
+
|
| 7 |
+
### .pdb - Protein Data Bank
|
| 8 |
+
**Description:** Standard format for 3D structures of biological macromolecules
|
| 9 |
+
**Typical Data:** Atomic coordinates, residue information, secondary structure, crystal structure data
|
| 10 |
+
**Use Cases:** Protein structure analysis, molecular visualization, docking studies
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `Biopython`: `Bio.PDB.PDBParser().get_structure('id', 'file.pdb')`
|
| 13 |
+
- `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
|
| 14 |
+
- `PyMOL`: `pymol.cmd.load('file.pdb')`
|
| 15 |
+
- `ProDy`: `prody.parsePDB('file.pdb')`
|
| 16 |
+
**EDA Approach:**
|
| 17 |
+
- Structure validation (bond lengths, angles, clashes)
|
| 18 |
+
- Secondary structure analysis
|
| 19 |
+
- B-factor distribution
|
| 20 |
+
- Missing residues/atoms detection
|
| 21 |
+
- Ramachandran plots for validation
|
| 22 |
+
- Surface area and volume calculations
|
| 23 |
+
|
| 24 |
+
### .cif - Crystallographic Information File
|
| 25 |
+
**Description:** Structured data format for crystallographic information
|
| 26 |
+
**Typical Data:** Unit cell parameters, atomic coordinates, symmetry operations, experimental data
|
| 27 |
+
**Use Cases:** Crystal structure determination, structural biology, materials science
|
| 28 |
+
**Python Libraries:**
|
| 29 |
+
- `gemmi`: `gemmi.cif.read_file('file.cif')`
|
| 30 |
+
- `PyCifRW`: `CifFile.ReadCif('file.cif')`
|
| 31 |
+
- `Biopython`: `Bio.PDB.MMCIFParser()`
|
| 32 |
+
**EDA Approach:**
|
| 33 |
+
- Data completeness check
|
| 34 |
+
- Resolution and quality metrics
|
| 35 |
+
- Unit cell parameter analysis
|
| 36 |
+
- Symmetry group validation
|
| 37 |
+
- Atomic displacement parameters
|
| 38 |
+
- R-factors and validation metrics
|
| 39 |
+
|
| 40 |
+
### .mol - MDL Molfile
|
| 41 |
+
**Description:** Chemical structure file format by MDL/Accelrys
|
| 42 |
+
**Typical Data:** 2D/3D coordinates, atom types, bond orders, charges
|
| 43 |
+
**Use Cases:** Chemical database storage, cheminformatics, drug design
|
| 44 |
+
**Python Libraries:**
|
| 45 |
+
- `RDKit`: `Chem.MolFromMolFile('file.mol')`
|
| 46 |
+
- `Open Babel`: `pybel.readfile('mol', 'file.mol')`
|
| 47 |
+
- `ChemoPy`: For descriptor calculation
|
| 48 |
+
**EDA Approach:**
|
| 49 |
+
- Molecular property calculation (MW, logP, TPSA)
|
| 50 |
+
- Functional group analysis
|
| 51 |
+
- Ring system detection
|
| 52 |
+
- Stereochemistry validation
|
| 53 |
+
- 2D/3D coordinate consistency
|
| 54 |
+
- Valence and charge validation
|
| 55 |
+
|
| 56 |
+
### .mol2 - Tripos Mol2
|
| 57 |
+
**Description:** Complete 3D molecular structure format with atom typing
|
| 58 |
+
**Typical Data:** Coordinates, SYBYL atom types, bond types, charges, substructures
|
| 59 |
+
**Use Cases:** Molecular docking, QSAR studies, drug discovery
|
| 60 |
+
**Python Libraries:**
|
| 61 |
+
- `RDKit`: `Chem.MolFromMol2File('file.mol2')`
|
| 62 |
+
- `Open Babel`: `pybel.readfile('mol2', 'file.mol2')`
|
| 63 |
+
- `MDAnalysis`: Can parse mol2 topology
|
| 64 |
+
**EDA Approach:**
|
| 65 |
+
- Atom type distribution
|
| 66 |
+
- Partial charge analysis
|
| 67 |
+
- Bond type statistics
|
| 68 |
+
- Substructure identification
|
| 69 |
+
- Conformational analysis
|
| 70 |
+
- Energy minimization status check
|
| 71 |
+
|
| 72 |
+
### .sdf - Structure Data File
|
| 73 |
+
**Description:** Multi-structure file format with associated data
|
| 74 |
+
**Typical Data:** Multiple molecular structures with properties/annotations
|
| 75 |
+
**Use Cases:** Chemical databases, virtual screening, compound libraries
|
| 76 |
+
**Python Libraries:**
|
| 77 |
+
- `RDKit`: `Chem.SDMolSupplier('file.sdf')`
|
| 78 |
+
- `Open Babel`: `pybel.readfile('sdf', 'file.sdf')`
|
| 79 |
+
- `PandasTools` (RDKit): For DataFrame integration
|
| 80 |
+
**EDA Approach:**
|
| 81 |
+
- Dataset size and diversity metrics
|
| 82 |
+
- Property distribution analysis (MW, logP, etc.)
|
| 83 |
+
- Structural diversity (Tanimoto similarity)
|
| 84 |
+
- Missing data assessment
|
| 85 |
+
- Outlier detection in properties
|
| 86 |
+
- Scaffold analysis
|
| 87 |
+
|
| 88 |
+
### .xyz - XYZ Coordinates
|
| 89 |
+
**Description:** Simple Cartesian coordinate format
|
| 90 |
+
**Typical Data:** Atom types and 3D coordinates
|
| 91 |
+
**Use Cases:** Quantum chemistry, geometry optimization, molecular dynamics
|
| 92 |
+
**Python Libraries:**
|
| 93 |
+
- `ASE`: `ase.io.read('file.xyz')`
|
| 94 |
+
- `Open Babel`: `pybel.readfile('xyz', 'file.xyz')`
|
| 95 |
+
- `cclib`: For parsing QM outputs with xyz
|
| 96 |
+
**EDA Approach:**
|
| 97 |
+
- Geometry analysis (bond lengths, angles, dihedrals)
|
| 98 |
+
- Center of mass calculation
|
| 99 |
+
- Moment of inertia
|
| 100 |
+
- Molecular size metrics
|
| 101 |
+
- Coordinate validation
|
| 102 |
+
- Symmetry detection
|
| 103 |
+
|
| 104 |
+
### .smi / .smiles - SMILES String
|
| 105 |
+
**Description:** Line notation for chemical structures
|
| 106 |
+
**Typical Data:** Text representation of molecular structure
|
| 107 |
+
**Use Cases:** Chemical databases, literature mining, data exchange
|
| 108 |
+
**Python Libraries:**
|
| 109 |
+
- `RDKit`: `Chem.MolFromSmiles(smiles)`
|
| 110 |
+
- `Open Babel`: Can parse SMILES
|
| 111 |
+
- `DeepChem`: For ML on SMILES
|
| 112 |
+
**EDA Approach:**
|
| 113 |
+
- SMILES syntax validation
|
| 114 |
+
- Descriptor calculation from SMILES
|
| 115 |
+
- Fingerprint generation
|
| 116 |
+
- Substructure searching
|
| 117 |
+
- Tautomer enumeration
|
| 118 |
+
- Stereoisomer handling
|
| 119 |
+
|
| 120 |
+
### .pdbqt - AutoDock PDBQT
|
| 121 |
+
**Description:** Modified PDB format for AutoDock docking
|
| 122 |
+
**Typical Data:** Coordinates, partial charges, atom types for docking
|
| 123 |
+
**Use Cases:** Molecular docking, virtual screening
|
| 124 |
+
**Python Libraries:**
|
| 125 |
+
- `Meeko`: For PDBQT preparation
|
| 126 |
+
- `Open Babel`: Can read PDBQT
|
| 127 |
+
- `ProDy`: Limited PDBQT support
|
| 128 |
+
**EDA Approach:**
|
| 129 |
+
- Charge distribution analysis
|
| 130 |
+
- Rotatable bond identification
|
| 131 |
+
- Atom type validation
|
| 132 |
+
- Coordinate quality check
|
| 133 |
+
- Hydrogen placement validation
|
| 134 |
+
- Torsion definition analysis
|
| 135 |
+
|
| 136 |
+
### .mae - Maestro Format
|
| 137 |
+
**Description:** Schrödinger's proprietary molecular structure format
|
| 138 |
+
**Typical Data:** Structures, properties, annotations from Schrödinger suite
|
| 139 |
+
**Use Cases:** Drug discovery, molecular modeling with Schrödinger tools
|
| 140 |
+
**Python Libraries:**
|
| 141 |
+
- `schrodinger.structure`: Requires Schrödinger installation
|
| 142 |
+
- Custom parsers for basic reading
|
| 143 |
+
**EDA Approach:**
|
| 144 |
+
- Property extraction and analysis
|
| 145 |
+
- Structure quality metrics
|
| 146 |
+
- Conformer analysis
|
| 147 |
+
- Docking score distributions
|
| 148 |
+
- Ligand efficiency metrics
|
| 149 |
+
|
| 150 |
+
### .gro - GROMACS Coordinate File
|
| 151 |
+
**Description:** Molecular structure file for GROMACS MD simulations
|
| 152 |
+
**Typical Data:** Atom positions, velocities, box vectors
|
| 153 |
+
**Use Cases:** Molecular dynamics simulations, GROMACS workflows
|
| 154 |
+
**Python Libraries:**
|
| 155 |
+
- `MDAnalysis`: `Universe('file.gro')`
|
| 156 |
+
- `MDTraj`: `mdtraj.load_gro('file.gro')`
|
| 157 |
+
- `GromacsWrapper`: For GROMACS integration
|
| 158 |
+
**EDA Approach:**
|
| 159 |
+
- System composition analysis
|
| 160 |
+
- Box dimension validation
|
| 161 |
+
- Atom position distribution
|
| 162 |
+
- Velocity distribution (if present)
|
| 163 |
+
- Density calculation
|
| 164 |
+
- Solvation analysis
|
| 165 |
+
|
| 166 |
+
## Computational Chemistry Output Formats
|
| 167 |
+
|
| 168 |
+
### .log - Gaussian Log File
|
| 169 |
+
**Description:** Output from Gaussian quantum chemistry calculations
|
| 170 |
+
**Typical Data:** Energies, geometries, frequencies, orbitals, populations
|
| 171 |
+
**Use Cases:** QM calculations, geometry optimization, frequency analysis
|
| 172 |
+
**Python Libraries:**
|
| 173 |
+
- `cclib`: `cclib.io.ccread('file.log')`
|
| 174 |
+
- `GaussianRunPack`: For Gaussian workflows
|
| 175 |
+
- Custom parsers with regex
|
| 176 |
+
**EDA Approach:**
|
| 177 |
+
- Convergence analysis
|
| 178 |
+
- Energy profile extraction
|
| 179 |
+
- Vibrational frequency analysis
|
| 180 |
+
- Orbital energy levels
|
| 181 |
+
- Population analysis (Mulliken, NBO)
|
| 182 |
+
- Thermochemistry data extraction
|
| 183 |
+
|
| 184 |
+
### .out - Quantum Chemistry Output
|
| 185 |
+
**Description:** Generic output file from various QM packages
|
| 186 |
+
**Typical Data:** Calculation results, energies, properties
|
| 187 |
+
**Use Cases:** QM calculations across different software
|
| 188 |
+
**Python Libraries:**
|
| 189 |
+
- `cclib`: Universal parser for QM outputs
|
| 190 |
+
- `ASE`: Can read some output formats
|
| 191 |
+
**EDA Approach:**
|
| 192 |
+
- Software-specific parsing
|
| 193 |
+
- Convergence criteria check
|
| 194 |
+
- Energy and gradient trends
|
| 195 |
+
- Basis set and method validation
|
| 196 |
+
- Computational cost analysis
|
| 197 |
+
|
| 198 |
+
### .wfn / .wfx - Wavefunction Files
|
| 199 |
+
**Description:** Wavefunction data for quantum chemical analysis
|
| 200 |
+
**Typical Data:** Molecular orbitals, basis sets, density matrices
|
| 201 |
+
**Use Cases:** Electron density analysis, QTAIM analysis
|
| 202 |
+
**Python Libraries:**
|
| 203 |
+
- `Multiwfn`: Standalone program (not a Python package); drive it from Python via `subprocess`
|
| 204 |
+
- `Horton`: For wavefunction analysis
|
| 205 |
+
- Custom parsers for specific formats
|
| 206 |
+
**EDA Approach:**
|
| 207 |
+
- Orbital population analysis
|
| 208 |
+
- Electron density distribution
|
| 209 |
+
- Critical point analysis (QTAIM)
|
| 210 |
+
- Molecular orbital visualization
|
| 211 |
+
- Bonding analysis
|
| 212 |
+
|
| 213 |
+
### .fchk - Gaussian Formatted Checkpoint
|
| 214 |
+
**Description:** Formatted checkpoint file from Gaussian
|
| 215 |
+
**Typical Data:** Complete wavefunction data, results, geometry
|
| 216 |
+
**Use Cases:** Post-processing Gaussian calculations
|
| 217 |
+
**Python Libraries:**
|
| 218 |
+
- `cclib`: Can parse fchk files
|
| 219 |
+
- `GaussView` Python API (if available)
|
| 220 |
+
- Custom parsers
|
| 221 |
+
**EDA Approach:**
|
| 222 |
+
- Wavefunction quality assessment
|
| 223 |
+
- Property extraction
|
| 224 |
+
- Basis set information
|
| 225 |
+
- Gradient and Hessian analysis
|
| 226 |
+
- Natural orbital analysis
|
| 227 |
+
|
| 228 |
+
### .cube - Gaussian Cube File
|
| 229 |
+
**Description:** Volumetric data on a 3D grid
|
| 230 |
+
**Typical Data:** Electron density, molecular orbitals, ESP on grid
|
| 231 |
+
**Use Cases:** Visualization of volumetric properties
|
| 232 |
+
**Python Libraries:**
|
| 233 |
+
- `cclib`: `cclib.io.ccread('file.cube')`
|
| 234 |
+
- `ase.io`: `ase.io.read('file.cube')`
|
| 235 |
+
- `pyquante`: For cube file manipulation
|
| 236 |
+
**EDA Approach:**
|
| 237 |
+
- Grid dimension and spacing analysis
|
| 238 |
+
- Value distribution statistics
|
| 239 |
+
- Isosurface value determination
|
| 240 |
+
- Integration over volume
|
| 241 |
+
- Comparison between different cubes
|
| 242 |
+
|
| 243 |
+
## Molecular Dynamics Formats
|
| 244 |
+
|
| 245 |
+
### .dcd - Binary Trajectory
|
| 246 |
+
**Description:** Binary trajectory format (CHARMM, NAMD)
|
| 247 |
+
**Typical Data:** Time series of atomic coordinates
|
| 248 |
+
**Use Cases:** MD trajectory analysis
|
| 249 |
+
**Python Libraries:**
|
| 250 |
+
- `MDAnalysis`: `Universe(topology, 'traj.dcd')`
|
| 251 |
+
- `MDTraj`: `mdtraj.load_dcd('traj.dcd', top='topology.pdb')`
|
| 252 |
+
- `PyTraj` (Amber): Limited support
|
| 253 |
+
**EDA Approach:**
|
| 254 |
+
- RMSD/RMSF analysis
|
| 255 |
+
- Trajectory length and frame count
|
| 256 |
+
- Coordinate range and drift
|
| 257 |
+
- Periodic boundary handling
|
| 258 |
+
- File integrity check
|
| 259 |
+
- Time step validation
|
| 260 |
+
|
| 261 |
+
### .xtc - Compressed Trajectory
|
| 262 |
+
**Description:** GROMACS compressed trajectory format
|
| 263 |
+
**Typical Data:** Compressed coordinates from MD simulations
|
| 264 |
+
**Use Cases:** Space-efficient MD trajectory storage
|
| 265 |
+
**Python Libraries:**
|
| 266 |
+
- `MDAnalysis`: `Universe(topology, 'traj.xtc')`
|
| 267 |
+
- `MDTraj`: `mdtraj.load_xtc('traj.xtc', top='topology.pdb')`
|
| 268 |
+
**EDA Approach:**
|
| 269 |
+
- Compression ratio assessment
|
| 270 |
+
- Precision loss evaluation
|
| 271 |
+
- RMSD over time
|
| 272 |
+
- Structural stability metrics
|
| 273 |
+
- Sampling frequency analysis
|
| 274 |
+
|
| 275 |
+
### .trr - GROMACS Trajectory
|
| 276 |
+
**Description:** Full precision GROMACS trajectory
|
| 277 |
+
**Typical Data:** Coordinates, velocities, forces from MD
|
| 278 |
+
**Use Cases:** High-precision MD analysis
|
| 279 |
+
**Python Libraries:**
|
| 280 |
+
- `MDAnalysis`: Full support
|
| 281 |
+
- `MDTraj`: Can read trr files
|
| 282 |
+
- `GromacsWrapper`: For GROMACS integration
|
| 283 |
+
**EDA Approach:**
|
| 284 |
+
- Full system dynamics analysis
|
| 285 |
+
- Energy conservation check (with velocities)
|
| 286 |
+
- Force analysis
|
| 287 |
+
- Temperature and pressure validation
|
| 288 |
+
- System equilibration assessment
|
| 289 |
+
|
| 290 |
+
### .nc / .netcdf - Amber NetCDF Trajectory
|
| 291 |
+
**Description:** Network Common Data Form trajectory
|
| 292 |
+
**Typical Data:** MD coordinates, velocities, forces
|
| 293 |
+
**Use Cases:** Amber MD simulations, large trajectory storage
|
| 294 |
+
**Python Libraries:**
|
| 295 |
+
- `MDAnalysis`: NetCDF support
|
| 296 |
+
- `PyTraj`: Native Amber analysis
|
| 297 |
+
- `netCDF4`: Low-level access
|
| 298 |
+
**EDA Approach:**
|
| 299 |
+
- Metadata extraction
|
| 300 |
+
- Trajectory statistics
|
| 301 |
+
- Time series analysis
|
| 302 |
+
- Replica exchange analysis
|
| 303 |
+
- Multi-dimensional data extraction
|
| 304 |
+
|
| 305 |
+
### .top - GROMACS Topology
|
| 306 |
+
**Description:** Molecular topology for GROMACS
|
| 307 |
+
**Typical Data:** Atom types, bonds, angles, force field parameters
|
| 308 |
+
**Use Cases:** MD simulation setup and analysis
|
| 309 |
+
**Python Libraries:**
|
| 310 |
+
- `ParmEd`: `parmed.load_file('system.top')`
|
| 311 |
+
- `MDAnalysis`: Can parse topology
|
| 312 |
+
- Custom parsers for specific fields
|
| 313 |
+
**EDA Approach:**
|
| 314 |
+
- Force field parameter validation
|
| 315 |
+
- System composition
|
| 316 |
+
- Bond/angle/dihedral distribution
|
| 317 |
+
- Charge neutrality check
|
| 318 |
+
- Molecule type enumeration
|
| 319 |
+
|
| 320 |
+
### .psf - Protein Structure File (CHARMM)
|
| 321 |
+
**Description:** Topology file for CHARMM/NAMD
|
| 322 |
+
**Typical Data:** Atom connectivity, types, charges
|
| 323 |
+
**Use Cases:** CHARMM/NAMD MD simulations
|
| 324 |
+
**Python Libraries:**
|
| 325 |
+
- `MDAnalysis`: Native PSF support
|
| 326 |
+
- `ParmEd`: Can read PSF files
|
| 327 |
+
**EDA Approach:**
|
| 328 |
+
- Topology validation
|
| 329 |
+
- Connectivity analysis
|
| 330 |
+
- Charge distribution
|
| 331 |
+
- Atom type statistics
|
| 332 |
+
- Segment analysis
|
| 333 |
+
|
| 334 |
+
### .prmtop - Amber Parameter/Topology
|
| 335 |
+
**Description:** Amber topology and parameter file
|
| 336 |
+
**Typical Data:** System topology, force field parameters
|
| 337 |
+
**Use Cases:** Amber MD simulations
|
| 338 |
+
**Python Libraries:**
|
| 339 |
+
- `ParmEd`: `parmed.load_file('system.prmtop')`
|
| 340 |
+
- `PyTraj`: Native Amber support
|
| 341 |
+
**EDA Approach:**
|
| 342 |
+
- Force field completeness
|
| 343 |
+
- Parameter validation
|
| 344 |
+
- System size and composition
|
| 345 |
+
- Periodic box information
|
| 346 |
+
- Atom mask creation for analysis
|
| 347 |
+
|
| 348 |
+
### .inpcrd / .rst7 - Amber Coordinates
|
| 349 |
+
**Description:** Amber coordinate/restart file
|
| 350 |
+
**Typical Data:** Atomic coordinates, velocities, box info
|
| 351 |
+
**Use Cases:** Starting coordinates for Amber MD
|
| 352 |
+
**Python Libraries:**
|
| 353 |
+
- `ParmEd`: Works with prmtop
|
| 354 |
+
- `PyTraj`: Amber coordinate reading
|
| 355 |
+
**EDA Approach:**
|
| 356 |
+
- Coordinate validity
|
| 357 |
+
- System initialization check
|
| 358 |
+
- Box vector validation
|
| 359 |
+
- Velocity distribution (if restart)
|
| 360 |
+
- Energy minimization status
|
| 361 |
+
|
| 362 |
+
## Spectroscopy and Analytical Data
|
| 363 |
+
|
| 364 |
+
### .jcamp / .jdx - JCAMP-DX
|
| 365 |
+
**Description:** JCAMP-DX (Joint Committee on Atomic and Molecular Physical Data - Data Exchange), a text-based format for exchanging spectra
|
| 366 |
+
**Typical Data:** Spectroscopic data (IR, NMR, MS, UV-Vis)
|
| 367 |
+
**Use Cases:** Spectroscopy data exchange and archiving
|
| 368 |
+
**Python Libraries:**
|
| 369 |
+
- `jcamp`: `jcamp.jcamp_reader('file.jdx')`
|
| 370 |
+
- `nmrglue`: For NMR JCAMP files
|
| 371 |
+
- Custom parsers for specific subtypes
|
| 372 |
+
**EDA Approach:**
|
| 373 |
+
- Peak detection and analysis
|
| 374 |
+
- Baseline correction assessment
|
| 375 |
+
- Signal-to-noise calculation
|
| 376 |
+
- Spectral range validation
|
| 377 |
+
- Integration analysis
|
| 378 |
+
- Comparison with reference spectra
|
| 379 |
+
|
| 380 |
+
### .mzML - Mass Spectrometry Markup Language
|
| 381 |
+
**Description:** Standard XML format for mass spectrometry data
|
| 382 |
+
**Typical Data:** MS/MS spectra, chromatograms, metadata
|
| 383 |
+
**Use Cases:** Proteomics, metabolomics, mass spectrometry workflows
|
| 384 |
+
**Python Libraries:**
|
| 385 |
+
- `pymzml`: `pymzml.run.Reader('file.mzML')`
|
| 386 |
+
- `pyteomics`: `pyteomics.mzml.read('file.mzML')`
|
| 387 |
+
- `MSFileReader` wrappers
|
| 388 |
+
**EDA Approach:**
|
| 389 |
+
- Scan count and types
|
| 390 |
+
- MS level distribution
|
| 391 |
+
- Retention time range
|
| 392 |
+
- m/z range and resolution
|
| 393 |
+
- Peak intensity distribution
|
| 394 |
+
- Data completeness
|
| 395 |
+
- Quality control metrics
|
| 396 |
+
|
| 397 |
+
### .mzXML - Mass Spectrometry XML
|
| 398 |
+
**Description:** Open XML format for MS data
|
| 399 |
+
**Typical Data:** Mass spectra, retention times, peak lists
|
| 400 |
+
**Use Cases:** Legacy MS data, metabolomics
|
| 401 |
+
**Python Libraries:**
|
| 402 |
+
- `pymzml`: Can read mzXML
|
| 403 |
+
- `pyteomics.mzxml`
|
| 404 |
+
- `lxml` for direct XML parsing
|
| 405 |
+
**EDA Approach:**
|
| 406 |
+
- Same checks as mzML (scan counts, MS level distribution, retention time and m/z ranges)
|
| 407 |
+
- Version compatibility check
|
| 408 |
+
- Conversion quality assessment
|
| 409 |
+
- Peak picking validation
|
| 410 |
+
|
| 411 |
+
### .raw - Vendor Raw Data
|
| 412 |
+
**Description:** Proprietary instrument data files (Thermo, Bruker, etc.)
|
| 413 |
+
**Typical Data:** Raw instrument signals, unprocessed data
|
| 414 |
+
**Use Cases:** Direct instrument data access
|
| 415 |
+
**Python Libraries:**
|
| 416 |
+
- `pymsfilereader`: For Thermo RAW files
|
| 417 |
+
- `ThermoRawFileParser`: CLI wrapper
|
| 418 |
+
- Vendor-specific APIs (Thermo, Bruker Compass)
|
| 419 |
+
**EDA Approach:**
|
| 420 |
+
- Instrument method extraction
|
| 421 |
+
- Raw signal quality
|
| 422 |
+
- Calibration status
|
| 423 |
+
- Scan function analysis
|
| 424 |
+
- Chromatographic quality metrics
|
| 425 |
+
|
| 426 |
+
### .d - Agilent Data Directory
|
| 427 |
+
**Description:** Agilent's data folder structure
|
| 428 |
+
**Typical Data:** LC-MS, GC-MS data and metadata
|
| 429 |
+
**Use Cases:** Agilent instrument data processing
|
| 430 |
+
**Python Libraries:**
|
| 431 |
+
- `agilent-reader`: Community tools
|
| 432 |
+
- `Chemstation` Python integration
|
| 433 |
+
- Custom directory parsing
|
| 434 |
+
**EDA Approach:**
|
| 435 |
+
- Directory structure validation
|
| 436 |
+
- Method parameter extraction
|
| 437 |
+
- Signal file integrity
|
| 438 |
+
- Calibration curve analysis
|
| 439 |
+
- Sequence information extraction
|
| 440 |
+
|
| 441 |
+
### .fid - NMR Free Induction Decay
|
| 442 |
+
**Description:** Raw NMR time-domain data
|
| 443 |
+
**Typical Data:** Time-domain NMR signal
|
| 444 |
+
**Use Cases:** NMR processing and analysis
|
| 445 |
+
**Python Libraries:**
|
| 446 |
+
- `nmrglue`: `nmrglue.bruker.read_fid('fid')`
|
| 447 |
+
- `nmrstarlib`: For NMR-STAR files
|
| 448 |
+
**EDA Approach:**
|
| 449 |
+
- Signal decay analysis
|
| 450 |
+
- Noise level assessment
|
| 451 |
+
- Acquisition parameter validation
|
| 452 |
+
- Apodization function selection
|
| 453 |
+
- Zero-filling optimization
|
| 454 |
+
- Phasing parameter estimation
|
| 455 |
+
|
| 456 |
+
### .ft - NMR Frequency-Domain Data
|
| 457 |
+
**Description:** Processed NMR spectrum
|
| 458 |
+
**Typical Data:** Frequency-domain NMR data
|
| 459 |
+
**Use Cases:** NMR analysis and interpretation
|
| 460 |
+
**Python Libraries:**
|
| 461 |
+
- `nmrglue`: Comprehensive NMR support
|
| 462 |
+
- `pyNMR`: For processing
|
| 463 |
+
**EDA Approach:**
|
| 464 |
+
- Peak picking and integration
|
| 465 |
+
- Chemical shift calibration
|
| 466 |
+
- Multiplicity analysis
|
| 467 |
+
- Coupling constant extraction
|
| 468 |
+
- Spectral quality metrics
|
| 469 |
+
- Reference compound identification
|
| 470 |
+
|
| 471 |
+
### .spc - Spectroscopy File
|
| 472 |
+
**Description:** Thermo Galactic spectroscopy format
|
| 473 |
+
**Typical Data:** IR, Raman, UV-Vis spectra
|
| 474 |
+
**Use Cases:** Spectroscopic data from various instruments
|
| 475 |
+
**Python Libraries:**
|
| 476 |
+
- `spc`: `spc.File('file.spc')`
|
| 477 |
+
- Custom parsers for binary format
|
| 478 |
+
**EDA Approach:**
|
| 479 |
+
- Spectral resolution
|
| 480 |
+
- Wavelength/wavenumber range
|
| 481 |
+
- Baseline characterization
|
| 482 |
+
- Peak identification
|
| 483 |
+
- Derivative spectra calculation
|
| 484 |
+
|
| 485 |
+
## Chemical Database Formats
|
| 486 |
+
|
| 487 |
+
### .inchi - International Chemical Identifier
|
| 488 |
+
**Description:** Text identifier for chemical substances
|
| 489 |
+
**Typical Data:** Layered chemical structure representation
|
| 490 |
+
**Use Cases:** Chemical database keys, structure searching
|
| 491 |
+
**Python Libraries:**
|
| 492 |
+
- `RDKit`: `Chem.MolFromInchi(inchi)`
|
| 493 |
+
- `Open Babel`: InChI conversion
|
| 494 |
+
**EDA Approach:**
|
| 495 |
+
- InChI validation
|
| 496 |
+
- Layer analysis
|
| 497 |
+
- Stereochemistry verification
|
| 498 |
+
- InChI key generation
|
| 499 |
+
- Structure round-trip validation
|
| 500 |
+
|
| 501 |
+
### .cdx / .cdxml - ChemDraw Exchange
|
| 502 |
+
**Description:** ChemDraw drawing file format
|
| 503 |
+
**Typical Data:** 2D chemical structures with annotations
|
| 504 |
+
**Use Cases:** Chemical drawing, publication figures
|
| 505 |
+
**Python Libraries:**
|
| 506 |
+
- `RDKit`: Can import some CDXML
|
| 507 |
+
- `Open Babel`: Limited support
|
| 508 |
+
- `ChemDraw` Python API (commercial)
|
| 509 |
+
**EDA Approach:**
|
| 510 |
+
- Structure extraction
|
| 511 |
+
- Annotation preservation
|
| 512 |
+
- Style consistency
|
| 513 |
+
- 2D coordinate validation
|
| 514 |
+
|
| 515 |
+
### .cml - Chemical Markup Language
|
| 516 |
+
**Description:** XML-based chemical structure format
|
| 517 |
+
**Typical Data:** Chemical structures, reactions, properties
|
| 518 |
+
**Use Cases:** Semantic chemical data representation
|
| 519 |
+
**Python Libraries:**
|
| 520 |
+
- `RDKit`: CML support
|
| 521 |
+
- `Open Babel`: Good CML support
|
| 522 |
+
- `lxml`: For XML parsing
|
| 523 |
+
**EDA Approach:**
|
| 524 |
+
- XML schema validation
|
| 525 |
+
- Namespace handling
|
| 526 |
+
- Property extraction
|
| 527 |
+
- Reaction scheme analysis
|
| 528 |
+
- Metadata completeness
|
| 529 |
+
|
| 530 |
+
### .rxn - MDL Reaction File
|
| 531 |
+
**Description:** Chemical reaction structure file
|
| 532 |
+
**Typical Data:** Reactants, products, reaction arrows
|
| 533 |
+
**Use Cases:** Reaction databases, synthesis planning
|
| 534 |
+
**Python Libraries:**
|
| 535 |
+
- `RDKit`: `AllChem.ReactionFromRxnFile('file.rxn')` (from `rdkit.Chem import AllChem`)
|
| 536 |
+
- `Open Babel`: Reaction support
|
| 537 |
+
**EDA Approach:**
|
| 538 |
+
- Reaction balancing validation
|
| 539 |
+
- Atom mapping analysis
|
| 540 |
+
- Reagent identification
|
| 541 |
+
- Stereochemistry changes
|
| 542 |
+
- Reaction classification
|
| 543 |
+
|
| 544 |
+
### .rdf - Reaction Data File
|
| 545 |
+
**Description:** Multi-reaction file format
|
| 546 |
+
**Typical Data:** Multiple reactions with data
|
| 547 |
+
**Use Cases:** Reaction databases
|
| 548 |
+
**Python Libraries:**
|
| 549 |
+
- `RDKit`: RDF reading capabilities
|
| 550 |
+
- Custom parsers
|
| 551 |
+
**EDA Approach:**
|
| 552 |
+
- Reaction yield statistics
|
| 553 |
+
- Condition analysis
|
| 554 |
+
- Success rate patterns
|
| 555 |
+
- Reagent frequency analysis
|
| 556 |
+
|
| 557 |
+
## Computational Output and Data
|
| 558 |
+
|
| 559 |
+
### .hdf5 / .h5 - Hierarchical Data Format
|
| 560 |
+
**Description:** Container for scientific data arrays
|
| 561 |
+
**Typical Data:** Large arrays, metadata, hierarchical organization
|
| 562 |
+
**Use Cases:** Large dataset storage, computational results
|
| 563 |
+
**Python Libraries:**
|
| 564 |
+
- `h5py`: `h5py.File('file.h5', 'r')`
|
| 565 |
+
- `pytables` (imported as `tables`): Advanced HDF5 interface
|
| 566 |
+
- `pandas`: Can read HDF5
|
| 567 |
+
**EDA Approach:**
|
| 568 |
+
- Dataset structure exploration
|
| 569 |
+
- Array shape and dtype analysis
|
| 570 |
+
- Metadata extraction
|
| 571 |
+
- Memory-efficient data sampling
|
| 572 |
+
- Chunk optimization analysis
|
| 573 |
+
- Compression ratio assessment
|
| 574 |
+
|
| 575 |
+
### .pkl / .pickle - Python Pickle
|
| 576 |
+
**Description:** Serialized Python objects
|
| 577 |
+
**Typical Data:** Any Python object (molecules, dataframes, models)
|
| 578 |
+
**Use Cases:** Intermediate data storage, model persistence
|
| 579 |
+
**Python Libraries:**
|
| 580 |
+
- `pickle`: Built-in serialization
|
| 581 |
+
- `joblib`: Enhanced pickling for large arrays
|
| 582 |
+
- `dill`: Extended pickle support
|
| 583 |
+
**EDA Approach:**
|
| 584 |
+
- Object type inspection
|
| 585 |
+
- Size and complexity analysis
|
| 586 |
+
- Version compatibility check
|
| 587 |
+
- Security validation (trusted source)
|
| 588 |
+
- Deserialization testing
|
| 589 |
+
|
| 590 |
+
### .npy / .npz - NumPy Arrays
|
| 591 |
+
**Description:** NumPy array binary format
|
| 592 |
+
**Typical Data:** Numerical arrays (coordinates, features, matrices)
|
| 593 |
+
**Use Cases:** Fast numerical data I/O
|
| 594 |
+
**Python Libraries:**
|
| 595 |
+
- `numpy`: `np.load('file.npy')`
|
| 596 |
+
- Direct memory mapping for large files
|
| 597 |
+
**EDA Approach:**
|
| 598 |
+
- Array shape and dimensions
|
| 599 |
+
- Data type and precision
|
| 600 |
+
- Statistical summary (mean, std, range)
|
| 601 |
+
- Missing value detection
|
| 602 |
+
- Outlier identification
|
| 603 |
+
- Memory footprint analysis
|
| 604 |
+
|
| 605 |
+
### .mat - MATLAB Data File
|
| 606 |
+
**Description:** MATLAB workspace data
|
| 607 |
+
**Typical Data:** Arrays, structures from MATLAB
|
| 608 |
+
**Use Cases:** MATLAB-Python data exchange
|
| 609 |
+
**Python Libraries:**
|
| 610 |
+
- `scipy.io`: `scipy.io.loadmat('file.mat')`
|
| 611 |
+
- `h5py`: For v7.3 MAT files
|
| 612 |
+
**EDA Approach:**
|
| 613 |
+
- Variable extraction and types
|
| 614 |
+
- Array dimension analysis
|
| 615 |
+
- Structure field exploration
|
| 616 |
+
- MATLAB version compatibility
|
| 617 |
+
- Data type conversion validation
|
| 618 |
+
|
| 619 |
+
### .csv - Comma-Separated Values
|
| 620 |
+
**Description:** Tabular data in text format
|
| 621 |
+
**Typical Data:** Chemical properties, experimental data, descriptors
|
| 622 |
+
**Use Cases:** Data exchange, analysis, machine learning
|
| 623 |
+
**Python Libraries:**
|
| 624 |
+
- `pandas`: `pd.read_csv('file.csv')`
|
| 625 |
+
- `csv`: Built-in module
|
| 626 |
+
- `polars`: Fast CSV reading
|
| 627 |
+
**EDA Approach:**
|
| 628 |
+
- Data types inference
|
| 629 |
+
- Missing value patterns
|
| 630 |
+
- Statistical summaries
|
| 631 |
+
- Correlation analysis
|
| 632 |
+
- Distribution visualization
|
| 633 |
+
- Outlier detection
|
| 634 |
+
|
| 635 |
+
### .json - JavaScript Object Notation
|
| 636 |
+
**Description:** Structured text data format
|
| 637 |
+
**Typical Data:** Chemical properties, metadata, API responses
|
| 638 |
+
**Use Cases:** Data interchange, configuration, web APIs
|
| 639 |
+
**Python Libraries:**
|
| 640 |
+
- `json`: Built-in JSON support
|
| 641 |
+
- `pandas`: `pd.read_json()`
|
| 642 |
+
- `ujson`: Faster JSON parsing
|
| 643 |
+
**EDA Approach:**
|
| 644 |
+
- Schema validation
|
| 645 |
+
- Nesting depth analysis
|
| 646 |
+
- Key-value distribution
|
| 647 |
+
- Data type consistency
|
| 648 |
+
- Array length statistics
|
| 649 |
+
|
| 650 |
+
### .parquet - Apache Parquet
|
| 651 |
+
**Description:** Columnar storage format
|
| 652 |
+
**Typical Data:** Large tabular datasets stored efficiently in a compressed, column-oriented layout
|
| 653 |
+
**Use Cases:** Big data, efficient columnar analytics
|
| 654 |
+
**Python Libraries:**
|
| 655 |
+
- `pandas`: `pd.read_parquet('file.parquet')`
|
| 656 |
+
- `pyarrow`: Direct parquet access
|
| 657 |
+
- `fastparquet`: Alternative implementation
|
| 658 |
+
**EDA Approach:**
|
| 659 |
+
- Column statistics from metadata
|
| 660 |
+
- Partition analysis
|
| 661 |
+
- Compression efficiency
|
| 662 |
+
- Row group structure
|
| 663 |
+
- Fast sampling for large files
|
| 664 |
+
- Schema evolution tracking
|
.scider/skills/exploratory-data-analysis/references/general_scientific_formats.md
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# General Scientific Data Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers general-purpose scientific data formats used across multiple disciplines.
|
| 4 |
+
|
| 5 |
+
## Numerical and Array Data
|
| 6 |
+
|
| 7 |
+
### .npy - NumPy Array
|
| 8 |
+
**Description:** Binary NumPy array format
|
| 9 |
+
**Typical Data:** N-dimensional arrays of any data type
|
| 10 |
+
**Use Cases:** Fast I/O for numerical data, intermediate results
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `numpy`: `np.load('file.npy')`, `np.save()`
|
| 13 |
+
- Memory-mapped access: `np.load('file.npy', mmap_mode='r')`
|
| 14 |
+
**EDA Approach:**
|
| 15 |
+
- Array shape and dimensionality
|
| 16 |
+
- Data type and precision
|
| 17 |
+
- Statistical summary (mean, std, min, max, percentiles)
|
| 18 |
+
- Missing or invalid values (NaN, inf)
|
| 19 |
+
- Memory footprint
|
| 20 |
+
- Value distribution and histogram
|
| 21 |
+
- Sparsity analysis
|
| 22 |
+
- Correlation structure (if 2D)
|
| 23 |
+
|
| 24 |
+
### .npz - Compressed NumPy Archive
|
| 25 |
+
**Description:** Multiple NumPy arrays in one file
|
| 26 |
+
**Typical Data:** Collections of related arrays
|
| 27 |
+
**Use Cases:** Saving multiple arrays together, compressed storage
|
| 28 |
+
**Python Libraries:**
|
| 29 |
+
- `numpy`: `np.load('file.npz')` returns dict-like object
|
| 30 |
+
- `np.savez()` or `np.savez_compressed()`
|
| 31 |
+
**EDA Approach:**
|
| 32 |
+
- List of contained arrays
|
| 33 |
+
- Individual array analysis
|
| 34 |
+
- Relationships between arrays
|
| 35 |
+
- Total file size and compression ratio
|
| 36 |
+
- Naming conventions
|
| 37 |
+
- Data consistency checks
|
| 38 |
+
|
| 39 |
+
### .csv - Comma-Separated Values
|
| 40 |
+
**Description:** Plain text tabular data
|
| 41 |
+
**Typical Data:** Experimental measurements, results tables
|
| 42 |
+
**Use Cases:** Universal data exchange, spreadsheet export
|
| 43 |
+
**Python Libraries:**
|
| 44 |
+
- `pandas`: `pd.read_csv('file.csv')`
|
| 45 |
+
- `csv`: Built-in module
|
| 46 |
+
- `polars`: High-performance CSV reading
|
| 47 |
+
- `numpy`: `np.loadtxt()` or `np.genfromtxt()`
|
| 48 |
+
**EDA Approach:**
|
| 49 |
+
- Row and column counts
|
| 50 |
+
- Data type inference
|
| 51 |
+
- Missing value patterns and frequency
|
| 52 |
+
- Column statistics (numeric: mean, std; categorical: frequencies)
|
| 53 |
+
- Outlier detection
|
| 54 |
+
- Correlation matrix
|
| 55 |
+
- Duplicate row detection
|
| 56 |
+
- Header and index validation
|
| 57 |
+
- Encoding issues detection
|
| 58 |
+
|
| 59 |
+
### .tsv / .tab - Tab-Separated Values
|
| 60 |
+
**Description:** Tab-delimited tabular data
|
| 61 |
+
**Typical Data:** Similar to CSV but tab-separated
|
| 62 |
+
**Use Cases:** Bioinformatics, text processing output
|
| 63 |
+
**Python Libraries:**
|
| 64 |
+
- `pandas`: `pd.read_csv('file.tsv', sep='\t')`
|
| 65 |
+
**EDA Approach:**
|
| 66 |
+
- Same as CSV format
|
| 67 |
+
- Tab vs space validation
|
| 68 |
+
- Quote handling
|
| 69 |
+
|
| 70 |
+
### .xlsx / .xls - Excel Spreadsheets
|
| 71 |
+
**Description:** Microsoft Excel binary/XML formats
|
| 72 |
+
**Typical Data:** Tabular data with formatting, formulas
|
| 73 |
+
**Use Cases:** Lab notebooks, data entry, reports
|
| 74 |
+
**Python Libraries:**
|
| 75 |
+
- `pandas`: `pd.read_excel('file.xlsx')`
|
| 76 |
+
- `openpyxl`: Full Excel file manipulation
|
| 77 |
+
- `xlrd`: Reading .xls (legacy)
|
| 78 |
+
**EDA Approach:**
|
| 79 |
+
- Sheet enumeration and names
|
| 80 |
+
- Per-sheet data analysis
|
| 81 |
+
- Formula evaluation
|
| 82 |
+
- Merged cells handling
|
| 83 |
+
- Hidden rows/columns
|
| 84 |
+
- Data validation rules
|
| 85 |
+
- Named ranges
|
| 86 |
+
- Formatting-only cells detection
|
| 87 |
+
|
| 88 |
+
### .json - JavaScript Object Notation
|
| 89 |
+
**Description:** Hierarchical text data format
|
| 90 |
+
**Typical Data:** Nested data structures, metadata
|
| 91 |
+
**Use Cases:** API responses, configuration, results
|
| 92 |
+
**Python Libraries:**
|
| 93 |
+
- `json`: Built-in module
|
| 94 |
+
- `pandas`: `pd.read_json()`
|
| 95 |
+
- `ujson`: Faster JSON parsing
|
| 96 |
+
**EDA Approach:**
|
| 97 |
+
- Schema inference
|
| 98 |
+
- Nesting depth
|
| 99 |
+
- Key-value distribution
|
| 100 |
+
- Array lengths
|
| 101 |
+
- Data type consistency
|
| 102 |
+
- Missing keys
|
| 103 |
+
- Duplicate detection
|
| 104 |
+
- Size and complexity metrics
|
| 105 |
+
|
| 106 |
+
### .xml - Extensible Markup Language
|
| 107 |
+
**Description:** Hierarchical markup format
|
| 108 |
+
**Typical Data:** Structured data with metadata
|
| 109 |
+
**Use Cases:** Standards-based data exchange, APIs
|
| 110 |
+
**Python Libraries:**
|
| 111 |
+
- `lxml`: `lxml.etree.parse()`
|
| 112 |
+
- `xml.etree.ElementTree`: Built-in XML
|
| 113 |
+
- `xmltodict`: Convert XML to dict
|
| 114 |
+
**EDA Approach:**
|
| 115 |
+
- Schema/DTD validation
|
| 116 |
+
- Element hierarchy and depth
|
| 117 |
+
- Namespace handling
|
| 118 |
+
- Attribute vs element content
|
| 119 |
+
- CDATA sections
|
| 120 |
+
- Text content extraction
|
| 121 |
+
- Sibling and child counts
|
| 122 |
+
|
| 123 |
+
### .yaml / .yml - YAML
|
| 124 |
+
**Description:** Human-readable data serialization
|
| 125 |
+
**Typical Data:** Configuration, metadata, parameters
|
| 126 |
+
**Use Cases:** Experiment configurations, pipelines
|
| 127 |
+
**Python Libraries:**
|
| 128 |
+
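- `yaml` (PyYAML): `yaml.safe_load()` (preferred); `yaml.load()` requires an explicit `Loader` and can execute arbitrary code on untrusted input unless a safe loader is used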
|
| 129 |
+
- `ruamel.yaml`: YAML 1.2 support
|
| 130 |
+
**EDA Approach:**
|
| 131 |
+
- Configuration structure
|
| 132 |
+
- Data type handling
|
| 133 |
+
- List and dict depth
|
| 134 |
+
- Anchor and alias usage
|
| 135 |
+
- Multi-document files
|
| 136 |
+
- Comments preservation
|
| 137 |
+
- Validation against schema
|
| 138 |
+
|
| 139 |
+
### .toml - TOML Configuration
|
| 140 |
+
**Description:** Configuration file format
|
| 141 |
+
**Typical Data:** Settings, parameters
|
| 142 |
+
**Use Cases:** Python package configuration, settings
|
| 143 |
+
**Python Libraries:**
|
| 144 |
+
- `tomli` / `tomllib`: TOML reading (tomllib in Python 3.11+)
|
| 145 |
+
- `toml`: Reading and writing
|
| 146 |
+
**EDA Approach:**
|
| 147 |
+
- Section structure
|
| 148 |
+
- Key-value pairs
|
| 149 |
+
- Data type inference
|
| 150 |
+
- Nested table validation
|
| 151 |
+
- Required vs optional fields
|
| 152 |
+
|
| 153 |
+
### .ini - INI Configuration
|
| 154 |
+
**Description:** Simple configuration format
|
| 155 |
+
**Typical Data:** Application settings
|
| 156 |
+
**Use Cases:** Legacy configurations, simple settings
|
| 157 |
+
**Python Libraries:**
|
| 158 |
+
- `configparser`: Built-in INI parser
|
| 159 |
+
**EDA Approach:**
|
| 160 |
+
- Section enumeration
|
| 161 |
+
- Key-value extraction
|
| 162 |
+
- Type conversion
|
| 163 |
+
- Comment handling
|
| 164 |
+
- Case sensitivity
|
| 165 |
+
|
| 166 |
+
## Binary and Compressed Data
|
| 167 |
+
|
| 168 |
+
### .hdf5 / .h5 - Hierarchical Data Format 5
|
| 169 |
+
**Description:** Container for large scientific datasets
|
| 170 |
+
**Typical Data:** Multi-dimensional arrays, metadata, groups
|
| 171 |
+
**Use Cases:** Large datasets, multi-modal data, parallel I/O
|
| 172 |
+
**Python Libraries:**
|
| 173 |
+
- `h5py`: `h5py.File('file.h5', 'r')`
|
| 174 |
+
- `pytables` (imported as `tables`): Advanced HDF5 interface
|
| 175 |
+
- `pandas`: HDF5 storage via HDFStore
|
| 176 |
+
**EDA Approach:**
|
| 177 |
+
- Group and dataset hierarchy
|
| 178 |
+
- Dataset shapes and dtypes
|
| 179 |
+
- Attributes and metadata
|
| 180 |
+
- Compression and chunking strategy
|
| 181 |
+
- Memory-efficient sampling
|
| 182 |
+
- Dataset relationships
|
| 183 |
+
- File size and efficiency
|
| 184 |
+
- Access patterns optimization
|
| 185 |
+
|
| 186 |
+
### .zarr - Chunked Array Storage
|
| 187 |
+
**Description:** Cloud-optimized chunked arrays
|
| 188 |
+
**Typical Data:** Large N-dimensional arrays
|
| 189 |
+
**Use Cases:** Cloud storage, parallel computing, streaming
|
| 190 |
+
**Python Libraries:**
|
| 191 |
+
- `zarr`: `zarr.open('file.zarr')`
|
| 192 |
+
- `xarray`: Zarr backend support
|
| 193 |
+
**EDA Approach:**
|
| 194 |
+
- Array metadata and dimensions
|
| 195 |
+
- Chunk size optimization
|
| 196 |
+
- Compression codec and ratio
|
| 197 |
+
- Synchronizer and store type
|
| 198 |
+
- Multi-scale hierarchies
|
| 199 |
+
- Parallel access performance
|
| 200 |
+
- Attribute metadata
|
| 201 |
+
|
| 202 |
+
### .gz / .gzip - Gzip Compressed
|
| 203 |
+
**Description:** Compressed data files
|
| 204 |
+
**Typical Data:** Any compressed text or binary
|
| 205 |
+
**Use Cases:** Compression for storage/transfer
|
| 206 |
+
**Python Libraries:**
|
| 207 |
+
- `gzip`: Built-in gzip module
|
| 208 |
+
- `pandas`: Automatic gzip handling in read functions
|
| 209 |
+
**EDA Approach:**
|
| 210 |
+
- Compression ratio
|
| 211 |
+
- Original file type detection
|
| 212 |
+
- Decompression validation
|
| 213 |
+
- Header information
|
| 214 |
+
- Multi-member archives
|
| 215 |
+
|
| 216 |
+
### .bz2 - Bzip2 Compressed
|
| 217 |
+
**Description:** Bzip2 compression
|
| 218 |
+
**Typical Data:** Highly compressed files
|
| 219 |
+
**Use Cases:** Archival storage and transfer where a better compression ratio than gzip justifies slower compression
|
| 220 |
+
**Python Libraries:**
|
| 221 |
+
- `bz2`: Built-in bz2 module
|
| 222 |
+
- Automatic handling in pandas
|
| 223 |
+
**EDA Approach:**
|
| 224 |
+
- Compression efficiency
|
| 225 |
+
- Decompression time
|
| 226 |
+
- Content validation
|
| 227 |
+
|
| 228 |
+
### .zip - ZIP Archive
|
| 229 |
+
**Description:** Archive with multiple files
|
| 230 |
+
**Typical Data:** Collections of files
|
| 231 |
+
**Use Cases:** File distribution, archiving
|
| 232 |
+
**Python Libraries:**
|
| 233 |
+
- `zipfile`: Built-in ZIP support
|
| 234 |
+
- `pandas`: Can read zipped CSVs
|
| 235 |
+
**EDA Approach:**
|
| 236 |
+
- Archive member listing
|
| 237 |
+
- Compression method per file
|
| 238 |
+
- Total vs compressed size
|
| 239 |
+
- Directory structure
|
| 240 |
+
- File type distribution
|
| 241 |
+
- Extraction validation
|
| 242 |
+
|
| 243 |
+
### .tar / .tar.gz - TAR Archive
|
| 244 |
+
**Description:** Unix tape archive
|
| 245 |
+
**Typical Data:** Multiple files and directories
|
| 246 |
+
**Use Cases:** Software distribution, backups
|
| 247 |
+
**Python Libraries:**
|
| 248 |
+
- `tarfile`: Built-in TAR support
|
| 249 |
+
**EDA Approach:**
|
| 250 |
+
- Member file listing
|
| 251 |
+
- Compression (if .tar.gz, .tar.bz2)
|
| 252 |
+
- Directory structure
|
| 253 |
+
- Permissions preservation
|
| 254 |
+
- Extraction testing
|
| 255 |
+
|
| 256 |
+
## Time Series and Waveform Data
|
| 257 |
+
|
| 258 |
+
### .wav - Waveform Audio
|
| 259 |
+
**Description:** Audio waveform data
|
| 260 |
+
**Typical Data:** Acoustic signals, audio recordings
|
| 261 |
+
**Use Cases:** Acoustic analysis, ultrasound, signal processing
|
| 262 |
+
**Python Libraries:**
|
| 263 |
+
- `scipy.io.wavfile`: `scipy.io.wavfile.read()`
|
| 264 |
+
- `wave`: Built-in module
|
| 265 |
+
- `soundfile`: Enhanced audio I/O
|
| 266 |
+
**EDA Approach:**
|
| 267 |
+
- Sample rate and duration
|
| 268 |
+
- Bit depth and channels
|
| 269 |
+
- Amplitude distribution
|
| 270 |
+
- Spectral analysis (FFT)
|
| 271 |
+
- Signal-to-noise ratio
|
| 272 |
+
- Clipping detection
|
| 273 |
+
- Frequency content
|
| 274 |
+
|
| 275 |
+
### .mat - MATLAB Data
|
| 276 |
+
**Description:** MATLAB workspace variables
|
| 277 |
+
**Typical Data:** Arrays, structures, cells
|
| 278 |
+
**Use Cases:** MATLAB-Python interoperability
|
| 279 |
+
**Python Libraries:**
|
| 280 |
+
- `scipy.io`: `scipy.io.loadmat()`
|
| 281 |
+
- `h5py`: For MATLAB v7.3 files (HDF5-based)
|
| 282 |
+
- `mat73`: Loads v7.3 files into native Python types (built on `h5py`)
|
| 283 |
+
**EDA Approach:**
|
| 284 |
+
- Variable names and types
|
| 285 |
+
- Array dimensions
|
| 286 |
+
- Structure field exploration
|
| 287 |
+
- Cell array handling
|
| 288 |
+
- Sparse matrix detection
|
| 289 |
+
- MATLAB version compatibility
|
| 290 |
+
- Metadata extraction
|
| 291 |
+
|
| 292 |
+
### .edf - European Data Format
|
| 293 |
+
**Description:** Time series data (especially medical)
|
| 294 |
+
**Typical Data:** EEG, physiological signals
|
| 295 |
+
**Use Cases:** Medical signal storage
|
| 296 |
+
**Python Libraries:**
|
| 297 |
+
- `pyedflib`: EDF/EDF+ reading and writing
|
| 298 |
+
- `mne`: Neurophysiology data (supports EDF)
|
| 299 |
+
**EDA Approach:**
|
| 300 |
+
- Signal count and names
|
| 301 |
+
- Sampling frequencies
|
| 302 |
+
- Signal ranges and units
|
| 303 |
+
- Recording duration
|
| 304 |
+
- Annotation events
|
| 305 |
+
- Data quality (saturation, noise)
|
| 306 |
+
- Patient/study information
|
| 307 |
+
|
| 308 |
+
### .csv (Time Series)
|
| 309 |
+
**Description:** CSV with timestamp column
|
| 310 |
+
**Typical Data:** Time-indexed measurements
|
| 311 |
+
**Use Cases:** Sensor data, monitoring, experiments
|
| 312 |
+
**Python Libraries:**
|
| 313 |
+
- `pandas`: `pd.read_csv()` with `parse_dates`
|
| 314 |
+
**EDA Approach:**
|
| 315 |
+
- Temporal range and resolution
|
| 316 |
+
- Sampling regularity
|
| 317 |
+
- Missing time points
|
| 318 |
+
- Trend and seasonality
|
| 319 |
+
- Stationarity tests
|
| 320 |
+
- Autocorrelation
|
| 321 |
+
- Anomaly detection
|
| 322 |
+
|
| 323 |
+
## Geospatial and Environmental Data
|
| 324 |
+
|
| 325 |
+
### .shp - Shapefile
|
| 326 |
+
**Description:** Geospatial vector data
|
| 327 |
+
**Typical Data:** Geographic features (points, lines, polygons)
|
| 328 |
+
**Use Cases:** GIS analysis, spatial data
|
| 329 |
+
**Python Libraries:**
|
| 330 |
+
- `geopandas`: `gpd.read_file('file.shp')`
|
| 331 |
+
- `fiona`: Lower-level shapefile access
|
| 332 |
+
- `pyshp`: Pure Python shapefile reader
|
| 333 |
+
**EDA Approach:**
|
| 334 |
+
- Geometry type and count
|
| 335 |
+
- Coordinate reference system
|
| 336 |
+
- Bounding box
|
| 337 |
+
- Attribute table analysis
|
| 338 |
+
- Geometry validity
|
| 339 |
+
- Spatial distribution
|
| 340 |
+
- Multi-part features
|
| 341 |
+
- Associated files (.shx, .dbf, .prj)
|
| 342 |
+
|
| 343 |
+
### .geojson - GeoJSON
|
| 344 |
+
**Description:** JSON format for geographic data
|
| 345 |
+
**Typical Data:** Features with geometry and properties
|
| 346 |
+
**Use Cases:** Web mapping, spatial analysis
|
| 347 |
+
**Python Libraries:**
|
| 348 |
+
- `geopandas`: Native GeoJSON support
|
| 349 |
+
- `json`: Parse as JSON then process
|
| 350 |
+
**EDA Approach:**
|
| 351 |
+
- Feature count and types
|
| 352 |
+
- CRS specification
|
| 353 |
+
- Bounding box calculation
|
| 354 |
+
- Property schema
|
| 355 |
+
- Geometry complexity
|
| 356 |
+
- Nesting structure
|
| 357 |
+
|
| 358 |
+
### .tif / .tiff (Geospatial)
|
| 359 |
+
**Description:** GeoTIFF with spatial reference
|
| 360 |
+
**Typical Data:** Satellite imagery, DEMs, rasters
|
| 361 |
+
**Use Cases:** Remote sensing, terrain analysis
|
| 362 |
+
**Python Libraries:**
|
| 363 |
+
- `rasterio`: `rasterio.open('file.tif')`
|
| 364 |
+
- `gdal`: Geospatial Data Abstraction Library
|
| 365 |
+
- `xarray` with `rioxarray`: N-D geospatial arrays
|
| 366 |
+
**EDA Approach:**
|
| 367 |
+
- Raster dimensions and resolution
|
| 368 |
+
- Band count and descriptions
|
| 369 |
+
- Coordinate reference system
|
| 370 |
+
- Geotransform parameters
|
| 371 |
+
- NoData value handling
|
| 372 |
+
- Pixel value distribution
|
| 373 |
+
- Histogram analysis
|
| 374 |
+
- Overviews and pyramids
|
| 375 |
+
|
| 376 |
+
### .nc / .netcdf - Network Common Data Form
|
| 377 |
+
**Description:** Self-describing array-based data
|
| 378 |
+
**Typical Data:** Climate, atmospheric, oceanographic data
|
| 379 |
+
**Use Cases:** Scientific datasets, model output
|
| 380 |
+
**Python Libraries:**
|
| 381 |
+
- `netCDF4`: `netCDF4.Dataset('file.nc')`
|
| 382 |
+
- `xarray`: `xr.open_dataset('file.nc')`
|
| 383 |
+
**EDA Approach:**
|
| 384 |
+
- Variable enumeration
|
| 385 |
+
- Dimension analysis
|
| 386 |
+
- Time series properties
|
| 387 |
+
- Spatial coverage
|
| 388 |
+
- Attribute metadata (CF conventions)
|
| 389 |
+
- Coordinate systems
|
| 390 |
+
- Chunking and compression
|
| 391 |
+
- Data quality flags
|
| 392 |
+
|
| 393 |
+
### .grib / .grib2 - Gridded Binary
|
| 394 |
+
**Description:** Meteorological data format
|
| 395 |
+
**Typical Data:** Weather forecasts, climate data
|
| 396 |
+
**Use Cases:** Numerical weather prediction
|
| 397 |
+
**Python Libraries:**
|
| 398 |
+
- `pygrib`: GRIB file reading
|
| 399 |
+
- `xarray` with `cfgrib`: GRIB to xarray
|
| 400 |
+
**EDA Approach:**
|
| 401 |
+
- Message inventory
|
| 402 |
+
- Parameter and level types
|
| 403 |
+
- Spatial grid specification
|
| 404 |
+
- Temporal coverage
|
| 405 |
+
- Ensemble members
|
| 406 |
+
- Forecast vs analysis
|
| 407 |
+
- Data packing and precision
|
| 408 |
+
|
| 409 |
+
### .hdf4 - HDF4 Format
|
| 410 |
+
**Description:** Older HDF format
|
| 411 |
+
**Typical Data:** NASA Earth Science data
|
| 412 |
+
**Use Cases:** Satellite data (MODIS, etc.)
|
| 413 |
+
**Python Libraries:**
|
| 414 |
+
- `pyhdf`: HDF4 access
|
| 415 |
+
- `gdal`: Can read HDF4
|
| 416 |
+
**EDA Approach:**
|
| 417 |
+
- Scientific dataset listing
|
| 418 |
+
- Vdata and attributes
|
| 419 |
+
- Dimension scales
|
| 420 |
+
- Metadata extraction
|
| 421 |
+
- Quality flags
|
| 422 |
+
- Conversion to HDF5 or NetCDF
|
| 423 |
+
|
| 424 |
+
## Specialized Scientific Formats
|
| 425 |
+
|
| 426 |
+
### .fits - Flexible Image Transport System
|
| 427 |
+
**Description:** Astronomy data format
|
| 428 |
+
**Typical Data:** Images, tables, spectra from telescopes
|
| 429 |
+
**Use Cases:** Astronomical observations
|
| 430 |
+
**Python Libraries:**
|
| 431 |
+
- `astropy.io.fits`: `fits.open('file.fits')`
|
| 432 |
+
- `fitsio`: Alternative FITS library
|
| 433 |
+
**EDA Approach:**
|
| 434 |
+
- HDU (Header Data Unit) structure
|
| 435 |
+
- Image dimensions and WCS
|
| 436 |
+
- Header keyword analysis
|
| 437 |
+
- Table column descriptions
|
| 438 |
+
- Data type and scaling
|
| 439 |
+
- FITS convention compliance
|
| 440 |
+
- Checksum validation
|
| 441 |
+
|
| 442 |
+
### .asdf - Advanced Scientific Data Format
|
| 443 |
+
**Description:** Next-gen data format for astronomy
|
| 444 |
+
**Typical Data:** Complex hierarchical scientific data
|
| 445 |
+
**Use Cases:** James Webb Space Telescope data
|
| 446 |
+
**Python Libraries:**
|
| 447 |
+
- `asdf`: `asdf.open('file.asdf')`
|
| 448 |
+
**EDA Approach:**
|
| 449 |
+
- Tree structure exploration
|
| 450 |
+
- Schema validation
|
| 451 |
+
- Internal vs external arrays
|
| 452 |
+
- Compression methods
|
| 453 |
+
- YAML metadata
|
| 454 |
+
- Version compatibility
|
| 455 |
+
|
| 456 |
+
### .root - ROOT Data Format
|
| 457 |
+
**Description:** CERN ROOT framework format
|
| 458 |
+
**Typical Data:** High-energy physics data
|
| 459 |
+
**Use Cases:** Particle physics experiments
|
| 460 |
+
**Python Libraries:**
|
| 461 |
+
- `uproot`: Pure Python ROOT reading
|
| 462 |
+
- `ROOT`: Official PyROOT bindings
|
| 463 |
+
**EDA Approach:**
|
| 464 |
+
- TTree structure
|
| 465 |
+
- Branch types and entries
|
| 466 |
+
- Histogram inventory
|
| 467 |
+
- Event loop statistics
|
| 468 |
+
- File compression
|
| 469 |
+
- Split level analysis
|
| 470 |
+
|
| 471 |
+
### .txt - Plain Text Data
|
| 472 |
+
**Description:** Generic text-based data
|
| 473 |
+
**Typical Data:** Tab/space-delimited, custom formats
|
| 474 |
+
**Use Cases:** Simple data exchange, logs
|
| 475 |
+
**Python Libraries:**
|
| 476 |
+
- `pandas`: `pd.read_csv()` with custom delimiters
|
| 477 |
+
- `numpy`: `np.loadtxt()`, `np.genfromtxt()`
|
| 478 |
+
- Built-in file reading
|
| 479 |
+
**EDA Approach:**
|
| 480 |
+
- Format detection (delimiter, header)
|
| 481 |
+
- Data type inference
|
| 482 |
+
- Comment line handling
|
| 483 |
+
- Missing value codes
|
| 484 |
+
- Column alignment
|
| 485 |
+
- Encoding detection
|
| 486 |
+
|
| 487 |
+
### .dat - Generic Data File
|
| 488 |
+
**Description:** Binary or text data
|
| 489 |
+
**Typical Data:** Instrument output, custom formats
|
| 490 |
+
**Use Cases:** Various scientific instruments
|
| 491 |
+
**Python Libraries:**
|
| 492 |
+
- Format-specific: requires knowledge of structure
|
| 493 |
+
- `numpy`: `np.fromfile()` for binary
|
| 494 |
+
- `struct`: Parse binary structures
|
| 495 |
+
**EDA Approach:**
|
| 496 |
+
- Binary vs text determination
|
| 497 |
+
- Header detection
|
| 498 |
+
- Record structure inference
|
| 499 |
+
- Endianness
|
| 500 |
+
- Data type patterns
|
| 501 |
+
- Validation with documentation
|
| 502 |
+
|
| 503 |
+
### .log - Log Files
|
| 504 |
+
**Description:** Text logs from software/instruments
|
| 505 |
+
**Typical Data:** Timestamped events, messages
|
| 506 |
+
**Use Cases:** Troubleshooting, experiment tracking
|
| 507 |
+
**Python Libraries:**
|
| 508 |
+
- Built-in file reading
|
| 509 |
+
- `pandas`: Structured log parsing
|
| 510 |
+
- Regular expressions for parsing
|
| 511 |
+
**EDA Approach:**
|
| 512 |
+
- Log level distribution
|
| 513 |
+
- Timestamp parsing
|
| 514 |
+
- Error and warning frequency
|
| 515 |
+
- Event sequencing
|
| 516 |
+
- Pattern recognition
|
| 517 |
+
- Anomaly detection
|
| 518 |
+
- Session boundaries
|
.scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Microscopy and Imaging File Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers file formats used in microscopy, medical imaging, digital pathology (whole-slide imaging), and scientific image analysis.
|
| 4 |
+
|
| 5 |
+
## Microscopy-Specific Formats
|
| 6 |
+
|
| 7 |
+
### .tif / .tiff - Tagged Image File Format
|
| 8 |
+
**Description:** Flexible image format supporting multiple pages and metadata
|
| 9 |
+
**Typical Data:** Microscopy images, z-stacks, time series, multi-channel
|
| 10 |
+
**Use Cases:** Fluorescence microscopy, confocal imaging, biological imaging
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `tifffile`: `tifffile.imread('file.tif')` - Microscopy TIFF support
|
| 13 |
+
- `PIL/Pillow`: `Image.open('file.tif')` - Basic TIFF
|
| 14 |
+
- `scikit-image`: `io.imread('file.tif')`
|
| 15 |
+
- `AICSImageIO`: Multi-format microscopy reader
|
| 16 |
+
**EDA Approach:**
|
| 17 |
+
- Image dimensions and bit depth
|
| 18 |
+
- Multi-page/z-stack analysis
|
| 19 |
+
- Metadata extraction (OME-TIFF)
|
| 20 |
+
- Channel analysis and intensity distributions
|
| 21 |
+
- Temporal dynamics (time-lapse)
|
| 22 |
+
- Pixel size and spatial calibration
|
| 23 |
+
- Histogram analysis per channel
|
| 24 |
+
- Dynamic range utilization
|
| 25 |
+
|
| 26 |
+
### .nd2 - Nikon NIS-Elements
|
| 27 |
+
**Description:** Proprietary Nikon microscope format
|
| 28 |
+
**Typical Data:** Multi-dimensional microscopy (XYZCT)
|
| 29 |
+
**Use Cases:** Nikon microscope data, confocal, widefield
|
| 30 |
+
**Python Libraries:**
|
| 31 |
+
- `nd2reader`: `ND2Reader('file.nd2')`
|
| 32 |
+
- `pims`: `pims.ND2_Reader('file.nd2')`
|
| 33 |
+
- `AICSImageIO`: Universal reader
|
| 34 |
+
**EDA Approach:**
|
| 35 |
+
- Experiment metadata extraction
|
| 36 |
+
- Channel configurations
|
| 37 |
+
- Time-lapse frame analysis
|
| 38 |
+
- Z-stack depth and spacing
|
| 39 |
+
- XY stage positions
|
| 40 |
+
- Laser settings and power
|
| 41 |
+
- Pixel binning information
|
| 42 |
+
- Acquisition timestamps
|
| 43 |
+
|
| 44 |
+
### .lif - Leica Image Format
|
| 45 |
+
**Description:** Leica microscope proprietary format
|
| 46 |
+
**Typical Data:** Multi-experiment, multi-dimensional images
|
| 47 |
+
**Use Cases:** Leica confocal and widefield data
|
| 48 |
+
**Python Libraries:**
|
| 49 |
+
- `readlif`: `readlif.LifFile('file.lif')`
|
| 50 |
+
- `AICSImageIO`: LIF support
|
| 51 |
+
- `python-bioformats`: Via Bio-Formats
|
| 52 |
+
**EDA Approach:**
|
| 53 |
+
- Multiple experiment detection
|
| 54 |
+
- Image series enumeration
|
| 55 |
+
- Metadata per experiment
|
| 56 |
+
- Channel and timepoint structure
|
| 57 |
+
- Physical dimensions extraction
|
| 58 |
+
- Objective and detector information
|
| 59 |
+
- Scan settings analysis
|
| 60 |
+
|
| 61 |
+
### .czi - Carl Zeiss Image
|
| 62 |
+
**Description:** Zeiss microscope format
|
| 63 |
+
**Typical Data:** Multi-dimensional microscopy with rich metadata
|
| 64 |
+
**Use Cases:** Zeiss confocal, lightsheet, widefield
|
| 65 |
+
**Python Libraries:**
|
| 66 |
+
- `czifile`: `czifile.CziFile('file.czi')`
|
| 67 |
+
- `AICSImageIO`: CZI support
|
| 68 |
+
- `pylibCZIrw`: Official Zeiss library
|
| 69 |
+
**EDA Approach:**
|
| 70 |
+
- Scene and position analysis
|
| 71 |
+
- Mosaic tile structure
|
| 72 |
+
- Channel wavelength information
|
| 73 |
+
- Acquisition mode detection
|
| 74 |
+
- Scaling and calibration
|
| 75 |
+
- Instrument configuration
|
| 76 |
+
- ROI definitions
|
| 77 |
+
|
| 78 |
+
### .oib / .oif - Olympus Image Format
|
| 79 |
+
**Description:** Olympus microscope formats
|
| 80 |
+
**Typical Data:** Confocal and multiphoton imaging
|
| 81 |
+
**Use Cases:** Olympus FluoView data
|
| 82 |
+
**Python Libraries:**
|
| 83 |
+
- `AICSImageIO`: OIB/OIF support
|
| 84 |
+
- `python-bioformats`: Via Bio-Formats
|
| 85 |
+
**EDA Approach:**
|
| 86 |
+
- Directory structure validation (OIF)
|
| 87 |
+
- Metadata file parsing
|
| 88 |
+
- Channel configuration
|
| 89 |
+
- Scan parameters
|
| 90 |
+
- Objective and filter information
|
| 91 |
+
- PMT settings
|
| 92 |
+
|
| 93 |
+
### .vsi - Olympus VSI
|
| 94 |
+
**Description:** Olympus slide scanner format
|
| 95 |
+
**Typical Data:** Whole slide imaging, large mosaics
|
| 96 |
+
**Use Cases:** Virtual microscopy, pathology
|
| 97 |
+
**Python Libraries:**
|
| 98 |
+
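- `openslide-python`: `openslide.OpenSlide('file.vsi')` (note: Olympus VSI is not in OpenSlide's documented format list; verify support, or fall back to `python-bioformats`)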
|
| 99 |
+
- `AICSImageIO`: VSI support
|
| 100 |
+
**EDA Approach:**
|
| 101 |
+
- Pyramid level analysis
|
| 102 |
+
- Tile structure and overlap
|
| 103 |
+
- Macro and label images
|
| 104 |
+
- Magnification levels
|
| 105 |
+
- Whole slide statistics
|
| 106 |
+
- Region detection
|
| 107 |
+
|
| 108 |
+
### .ims - Imaris Format
|
| 109 |
+
**Description:** Bitplane Imaris HDF5-based format
|
| 110 |
+
**Typical Data:** Large 3D/4D microscopy datasets
|
| 111 |
+
**Use Cases:** 3D rendering, time-lapse analysis
|
| 112 |
+
**Python Libraries:**
|
| 113 |
+
- `h5py`: Direct HDF5 access
|
| 114 |
+
- `imaris_ims_file_reader`: Specialized reader
|
| 115 |
+
**EDA Approach:**
|
| 116 |
+
- Resolution level analysis
|
| 117 |
+
- Time point structure
|
| 118 |
+
- Channel organization
|
| 119 |
+
- Dataset hierarchy
|
| 120 |
+
- Thumbnail generation
|
| 121 |
+
- Memory-mapped access strategies
|
| 122 |
+
- Chunking optimization
|
| 123 |
+
|
| 124 |
+
### .lsm - Zeiss LSM
|
| 125 |
+
**Description:** Legacy Zeiss confocal format
|
| 126 |
+
**Typical Data:** Confocal laser scanning microscopy
|
| 127 |
+
**Use Cases:** Older Zeiss confocal data
|
| 128 |
+
**Python Libraries:**
|
| 129 |
+
- `tifffile`: LSM support (TIFF-based)
|
| 130 |
+
- `python-bioformats`: LSM reading
|
| 131 |
+
**EDA Approach:**
|
| 132 |
+
- Similar to TIFF with LSM-specific metadata
|
| 133 |
+
- Scan speed and resolution
|
| 134 |
+
- Laser lines and power
|
| 135 |
+
- Detector gain and offset
|
| 136 |
+
- LUT information
|
| 137 |
+
|
| 138 |
+
### .stk - MetaMorph Stack
|
| 139 |
+
**Description:** MetaMorph image stack format
|
| 140 |
+
**Typical Data:** Time-lapse or z-stack sequences
|
| 141 |
+
**Use Cases:** MetaMorph software output
|
| 142 |
+
**Python Libraries:**
|
| 143 |
+
- `tifffile`: STK is TIFF-based
|
| 144 |
+
- `python-bioformats`: STK support
|
| 145 |
+
**EDA Approach:**
|
| 146 |
+
- Stack dimensionality
|
| 147 |
+
- Plane metadata
|
| 148 |
+
- Timing information
|
| 149 |
+
- Stage positions
|
| 150 |
+
- UIC tags parsing
|
| 151 |
+
|
| 152 |
+
### .dv - DeltaVision
|
| 153 |
+
**Description:** Applied Precision DeltaVision format
|
| 154 |
+
**Typical Data:** Deconvolution microscopy
|
| 155 |
+
**Use Cases:** DeltaVision microscope data
|
| 156 |
+
**Python Libraries:**
|
| 157 |
+
- `mrc`: Can read DV (MRC-related)
|
| 158 |
+
- `AICSImageIO`: DV support
|
| 159 |
+
**EDA Approach:**
|
| 160 |
+
- Wave information (channels)
|
| 161 |
+
- Extended header analysis
|
| 162 |
+
- Lens and magnification
|
| 163 |
+
- Deconvolution status
|
| 164 |
+
- Time stamps per section
|
| 165 |
+
|
| 166 |
+
### .mrc - Medical Research Council
|
| 167 |
+
**Description:** Electron microscopy format
|
| 168 |
+
**Typical Data:** EM images, cryo-EM, tomography
|
| 169 |
+
**Use Cases:** Structural biology, electron microscopy
|
| 170 |
+
**Python Libraries:**
|
| 171 |
+
- `mrcfile`: `mrcfile.open('file.mrc')`
|
| 172 |
+
- `EMAN2`: EM-specific tools
|
| 173 |
+
**EDA Approach:**
|
| 174 |
+
- Volume dimensions
|
| 175 |
+
- Voxel size and units
|
| 176 |
+
- Origin and map statistics
|
| 177 |
+
- Symmetry information
|
| 178 |
+
- Extended header analysis
|
| 179 |
+
- Density statistics
|
| 180 |
+
- Header consistency validation
|
| 181 |
+
|
| 182 |
+
### .dm3 / .dm4 - Gatan Digital Micrograph
|
| 183 |
+
**Description:** Gatan TEM/STEM format
|
| 184 |
+
**Typical Data:** Transmission electron microscopy
|
| 185 |
+
**Use Cases:** TEM imaging and analysis
|
| 186 |
+
**Python Libraries:**
|
| 187 |
+
- `hyperspy`: `hs.load('file.dm3')`
|
| 188 |
+
- `ncempy`: `ncempy.io.dm.dmReader('file.dm3')`
|
| 189 |
+
**EDA Approach:**
|
| 190 |
+
- Microscope parameters
|
| 191 |
+
- Energy dispersive spectroscopy data
|
| 192 |
+
- Diffraction patterns
|
| 193 |
+
- Calibration information
|
| 194 |
+
- Tag structure analysis
|
| 195 |
+
- Image series handling
|
| 196 |
+
|
| 197 |
+
### .eer - Electron Event Representation
|
| 198 |
+
**Description:** Direct electron detector format
|
| 199 |
+
**Typical Data:** Electron counting data from detectors
|
| 200 |
+
**Use Cases:** Cryo-EM data collection
|
| 201 |
+
**Python Libraries:**
|
| 202 |
+
- `mrcfile`: Some EER support
|
| 203 |
+
- Vendor-specific tools (Gatan, TFS)
|
| 204 |
+
**EDA Approach:**
|
| 205 |
+
- Event counting statistics
|
| 206 |
+
- Frame rate and dose
|
| 207 |
+
- Detector configuration
|
| 208 |
+
- Motion correction assessment
|
| 209 |
+
- Gain reference validation
|
| 210 |
+
|
| 211 |
+
### .ser - TIA Series
|
| 212 |
+
**Description:** FEI/TFS TIA format
|
| 213 |
+
**Typical Data:** EM image series
|
| 214 |
+
**Use Cases:** FEI/Thermo Fisher EM data
|
| 215 |
+
**Python Libraries:**
|
| 216 |
+
- `hyperspy`: SER support
|
| 217 |
+
- `ncempy`: TIA reader
|
| 218 |
+
**EDA Approach:**
|
| 219 |
+
- Series structure
|
| 220 |
+
- Calibration data
|
| 221 |
+
- Acquisition metadata
|
| 222 |
+
- Time stamps
|
| 223 |
+
- Multi-dimensional data organization
|
| 224 |
+
|
| 225 |
+
## Medical and Biological Imaging
|
| 226 |
+
|
| 227 |
+
### .dcm - DICOM
|
| 228 |
+
**Description:** Digital Imaging and Communications in Medicine
|
| 229 |
+
**Typical Data:** Medical images with patient/study metadata
|
| 230 |
+
**Use Cases:** Clinical imaging, radiology, CT, MRI, PET
|
| 231 |
+
**Python Libraries:**
|
| 232 |
+
- `pydicom`: `pydicom.dcmread('file.dcm')`
|
| 233 |
+
- `SimpleITK`: `sitk.ReadImage('file.dcm')`
|
| 234 |
+
- `nibabel`: Limited DICOM support
|
| 235 |
+
**EDA Approach:**
|
| 236 |
+
- Patient metadata extraction (anonymization check)
|
| 237 |
+
- Modality-specific analysis
|
| 238 |
+
- Series and study organization
|
| 239 |
+
- Slice thickness and spacing
|
| 240 |
+
- Window/level settings
|
| 241 |
+
- Hounsfield units (CT)
|
| 242 |
+
- Image orientation and position
|
| 243 |
+
- Multi-frame analysis
|
| 244 |
+
|
| 245 |
+
### .nii / .nii.gz - NIfTI
|
| 246 |
+
**Description:** Neuroimaging Informatics Technology Initiative
|
| 247 |
+
**Typical Data:** Brain imaging, fMRI, structural MRI
|
| 248 |
+
**Use Cases:** Neuroimaging research, brain analysis
|
| 249 |
+
**Python Libraries:**
|
| 250 |
+
- `nibabel`: `nibabel.load('file.nii')`
|
| 251 |
+
- `nilearn`: Neuroimaging with ML
|
| 252 |
+
- `SimpleITK`: NIfTI support
|
| 253 |
+
**EDA Approach:**
|
| 254 |
+
- Volume dimensions and voxel size
|
| 255 |
+
- Affine transformation matrix
|
| 256 |
+
- Time series analysis (fMRI)
|
| 257 |
+
- Intensity distribution
|
| 258 |
+
- Brain extraction quality
|
| 259 |
+
- Registration assessment
|
| 260 |
+
- Orientation validation
|
| 261 |
+
- Header information consistency
|
| 262 |
+
|
| 263 |
+
### .mnc - MINC Format
|
| 264 |
+
**Description:** Medical Imaging NetCDF (MINC1 is NetCDF-based; MINC2 is HDF5-based)
|
| 265 |
+
**Typical Data:** Medical imaging (predecessor to NIfTI)
|
| 266 |
+
**Use Cases:** Legacy neuroimaging data
|
| 267 |
+
**Python Libraries:**
|
| 268 |
+
- `pyminc`: MINC-specific tools
|
| 269 |
+
- `nibabel`: MINC support
|
| 270 |
+
**EDA Approach:**
|
| 271 |
+
- Similar to NIfTI
|
| 272 |
+
- NetCDF structure exploration
|
| 273 |
+
- Dimension ordering
|
| 274 |
+
- Metadata extraction
|
| 275 |
+
|
| 276 |
+
### .nrrd - Nearly Raw Raster Data
|
| 277 |
+
**Description:** Medical imaging format with an attached (`.nrrd`) or detached (`.nhdr`) header
|
| 278 |
+
**Typical Data:** Medical images, research imaging
|
| 279 |
+
**Use Cases:** 3D Slicer, ITK-based applications
|
| 280 |
+
**Python Libraries:**
|
| 281 |
+
- `pynrrd`: `nrrd.read('file.nrrd')`
|
| 282 |
+
- `SimpleITK`: NRRD support
|
| 283 |
+
**EDA Approach:**
|
| 284 |
+
- Header field analysis
|
| 285 |
+
- Encoding format
|
| 286 |
+
- Dimension and spacing
|
| 287 |
+
- Orientation matrix
|
| 288 |
+
- Compression assessment
|
| 289 |
+
- Endianness handling
|
| 290 |
+
|
| 291 |
+
### .mha / .mhd - MetaImage
|
| 292 |
+
**Description:** MetaImage format (ITK)
|
| 293 |
+
**Typical Data:** Medical/scientific 3D images
|
| 294 |
+
**Use Cases:** ITK/SimpleITK applications
|
| 295 |
+
**Python Libraries:**
|
| 296 |
+
- `SimpleITK`: Native MHA/MHD support
|
| 297 |
+
- `itk`: Direct ITK integration
|
| 298 |
+
**EDA Approach:**
|
| 299 |
+
- Header-data file pairing (MHD)
|
| 300 |
+
- Transform matrix
|
| 301 |
+
- Element spacing
|
| 302 |
+
- Compression format
|
| 303 |
+
- Data type and dimensions
|
| 304 |
+
|
| 305 |
+
### .hdr / .img - Analyze Format
|
| 306 |
+
**Description:** Legacy medical imaging format
|
| 307 |
+
**Typical Data:** Brain imaging (pre-NIfTI)
|
| 308 |
+
**Use Cases:** Old neuroimaging datasets
|
| 309 |
+
**Python Libraries:**
|
| 310 |
+
- `nibabel`: Analyze support
|
| 311 |
+
- Conversion to NIfTI recommended
|
| 312 |
+
**EDA Approach:**
|
| 313 |
+
- Header-image pairing validation
|
| 314 |
+
- Byte order issues
|
| 315 |
+
- Conversion to modern formats
|
| 316 |
+
- Metadata limitations
|
| 317 |
+
|
| 318 |
+
## Scientific Image Formats
|
| 319 |
+
|
| 320 |
+
### .png - Portable Network Graphics
|
| 321 |
+
**Description:** Lossless compressed image format
|
| 322 |
+
**Typical Data:** 2D images, screenshots, processed data
|
| 323 |
+
**Use Cases:** Publication figures, lossless storage
|
| 324 |
+
**Python Libraries:**
|
| 325 |
+
- `PIL/Pillow`: `Image.open('file.png')`
|
| 326 |
+
- `scikit-image`: `io.imread('file.png')`
|
| 327 |
+
- `imageio`: `imageio.imread('file.png')`
|
| 328 |
+
**EDA Approach:**
|
| 329 |
+
- Bit depth analysis (8-bit, 16-bit)
|
| 330 |
+
- Color mode (grayscale, RGB, palette)
|
| 331 |
+
- Metadata (PNG chunks)
|
| 332 |
+
- Transparency handling
|
| 333 |
+
- Compression efficiency
|
| 334 |
+
- Histogram analysis
|
| 335 |
+
|
| 336 |
+
### .jpg / .jpeg - Joint Photographic Experts Group
|
| 337 |
+
**Description:** Lossy compressed image format
|
| 338 |
+
**Typical Data:** Natural images, photos
|
| 339 |
+
**Use Cases:** Visualization, web graphics (not raw data)
|
| 340 |
+
**Python Libraries:**
|
| 341 |
+
- `PIL/Pillow`: Standard JPEG support
|
| 342 |
+
- `scikit-image`: JPEG reading
|
| 343 |
+
**EDA Approach:**
|
| 344 |
+
- Compression artifacts detection
|
| 345 |
+
- Quality factor estimation
|
| 346 |
+
- Color space (RGB, grayscale)
|
| 347 |
+
- EXIF metadata
|
| 348 |
+
- Quantization table analysis
|
| 349 |
+
- Note: Not suitable for quantitative analysis
|
| 350 |
+
|
| 351 |
+
### .bmp - Bitmap Image
|
| 352 |
+
**Description:** Uncompressed raster image
|
| 353 |
+
**Typical Data:** Simple images, screenshots
|
| 354 |
+
**Use Cases:** Compatibility, simple storage
|
| 355 |
+
**Python Libraries:**
|
| 356 |
+
- `PIL/Pillow`: BMP support
|
| 357 |
+
- `scikit-image`: BMP reading
|
| 358 |
+
**EDA Approach:**
|
| 359 |
+
- Color depth
|
| 360 |
+
- Palette analysis (if indexed)
|
| 361 |
+
- File size efficiency
|
| 362 |
+
- Pixel format validation
|
| 363 |
+
|
| 364 |
+
### .gif - Graphics Interchange Format
|
| 365 |
+
**Description:** Image format with animation support
|
| 366 |
+
**Typical Data:** Animated images, simple graphics
|
| 367 |
+
**Use Cases:** Animations, time-lapse visualization
|
| 368 |
+
**Python Libraries:**
|
| 369 |
+
- `PIL/Pillow`: GIF support
|
| 370 |
+
- `imageio`: Better GIF animation support
|
| 371 |
+
**EDA Approach:**
|
| 372 |
+
- Frame count and timing
|
| 373 |
+
- Palette limitations (256 colors)
|
| 374 |
+
- Loop count
|
| 375 |
+
- Disposal method
|
| 376 |
+
- Transparency handling
|
| 377 |
+
|
| 378 |
+
### .svg - Scalable Vector Graphics
|
| 379 |
+
**Description:** XML-based vector graphics
|
| 380 |
+
**Typical Data:** Vector drawings, plots, diagrams
|
| 381 |
+
**Use Cases:** Publication-quality figures, plots
|
| 382 |
+
**Python Libraries:**
|
| 383 |
+
- `svgpathtools`: Path manipulation
|
| 384 |
+
- `cairosvg`: Rasterization
|
| 385 |
+
- `lxml`: XML parsing
|
| 386 |
+
**EDA Approach:**
|
| 387 |
+
- Element structure analysis
|
| 388 |
+
- Style information
|
| 389 |
+
- `viewBox` and dimensions
|
| 390 |
+
- Path complexity
|
| 391 |
+
- Text element extraction
|
| 392 |
+
- Layer organization
|
| 393 |
+
|
| 394 |
+
### .eps - Encapsulated PostScript
|
| 395 |
+
**Description:** Vector graphics format
|
| 396 |
+
**Typical Data:** Publication figures
|
| 397 |
+
**Use Cases:** Legacy publication graphics
|
| 398 |
+
**Python Libraries:**
|
| 399 |
+
- `PIL/Pillow`: Basic EPS rasterization
|
| 400 |
+
- `ghostscript` via subprocess
|
| 401 |
+
**EDA Approach:**
|
| 402 |
+
- Bounding box information
|
| 403 |
+
- Preview image validation
|
| 404 |
+
- Font embedding
|
| 405 |
+
- Conversion to modern formats
|
| 406 |
+
|
| 407 |
+
### .pdf (Images)
|
| 408 |
+
**Description:** Portable Document Format with images
|
| 409 |
+
**Typical Data:** Publication figures, multi-page documents
|
| 410 |
+
**Use Cases:** Publication, data presentation
|
| 411 |
+
**Python Libraries:**
|
| 412 |
+
- `PyMuPDF/fitz`: `fitz.open('file.pdf')`
|
| 413 |
+
- `pdf2image`: Rasterization
|
| 414 |
+
- `pdfplumber`: Text and layout extraction
|
| 415 |
+
**EDA Approach:**
|
| 416 |
+
- Page count
|
| 417 |
+
- Image extraction
|
| 418 |
+
- Resolution and DPI
|
| 419 |
+
- Embedded fonts and metadata
|
| 420 |
+
- Compression methods
|
| 421 |
+
- Image vs vector content
|
| 422 |
+
|
| 423 |
+
### .fig - MATLAB Figure
|
| 424 |
+
**Description:** MATLAB figure file
|
| 425 |
+
**Typical Data:** MATLAB plots and figures
|
| 426 |
+
**Use Cases:** MATLAB data visualization
|
| 427 |
+
**Python Libraries:**
|
| 428 |
+
- Custom parsers (MAT file structure)
|
| 429 |
+
- Conversion to other formats
|
| 430 |
+
**EDA Approach:**
|
| 431 |
+
- Figure structure
|
| 432 |
+
- Data extraction from plots
|
| 433 |
+
- Axes and label information
|
| 434 |
+
- Plot type identification
|
| 435 |
+
|
| 436 |
+
### .hdf5 (Imaging Specific)
|
| 437 |
+
**Description:** HDF5 for large imaging datasets
|
| 438 |
+
**Typical Data:** High-content screening, large microscopy
|
| 439 |
+
**Use Cases:** BigDataViewer, large-scale imaging
|
| 440 |
+
**Python Libraries:**
|
| 441 |
+
- `h5py`: Universal HDF5 access
|
| 442 |
+
- Imaging-specific readers (BigDataViewer)
|
| 443 |
+
**EDA Approach:**
|
| 444 |
+
- Dataset hierarchy
|
| 445 |
+
- Chunk and compression strategy
|
| 446 |
+
- Multi-resolution pyramid
|
| 447 |
+
- Metadata organization
|
| 448 |
+
- Memory-mapped access
|
| 449 |
+
- Parallel I/O performance
|
| 450 |
+
|
| 451 |
+
### .zarr - Chunked Array Storage
|
| 452 |
+
**Description:** Cloud-optimized array storage
|
| 453 |
+
**Typical Data:** Large imaging datasets, OME-ZARR
|
| 454 |
+
**Use Cases:** Cloud microscopy, large-scale analysis
|
| 455 |
+
**Python Libraries:**
|
| 456 |
+
- `zarr`: `zarr.open('file.zarr')`
|
| 457 |
+
- `ome-zarr-py`: OME-ZARR support
|
| 458 |
+
**EDA Approach:**
|
| 459 |
+
- Chunk size optimization
|
| 460 |
+
- Compression codec analysis
|
| 461 |
+
- Multi-scale representation
|
| 462 |
+
- Array dimensions and dtype
|
| 463 |
+
- Metadata structure (OME)
|
| 464 |
+
- Cloud access patterns
|
| 465 |
+
|
| 466 |
+
### .raw - Raw Image Data
|
| 467 |
+
**Description:** Unformatted binary pixel data
|
| 468 |
+
**Typical Data:** Raw detector output
|
| 469 |
+
**Use Cases:** Custom imaging systems
|
| 470 |
+
**Python Libraries:**
|
| 471 |
+
- `numpy`: `np.fromfile()` with dtype
|
| 472 |
+
- `imageio`: Raw format plugins
|
| 473 |
+
**EDA Approach:**
|
| 474 |
+
- Dimensions determination (external info needed)
|
| 475 |
+
- Byte order and data type
|
| 476 |
+
- Header presence detection
|
| 477 |
+
- Pixel value range
|
| 478 |
+
- Noise characteristics
|
| 479 |
+
|
| 480 |
+
### .bin - Binary Image Data
|
| 481 |
+
**Description:** Generic binary image format
|
| 482 |
+
**Typical Data:** Raw or custom-formatted images
|
| 483 |
+
**Use Cases:** Instrument-specific outputs
|
| 484 |
+
**Python Libraries:**
|
| 485 |
+
- `numpy`: Custom binary reading
|
| 486 |
+
- `struct`: For structured binary data
|
| 487 |
+
**EDA Approach:**
|
| 488 |
+
- Format specification required
|
| 489 |
+
- Header parsing (if present)
|
| 490 |
+
- Data type inference
|
| 491 |
+
- Dimension extraction
|
| 492 |
+
- Validation with known parameters
|
| 493 |
+
|
| 494 |
+
## Image Analysis Formats
|
| 495 |
+
|
| 496 |
+
### .roi - ImageJ ROI
|
| 497 |
+
**Description:** ImageJ region of interest format
|
| 498 |
+
**Typical Data:** Geometric ROIs, selections
|
| 499 |
+
**Use Cases:** ImageJ/Fiji analysis workflows
|
| 500 |
+
**Python Libraries:**
|
| 501 |
+
- `read-roi`: `read_roi.read_roi_file('file.roi')`
|
| 502 |
+
- `roifile`: ROI manipulation
|
| 503 |
+
**EDA Approach:**
|
| 504 |
+
- ROI type analysis (rectangle, polygon, etc.)
|
| 505 |
+
- Coordinate extraction
|
| 506 |
+
- ROI properties (area, perimeter)
|
| 507 |
+
- Group analysis (ROI sets)
|
| 508 |
+
- Z-position and time information
|
| 509 |
+
|
| 510 |
+
### .zip (ROI sets)
|
| 511 |
+
**Description:** ZIP archive of ImageJ ROIs
|
| 512 |
+
**Typical Data:** Multiple ROI files
|
| 513 |
+
**Use Cases:** Batch ROI analysis
|
| 514 |
+
**Python Libraries:**
|
| 515 |
+
- `read-roi`: `read_roi.read_roi_zip('file.zip')`
|
| 516 |
+
- Standard `zipfile` module
|
| 517 |
+
**EDA Approach:**
|
| 518 |
+
- ROI count in set
|
| 519 |
+
- ROI type distribution
|
| 520 |
+
- Spatial distribution
|
| 521 |
+
- Overlapping ROI detection
|
| 522 |
+
- Naming conventions
|
| 523 |
+
|
| 524 |
+
### .ome.tif / .ome.tiff - OME-TIFF
|
| 525 |
+
**Description:** TIFF with OME-XML metadata
|
| 526 |
+
**Typical Data:** Standardized microscopy with rich metadata
|
| 527 |
+
**Use Cases:** Bio-Formats compatible storage
|
| 528 |
+
**Python Libraries:**
|
| 529 |
+
- `tifffile`: OME-TIFF support
|
| 530 |
+
- `AICSImageIO`: OME reading
|
| 531 |
+
- `python-bioformats`: Bio-Formats integration
|
| 532 |
+
**EDA Approach:**
|
| 533 |
+
- OME-XML validation
|
| 534 |
+
- Physical dimensions extraction
|
| 535 |
+
- Channel naming and wavelengths
|
| 536 |
+
- Plane positions (Z, C, T)
|
| 537 |
+
- Instrument metadata
|
| 538 |
+
- Bio-Formats compatibility
|
| 539 |
+
|
| 540 |
+
### .ome.zarr - OME-ZARR
|
| 541 |
+
**Description:** OME-NGFF specification on ZARR
|
| 542 |
+
**Typical Data:** Multiscale, chunked bioimaging arrays stored in the next-generation file format (NGFF)
|
| 543 |
+
**Use Cases:** Cloud-native imaging, large datasets
|
| 544 |
+
**Python Libraries:**
|
| 545 |
+
- `ome-zarr-py`: Official implementation
|
| 546 |
+
- `zarr`: Underlying array storage
|
| 547 |
+
**EDA Approach:**
|
| 548 |
+
- Multiscale resolution levels
|
| 549 |
+
- Metadata compliance with OME-NGFF spec
|
| 550 |
+
- Coordinate transformations
|
| 551 |
+
- Label and ROI handling
|
| 552 |
+
- Cloud storage optimization
|
| 553 |
+
- Chunk access patterns
|
| 554 |
+
|
| 555 |
+
### .klb - Keller Lab Block
|
| 556 |
+
**Description:** Fast microscopy format for large data
|
| 557 |
+
**Typical Data:** Lightsheet microscopy, time-lapse
|
| 558 |
+
**Use Cases:** High-throughput imaging
|
| 559 |
+
**Python Libraries:**
|
| 560 |
+
- `pyklb`: KLB reading and writing
|
| 561 |
+
**EDA Approach:**
|
| 562 |
+
- Compression efficiency
|
| 563 |
+
- Block structure
|
| 564 |
+
- Multi-resolution support
|
| 565 |
+
- Read performance benchmarking
|
| 566 |
+
- Metadata extraction
|
| 567 |
+
|
| 568 |
+
### .vsi - Whole Slide Imaging
|
| 569 |
+
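**Description:** Virtual slide format (multiple vendors); general whole-slide imaging guidance that complements the Olympus-specific `.vsi` entry earlier in this reference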
|
| 570 |
+
**Typical Data:** Pathology slides, large mosaics
|
| 571 |
+
**Use Cases:** Digital pathology
|
| 572 |
+
**Python Libraries:**
|
| 573 |
+
- `openslide-python`: Multi-format WSI
|
| 574 |
+
- `tiffslide`: Pure Python alternative
|
| 575 |
+
**EDA Approach:**
|
| 576 |
+
- Pyramid level count
|
| 577 |
+
- Downsampling factors
|
| 578 |
+
- Associated images (macro, label)
|
| 579 |
+
- Tile size and overlap
|
| 580 |
+
- MPP (microns per pixel)
|
| 581 |
+
- Background detection
|
| 582 |
+
- Tissue segmentation
|
| 583 |
+
|
| 584 |
+
### .ndpi - Hamamatsu NanoZoomer
|
| 585 |
+
**Description:** Hamamatsu slide scanner format
|
| 586 |
+
**Typical Data:** Whole slide pathology images
|
| 587 |
+
**Use Cases:** Digital pathology workflows
|
| 588 |
+
**Python Libraries:**
|
| 589 |
+
- `openslide-python`: NDPI support
|
| 590 |
+
**EDA Approach:**
|
| 591 |
+
- Multi-resolution pyramid
|
| 592 |
+
- Lens and objective information
|
| 593 |
+
- Scan area and magnification
|
| 594 |
+
- Focal plane information
|
| 595 |
+
- Tissue detection
|
| 596 |
+
|
| 597 |
+
### .svs - Aperio ScanScope
|
| 598 |
+
**Description:** Aperio whole slide format
|
| 599 |
+
**Typical Data:** Digital pathology slides
|
| 600 |
+
**Use Cases:** Pathology image analysis
|
| 601 |
+
**Python Libraries:**
|
| 602 |
+
- `openslide-python`: SVS support
|
| 603 |
+
**EDA Approach:**
|
| 604 |
+
- Pyramid structure
|
| 605 |
+
- MPP calibration
|
| 606 |
+
- Label and macro images
|
| 607 |
+
- Compression quality
|
| 608 |
+
- Thumbnail generation
|
| 609 |
+
|
| 610 |
+
### .scn - Leica SCN
|
| 611 |
+
**Description:** Leica slide scanner format
|
| 612 |
+
**Typical Data:** Whole slide imaging
|
| 613 |
+
**Use Cases:** Digital pathology
|
| 614 |
+
**Python Libraries:**
|
| 615 |
+
- `openslide-python`: SCN support
|
| 616 |
+
**EDA Approach:**
|
| 617 |
+
- Tile structure analysis
|
| 618 |
+
- Collection organization
|
| 619 |
+
- Metadata extraction
|
| 620 |
+
- Magnification levels
|
.scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Proteomics and Metabolomics File Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers file formats specific to proteomics, metabolomics, lipidomics, and related omics workflows.
|
| 4 |
+
|
| 5 |
+
## Mass Spectrometry-Based Proteomics
|
| 6 |
+
|
| 7 |
+
### .mzML - Mass Spectrometry Markup Language
|
| 8 |
+
**Description:** Standard XML format for MS data
|
| 9 |
+
**Typical Data:** MS1 and MS2 spectra, retention times, intensities
|
| 10 |
+
**Use Cases:** Proteomics, metabolomics pipelines
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `pymzml`: `pymzml.run.Reader('file.mzML')`
|
| 13 |
+
- `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
|
| 14 |
+
- `pyopenms`: OpenMS Python bindings
|
| 15 |
+
**EDA Approach:**
|
| 16 |
+
- Scan count and MS level distribution
|
| 17 |
+
- Total ion chromatogram (TIC) analysis
|
| 18 |
+
- Base peak chromatogram (BPC)
|
| 19 |
+
- m/z coverage and resolution
|
| 20 |
+
- Retention time range
|
| 21 |
+
- Precursor selection patterns
|
| 22 |
+
- Data completeness
|
| 23 |
+
- Quality control metrics (lock mass, standards)
|
| 24 |
+
|
| 25 |
+
### .mzXML - Legacy MS XML Format
|
| 26 |
+
**Description:** Older XML-based MS format
|
| 27 |
+
**Typical Data:** Mass spectra with metadata
|
| 28 |
+
**Use Cases:** Legacy proteomics data
|
| 29 |
+
**Python Libraries:**
|
| 30 |
+
- `pyteomics.mzxml`
|
| 31 |
+
- `pymzml`: Can read mzXML
|
| 32 |
+
**EDA Approach:**
|
| 33 |
+
- Similar to mzML
|
| 34 |
+
- Format version compatibility
|
| 35 |
+
- Conversion quality validation
|
| 36 |
+
- Metadata preservation check
|
| 37 |
+
|
| 38 |
+
### .mzIdentML - Peptide Identification Format
|
| 39 |
+
**Description:** PSI standard for peptide identifications
|
| 40 |
+
**Typical Data:** Peptide-spectrum matches, proteins, scores
|
| 41 |
+
**Use Cases:** Search engine results, proteomics workflows
|
| 42 |
+
**Python Libraries:**
|
| 43 |
+
- `pyteomics.mzid`
|
| 44 |
+
- `pyopenms`: MzIdentML support
|
| 45 |
+
**EDA Approach:**
|
| 46 |
+
- PSM count and score distribution
|
| 47 |
+
- FDR calculation and filtering
|
| 48 |
+
- Modification analysis
|
| 49 |
+
- Missed cleavage statistics
|
| 50 |
+
- Protein inference results
|
| 51 |
+
- Search parameters validation
|
| 52 |
+
- Decoy hit analysis
|
| 53 |
+
- Rank-1 vs lower ranks
|
| 54 |
+
|
| 55 |
+
### .pepXML - Trans-Proteomic Pipeline Peptide XML
|
| 56 |
+
**Description:** TPP format for peptide identifications
|
| 57 |
+
**Typical Data:** Search results with statistical validation
|
| 58 |
+
**Use Cases:** Proteomics database search output
|
| 59 |
+
**Python Libraries:**
|
| 60 |
+
- `pyteomics.pepxml`
|
| 61 |
+
**EDA Approach:**
|
| 62 |
+
- Search engine comparison
|
| 63 |
+
- Score distributions (XCorr, expect value, etc.)
|
| 64 |
+
- Charge state analysis
|
| 65 |
+
- Modification frequencies
|
| 66 |
+
- PeptideProphet probabilities
|
| 67 |
+
- Protein coverage
|
| 68 |
+
- Spectral counting
|
| 69 |
+
|
| 70 |
+
### .protXML - Protein Inference Results
|
| 71 |
+
**Description:** TPP protein-level identifications
|
| 72 |
+
**Typical Data:** Protein groups, probabilities, peptides
|
| 73 |
+
**Use Cases:** Protein-level analysis
|
| 74 |
+
**Python Libraries:**
|
| 75 |
+
- `pyteomics.protxml`
|
| 76 |
+
**EDA Approach:**
|
| 77 |
+
- Protein group statistics
|
| 78 |
+
- Parsimonious protein sets
|
| 79 |
+
- ProteinProphet probabilities
|
| 80 |
+
- Coverage and peptide count per protein
|
| 81 |
+
- Unique vs shared peptides
|
| 82 |
+
- Protein molecular weight distribution
|
| 83 |
+
- GO term enrichment preparation
|
| 84 |
+
|
| 85 |
+
### .pride.xml - PRIDE XML Format
|
| 86 |
+
**Description:** Proteomics Identifications Database format
|
| 87 |
+
**Typical Data:** Complete proteomics experiment data
|
| 88 |
+
**Use Cases:** Public data deposition (legacy)
|
| 89 |
+
**Python Libraries:**
|
| 90 |
+
- `lxml` / `xml.etree.ElementTree`: Generic XML parsing (pyteomics has no dedicated PRIDE XML module)
|
| 91 |
+
- Custom XML parsers
|
| 92 |
+
**EDA Approach:**
|
| 93 |
+
- Experiment metadata extraction
|
| 94 |
+
- Identification completeness
|
| 95 |
+
- Cross-referencing identifications to spectra
|
| 96 |
+
- Protocol information
|
| 97 |
+
- Instrument details
|
| 98 |
+
|
| 99 |
+
### .tsv / .csv (Proteomics)
|
| 100 |
+
**Description:** Tab or comma-separated proteomics results
|
| 101 |
+
**Typical Data:** Peptide or protein quantification tables
|
| 102 |
+
**Use Cases:** MaxQuant, Proteome Discoverer, Skyline output
|
| 103 |
+
**Python Libraries:**
|
| 104 |
+
- `pandas`: `pd.read_csv()` or `pd.read_table()`
|
| 105 |
+
**EDA Approach:**
|
| 106 |
+
- Identification counts
|
| 107 |
+
- Quantitative value distributions
|
| 108 |
+
- Missing value patterns
|
| 109 |
+
- Intensity-based analysis
|
| 110 |
+
- Label-free quantification assessment
|
| 111 |
+
- Isobaric tag ratio analysis
|
| 112 |
+
- Coefficient of variation
|
| 113 |
+
- Batch effects
|
| 114 |
+
|
| 115 |
+
### .msf - Thermo MSF Database
|
| 116 |
+
**Description:** Proteome Discoverer results database
|
| 117 |
+
**Typical Data:** SQLite database with search results
|
| 118 |
+
**Use Cases:** Thermo Proteome Discoverer workflows
|
| 119 |
+
**Python Libraries:**
|
| 120 |
+
- `sqlite3`: Database access
|
| 121 |
+
- Custom MSF parsers
|
| 122 |
+
**EDA Approach:**
|
| 123 |
+
- Database schema exploration
|
| 124 |
+
- Peptide and protein tables
|
| 125 |
+
- Score thresholds
|
| 126 |
+
- Quantification data
|
| 127 |
+
- Processing node information
|
| 128 |
+
- Confidence levels
|
| 129 |
+
|
| 130 |
+
### .pdResult - Proteome Discoverer Result
|
| 131 |
+
**Description:** Proteome Discoverer study results
|
| 132 |
+
**Typical Data:** Comprehensive search and quantification
|
| 133 |
+
**Use Cases:** PD study exports
|
| 134 |
+
**Python Libraries:**
|
| 135 |
+
- Vendor tools for conversion
|
| 136 |
+
- Export to TSV for Python analysis
|
| 137 |
+
**EDA Approach:**
|
| 138 |
+
- Study design validation
|
| 139 |
+
- Result filtering criteria
|
| 140 |
+
- Quantitative comparison groups
|
| 141 |
+
- Imputation strategies
|
| 142 |
+
|
| 143 |
+
### .pep.xml - Peptide Summary
|
| 144 |
+
**Description:** Compact peptide identification format
|
| 145 |
+
**Typical Data:** Peptide sequences, modifications, scores
|
| 146 |
+
**Use Cases:** Downstream analysis input
|
| 147 |
+
**Python Libraries:**
|
| 148 |
+
- `pyteomics`: XML parsing
|
| 149 |
+
**EDA Approach:**
|
| 150 |
+
- Unique peptide counting
|
| 151 |
+
- PTM site localization
|
| 152 |
+
- Retention time predictability
|
| 153 |
+
- Charge state preferences
|
| 154 |
+
|
| 155 |
+
## Quantitative Proteomics
|
| 156 |
+
|
| 157 |
+
### .sky - Skyline Document
|
| 158 |
+
**Description:** Skyline targeted proteomics document
|
| 159 |
+
**Typical Data:** Transition lists, chromatograms, results
|
| 160 |
+
**Use Cases:** Targeted proteomics (SRM/MRM/PRM)
|
| 161 |
+
**Python Libraries:**
|
| 162 |
+
- `skyline`: Python API (limited)
|
| 163 |
+
- Export to CSV for analysis
|
| 164 |
+
**EDA Approach:**
|
| 165 |
+
- Transition selection validation
|
| 166 |
+
- Chromatographic peak quality
|
| 167 |
+
- Interference detection
|
| 168 |
+
- Retention time consistency
|
| 169 |
+
- Calibration curve assessment
|
| 170 |
+
- Replicate correlation
|
| 171 |
+
- LOD/LOQ determination
|
| 172 |
+
|
| 173 |
+
### .sky.zip - Zipped Skyline Document
|
| 174 |
+
**Description:** Skyline document with external files
|
| 175 |
+
**Typical Data:** Complete Skyline analysis
|
| 176 |
+
**Use Cases:** Sharing Skyline projects
|
| 177 |
+
**Python Libraries:**
|
| 178 |
+
- `zipfile`: Extract for processing
|
| 179 |
+
**EDA Approach:**
|
| 180 |
+
- Document structure
|
| 181 |
+
- External file references
|
| 182 |
+
- Result export and analysis
|
| 183 |
+
|
| 184 |
+
### .wiff - SCIEX WIFF Format
|
| 185 |
+
**Description:** SCIEX instrument data with quantitation
|
| 186 |
+
**Typical Data:** LC-MS/MS with MRM transitions
|
| 187 |
+
**Use Cases:** SCIEX QTRAP, TripleTOF data
|
| 188 |
+
**Python Libraries:**
|
| 189 |
+
- Vendor tools (limited Python access)
|
| 190 |
+
- Conversion to mzML
|
| 191 |
+
**EDA Approach:**
|
| 192 |
+
- MRM transition performance
|
| 193 |
+
- Dwell time optimization
|
| 194 |
+
- Cycle time analysis
|
| 195 |
+
- Peak integration quality
|
| 196 |
+
|
| 197 |
+
### .raw (Thermo)
|
| 198 |
+
**Description:** Thermo raw instrument file
|
| 199 |
+
**Typical Data:** Full MS data from Orbitrap, Q Exactive
|
| 200 |
+
**Use Cases:** Label-free and TMT quantification
|
| 201 |
+
**Python Libraries:**
|
| 202 |
+
- `pymsfilereader`: Python bindings for Thermo MSFileReader (Windows only)
|
| 203 |
+
- `ThermoRawFileParser`: Cross-platform CLI
|
| 204 |
+
**EDA Approach:**
|
| 205 |
+
- MS1 and MS2 acquisition rates
|
| 206 |
+
- AGC target and fill times
|
| 207 |
+
- Resolution settings
|
| 208 |
+
- Isolation window validation
|
| 209 |
+
- SPS ion selection (TMT)
|
| 210 |
+
- Contamination assessment
|
| 211 |
+
|
| 212 |
+
### .d (Agilent)
|
| 213 |
+
**Description:** Agilent data directory
|
| 214 |
+
**Typical Data:** LC-MS and GC-MS data
|
| 215 |
+
**Use Cases:** Agilent instrument workflows
|
| 216 |
+
**Python Libraries:**
|
| 217 |
+
- Community parsers
|
| 218 |
+
- Export to mzML
|
| 219 |
+
**EDA Approach:**
|
| 220 |
+
- Method consistency
|
| 221 |
+
- Calibration status
|
| 222 |
+
- Sequence run information
|
| 223 |
+
- Retention time stability
|
| 224 |
+
|
| 225 |
+
## Metabolomics and Lipidomics
|
| 226 |
+
|
| 227 |
+
### .mzML (Metabolomics)
|
| 228 |
+
**Description:** Standard MS format for metabolomics
|
| 229 |
+
**Typical Data:** Full scan MS, targeted MS/MS
|
| 230 |
+
**Use Cases:** Untargeted and targeted metabolomics
|
| 231 |
+
**Python Libraries:**
|
| 232 |
+
- Same as proteomics mzML tools
|
| 233 |
+
**EDA Approach:**
|
| 234 |
+
- Feature detection quality
|
| 235 |
+
- Mass accuracy assessment
|
| 236 |
+
- Retention time alignment
|
| 237 |
+
- Blank subtraction
|
| 238 |
+
- QC sample consistency
|
| 239 |
+
- Isotope pattern validation
|
| 240 |
+
- Adduct formation analysis
|
| 241 |
+
- In-source fragmentation check
|
| 242 |
+
|
| 243 |
+
### .cdf / .netCDF - ANDI-MS
|
| 244 |
+
**Description:** Analytical Data Interchange for MS
|
| 245 |
+
**Typical Data:** GC-MS, LC-MS chromatography data
|
| 246 |
+
**Use Cases:** Metabolomics, GC-MS workflows
|
| 247 |
+
**Python Libraries:**
|
| 248 |
+
- `netCDF4`: Low-level access
|
| 249 |
+
- `pyopenms`: CDF support
|
| 250 |
+
- `xcms` via R integration
|
| 251 |
+
**EDA Approach:**
|
| 252 |
+
- TIC and extracted ion chromatograms
|
| 253 |
+
- Peak detection across samples
|
| 254 |
+
- Retention index calculation
|
| 255 |
+
- Mass spectral matching
|
| 256 |
+
- Library search preparation
|
| 257 |
+
|
| 258 |
+
### .msp - Mass Spectral Format (NIST)
|
| 259 |
+
**Description:** NIST spectral library format
|
| 260 |
+
**Typical Data:** Reference mass spectra
|
| 261 |
+
**Use Cases:** Metabolite identification, library matching
|
| 262 |
+
**Python Libraries:**
|
| 263 |
+
- `matchms`: Spectral matching
|
| 264 |
+
- Custom MSP parsers
|
| 265 |
+
**EDA Approach:**
|
| 266 |
+
- Library coverage
|
| 267 |
+
- Metadata completeness (InChI, SMILES)
|
| 268 |
+
- Spectral quality metrics
|
| 269 |
+
- Collision energy standardization
|
| 270 |
+
- Precursor type annotation
|
| 271 |
+
|
| 272 |
+
### .mgf (Metabolomics)
|
| 273 |
+
**Description:** Mascot Generic Format for MS/MS
|
| 274 |
+
**Typical Data:** MS/MS spectra for metabolite ID
|
| 275 |
+
**Use Cases:** Spectral library searching
|
| 276 |
+
**Python Libraries:**
|
| 277 |
+
- `matchms`: Metabolomics spectral analysis
|
| 278 |
+
- `pyteomics.mgf`
|
| 279 |
+
**EDA Approach:**
|
| 280 |
+
- Spectrum quality filtering
|
| 281 |
+
- Precursor isolation purity
|
| 282 |
+
- Fragment m/z accuracy
|
| 283 |
+
- Neutral loss patterns
|
| 284 |
+
- MS/MS completeness
|
| 285 |
+
|
| 286 |
+
### .nmrML - NMR Markup Language
|
| 287 |
+
**Description:** Standard XML format for NMR metabolomics
|
| 288 |
+
**Typical Data:** 1D/2D NMR spectra with metadata
|
| 289 |
+
**Use Cases:** NMR-based metabolomics
|
| 290 |
+
**Python Libraries:**
|
| 291 |
+
- `nmrml2isa`: Format conversion
|
| 292 |
+
- Custom XML parsers
|
| 293 |
+
**EDA Approach:**
|
| 294 |
+
- Spectral quality metrics
|
| 295 |
+
- Binning consistency
|
| 296 |
+
- Reference compound validation
|
| 297 |
+
- pH and temperature effects
|
| 298 |
+
- Metabolite identification confidence
|
| 299 |
+
|
| 300 |
+
### .json (Metabolomics)
|
| 301 |
+
**Description:** JSON format for metabolomics results
|
| 302 |
+
**Typical Data:** Feature tables, annotations, metadata
|
| 303 |
+
**Use Cases:** GNPS, MetaboAnalyst, web tools
|
| 304 |
+
**Python Libraries:**
|
| 305 |
+
- `json`: Standard library
|
| 306 |
+
- `pandas`: JSON normalization
|
| 307 |
+
**EDA Approach:**
|
| 308 |
+
- Feature annotation coverage
|
| 309 |
+
- GNPS clustering results
|
| 310 |
+
- Molecular networking statistics
|
| 311 |
+
- Adduct and in-source fragment linkage
|
| 312 |
+
- Putative identification confidence
|
| 313 |
+
|
| 314 |
+
### .txt (Metabolomics Tables)
|
| 315 |
+
**Description:** Tab-delimited feature tables
|
| 316 |
+
**Typical Data:** m/z, RT, intensities across samples
|
| 317 |
+
**Use Cases:** MZmine, XCMS, MS-DIAL output
|
| 318 |
+
**Python Libraries:**
|
| 319 |
+
- `pandas`: Text file reading
|
| 320 |
+
**EDA Approach:**
|
| 321 |
+
- Feature count and quality
|
| 322 |
+
- Missing value imputation
|
| 323 |
+
- Data normalization assessment
|
| 324 |
+
- Batch correction validation
|
| 325 |
+
- PCA and clustering for QC
|
| 326 |
+
- Fold change calculations
|
| 327 |
+
- Statistical test preparation
|
| 328 |
+
|
| 329 |
+
### .featureXML - OpenMS Feature Format
|
| 330 |
+
**Description:** OpenMS detected features
|
| 331 |
+
**Typical Data:** LC-MS features with quality scores
|
| 332 |
+
**Use Cases:** OpenMS workflows
|
| 333 |
+
**Python Libraries:**
|
| 334 |
+
- `pyopenms`: FeatureXML support
|
| 335 |
+
**EDA Approach:**
|
| 336 |
+
- Feature detection parameters
|
| 337 |
+
- Quality metrics per feature
|
| 338 |
+
- Isotope pattern fitting
|
| 339 |
+
- Charge state assignment
|
| 340 |
+
- FWHM and asymmetry
|
| 341 |
+
|
| 342 |
+
### .consensusXML - OpenMS Consensus Features
|
| 343 |
+
**Description:** Linked features across samples
|
| 344 |
+
**Typical Data:** Aligned features with group info
|
| 345 |
+
**Use Cases:** Multi-sample LC-MS analysis
|
| 346 |
+
**Python Libraries:**
|
| 347 |
+
- `pyopenms`: ConsensusXML reading
|
| 348 |
+
**EDA Approach:**
|
| 349 |
+
- Feature correspondence quality
|
| 350 |
+
- Retention time alignment
|
| 351 |
+
- Missing value patterns
|
| 352 |
+
- Intensity normalization needs
|
| 353 |
+
- Batch-wise feature agreement
|
| 354 |
+
|
| 355 |
+
### .idXML - OpenMS Identification Format
|
| 356 |
+
**Description:** Peptide/metabolite identifications
|
| 357 |
+
**Typical Data:** MS/MS identifications with scores
|
| 358 |
+
**Use Cases:** OpenMS ID workflows
|
| 359 |
+
**Python Libraries:**
|
| 360 |
+
- `pyopenms`: IdXML support
|
| 361 |
+
**EDA Approach:**
|
| 362 |
+
- Identification rate
|
| 363 |
+
- Score distribution
|
| 364 |
+
- Spectral match quality
|
| 365 |
+
- False discovery assessment
|
| 366 |
+
- Annotation transfer validation
|
| 367 |
+
|
| 368 |
+
## Lipidomics-Specific Formats
|
| 369 |
+
|
| 370 |
+
### .lcb - LipidCreator Batch
|
| 371 |
+
**Description:** LipidCreator transition list
|
| 372 |
+
**Typical Data:** Lipid transitions for targeted MS
|
| 373 |
+
**Use Cases:** Targeted lipidomics
|
| 374 |
+
**Python Libraries:**
|
| 375 |
+
- Export to CSV for processing
|
| 376 |
+
**EDA Approach:**
|
| 377 |
+
- Transition coverage per lipid class
|
| 378 |
+
- Retention time prediction
|
| 379 |
+
- Collision energy optimization
|
| 380 |
+
- Class-specific fragmentation patterns
|
| 381 |
+
|
| 382 |
+
### .mzTab - Proteomics/Metabolomics Tabular Format
|
| 383 |
+
**Description:** PSI tabular summary format
|
| 384 |
+
**Typical Data:** Protein/peptide/metabolite quantification
|
| 385 |
+
**Use Cases:** Publication and data sharing
|
| 386 |
+
**Python Libraries:**
|
| 387 |
+
- `pyteomics.mztab`
|
| 388 |
+
- `pandas` for TSV-like structure
|
| 389 |
+
**EDA Approach:**
|
| 390 |
+
- Data completeness
|
| 391 |
+
- Metadata section validation
|
| 392 |
+
- Quantification method
|
| 393 |
+
- Identification confidence
|
| 394 |
+
- Software and parameters
|
| 395 |
+
- Quality metrics summary
|
| 396 |
+
|
| 397 |
+
### .csv (LipidSearch, LipidMatch)
|
| 398 |
+
**Description:** Lipid identification results
|
| 399 |
+
**Typical Data:** Lipid annotations, grades, intensities
|
| 400 |
+
**Use Cases:** Lipidomics software output
|
| 401 |
+
**Python Libraries:**
|
| 402 |
+
- `pandas`: CSV reading
|
| 403 |
+
**EDA Approach:**
|
| 404 |
+
- Lipid class distribution
|
| 405 |
+
- Identification grade/confidence
|
| 406 |
+
- Fatty acid composition analysis
|
| 407 |
+
- Double bond and chain length patterns
|
| 408 |
+
- Intensity correlations
|
| 409 |
+
- Normalization to internal standards
|
| 410 |
+
|
| 411 |
+
### .sdf (Metabolomics)
|
| 412 |
+
**Description:** Structure data file for metabolites
|
| 413 |
+
**Typical Data:** Chemical structures with properties
|
| 414 |
+
**Use Cases:** Metabolite database creation
|
| 415 |
+
**Python Libraries:**
|
| 416 |
+
- `RDKit`: `Chem.SDMolSupplier('file.sdf')`
|
| 417 |
+
**EDA Approach:**
|
| 418 |
+
- Structure validation
|
| 419 |
+
- Property calculation (logP, MW, TPSA)
|
| 420 |
+
- Molecular formula consistency
|
| 421 |
+
- Tautomer enumeration
|
| 422 |
+
- Retention time prediction features
|
| 423 |
+
|
| 424 |
+
### .mol (Metabolomics)
|
| 425 |
+
**Description:** Single molecule structure files
|
| 426 |
+
**Typical Data:** Metabolite chemical structure
|
| 427 |
+
**Use Cases:** Structure-based searches
|
| 428 |
+
**Python Libraries:**
|
| 429 |
+
- `RDKit`: `Chem.MolFromMolFile('file.mol')`
|
| 430 |
+
**EDA Approach:**
|
| 431 |
+
- Structure correctness
|
| 432 |
+
- Stereochemistry validation
|
| 433 |
+
- Charge state
|
| 434 |
+
- Implicit hydrogen handling
|
| 435 |
+
|
| 436 |
+
## Data Processing and Analysis
|
| 437 |
+
|
| 438 |
+
### .h5 / .hdf5 (Omics)
|
| 439 |
+
**Description:** HDF5 for large omics datasets
|
| 440 |
+
**Typical Data:** Feature matrices, spectra, metadata
|
| 441 |
+
**Use Cases:** Large-scale studies, cloud computing
|
| 442 |
+
**Python Libraries:**
|
| 443 |
+
- `h5py`: HDF5 access
|
| 444 |
+
- `anndata`: For single-cell proteomics
|
| 445 |
+
**EDA Approach:**
|
| 446 |
+
- Dataset organization
|
| 447 |
+
- Chunking and compression
|
| 448 |
+
- Metadata structure
|
| 449 |
+
- Efficient data access patterns
|
| 450 |
+
- Sample and feature annotations
|
| 451 |
+
|
| 452 |
+
### .Rdata / .rds - R Objects
|
| 453 |
+
**Description:** Serialized R analysis objects
|
| 454 |
+
**Typical Data:** Processed omics results from R packages
|
| 455 |
+
**Use Cases:** xcms, CAMERA, MSnbase workflows
|
| 456 |
+
**Python Libraries:**
|
| 457 |
+
- `pyreadr`: `pyreadr.read_r('file.Rdata')`
|
| 458 |
+
- `rpy2`: R-Python integration
|
| 459 |
+
**EDA Approach:**
|
| 460 |
+
- Object structure exploration
|
| 461 |
+
- Data extraction
|
| 462 |
+
- Method parameter review
|
| 463 |
+
- Conversion to Python-native formats
|
| 464 |
+
|
| 465 |
+
### .mzTab-M - Metabolomics mzTab
|
| 466 |
+
**Description:** mzTab specific to metabolomics
|
| 467 |
+
**Typical Data:** Small molecule quantification
|
| 468 |
+
**Use Cases:** Metabolomics data sharing
|
| 469 |
+
**Python Libraries:**
|
| 470 |
+
- `pyteomics.mztab`: Can parse mzTab-M
|
| 471 |
+
**EDA Approach:**
|
| 472 |
+
- Small molecule evidence
|
| 473 |
+
- Feature quantification
|
| 474 |
+
- Database references (HMDB, KEGG, etc.)
|
| 475 |
+
- Adduct and charge annotation
|
| 476 |
+
- MS level information
|
| 477 |
+
|
| 478 |
+
### .parquet (Omics)
|
| 479 |
+
**Description:** Columnar storage for large tables
|
| 480 |
+
**Typical Data:** Feature matrices, metadata
|
| 481 |
+
**Use Cases:** Efficient big data omics
|
| 482 |
+
**Python Libraries:**
|
| 483 |
+
- `pandas`: `pd.read_parquet()`
|
| 484 |
+
- `pyarrow`: Direct parquet access
|
| 485 |
+
**EDA Approach:**
|
| 486 |
+
- Compression efficiency
|
| 487 |
+
- Column-wise statistics
|
| 488 |
+
- Partition structure
|
| 489 |
+
- Schema validation
|
| 490 |
+
- Fast filtering and aggregation
|
| 491 |
+
|
| 492 |
+
### .pkl (Omics Models)
|
| 493 |
+
**Description:** Pickled Python objects
|
| 494 |
+
**Typical Data:** ML models, processed data
|
| 495 |
+
**Use Cases:** Workflow intermediate storage
|
| 496 |
+
**Python Libraries:**
|
| 497 |
+
- `pickle`: Standard serialization (only unpickle files from trusted sources; loading can execute arbitrary code)
|
| 498 |
+
- `joblib`: Enhanced pickling
|
| 499 |
+
**EDA Approach:**
|
| 500 |
+
- Object type and structure
|
| 501 |
+
- Model parameters
|
| 502 |
+
- Feature importance (if ML model)
|
| 503 |
+
- Data shapes and types
|
| 504 |
+
- Deserialization validation
|
| 505 |
+
|
| 506 |
+
### .zarr (Omics)
|
| 507 |
+
**Description:** Chunked, compressed array storage
|
| 508 |
+
**Typical Data:** Multi-dimensional omics data
|
| 509 |
+
**Use Cases:** Cloud-optimized analysis
|
| 510 |
+
**Python Libraries:**
|
| 511 |
+
- `zarr`: Array storage
|
| 512 |
+
**EDA Approach:**
|
| 513 |
+
- Chunk optimization
|
| 514 |
+
- Compression codecs
|
| 515 |
+
- Multi-scale data
|
| 516 |
+
- Parallel access patterns
|
| 517 |
+
- Metadata annotations
|
.scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md
ADDED
|
@@ -0,0 +1,633 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spectroscopy and Analytical Chemistry File Formats Reference
|
| 2 |
+
|
| 3 |
+
This reference covers file formats used in various spectroscopic techniques and analytical chemistry instrumentation.
|
| 4 |
+
|
| 5 |
+
## NMR Spectroscopy
|
| 6 |
+
|
| 7 |
+
### .fid - NMR Free Induction Decay
|
| 8 |
+
**Description:** Raw time-domain NMR data from Bruker, Agilent, JEOL
|
| 9 |
+
**Typical Data:** Complex time-domain signal
|
| 10 |
+
**Use Cases:** NMR spectroscopy, structure elucidation
|
| 11 |
+
**Python Libraries:**
|
| 12 |
+
- `nmrglue`: `nmrglue.bruker.read('expdir')` (Bruker experiment directory) or `nmrglue.varian.read_fid('fid')` (Agilent/Varian)
|
| 13 |
+
- `nmrstarlib`: NMR data handling
|
| 14 |
+
**EDA Approach:**
|
| 15 |
+
- Time-domain signal decay
|
| 16 |
+
- Sampling rate and acquisition time
|
| 17 |
+
- Number of data points
|
| 18 |
+
- Signal-to-noise ratio estimation
|
| 19 |
+
- Baseline drift assessment
|
| 20 |
+
- Digital filter effects
|
| 21 |
+
- Acquisition parameter validation
|
| 22 |
+
- Apodization function selection
|
| 23 |
+
|
| 24 |
+
### .ft / .ft1 / .ft2 - NMR Frequency Domain
|
| 25 |
+
**Description:** Fourier-transformed NMR spectrum
|
| 26 |
+
**Typical Data:** Processed frequency-domain data
|
| 27 |
+
**Use Cases:** NMR analysis, peak integration
|
| 28 |
+
**Python Libraries:**
|
| 29 |
+
- `nmrglue`: Frequency domain reading
|
| 30 |
+
- Custom processing pipelines
|
| 31 |
+
**EDA Approach:**
|
| 32 |
+
- Peak picking and integration
|
| 33 |
+
- Chemical shift range
|
| 34 |
+
- Baseline correction quality
|
| 35 |
+
- Phase correction assessment
|
| 36 |
+
- Reference peak identification
|
| 37 |
+
- Spectral resolution
|
| 38 |
+
- Artifact detection
|
| 39 |
+
- Multiplicity analysis
|
| 40 |
+
|
| 41 |
+
### .1r / .2rr - Bruker NMR Processed Data
|
| 42 |
+
**Description:** Bruker processed spectrum (real part)
|
| 43 |
+
**Typical Data:** 1D or 2D processed NMR spectra
|
| 44 |
+
**Use Cases:** NMR data analysis with Bruker software
|
| 45 |
+
**Python Libraries:**
|
| 46 |
+
- `nmrglue`: Bruker format support
|
| 47 |
+
**EDA Approach:**
|
| 48 |
+
- Processing parameters review
|
| 49 |
+
- Window function effects
|
| 50 |
+
- Zero-filling assessment
|
| 51 |
+
- Linear prediction validation
|
| 52 |
+
- Spectral artifacts
|
| 53 |
+
|
| 54 |
+
### .dx - NMR JCAMP-DX
|
| 55 |
+
**Description:** JCAMP-DX format for NMR
|
| 56 |
+
**Typical Data:** Standardized NMR spectrum
|
| 57 |
+
**Use Cases:** Data exchange between software
|
| 58 |
+
**Python Libraries:**
|
| 59 |
+
- `jcamp`: JCAMP reader
|
| 60 |
+
- `nmrglue`: Can import JCAMP
|
| 61 |
+
**EDA Approach:**
|
| 62 |
+
- Format compliance
|
| 63 |
+
- Metadata completeness
|
| 64 |
+
- Peak table validation
|
| 65 |
+
- Integration values
|
| 66 |
+
- Compound identification info
|
| 67 |
+
|
| 68 |
+
### .mnova - Mnova Format
|
| 69 |
+
**Description:** Mestrelab Research Mnova format
|
| 70 |
+
**Typical Data:** NMR data with processing info
|
| 71 |
+
**Use Cases:** Mnova software workflows
|
| 72 |
+
**Python Libraries:**
|
| 73 |
+
- `nmrglue`: No native .mnova reader; export from Mnova to JCAMP-DX or Bruker format first
|
| 74 |
+
- Conversion tools to standard formats
|
| 75 |
+
**EDA Approach:**
|
| 76 |
+
- Multi-spectrum handling
|
| 77 |
+
- Processing pipeline review
|
| 78 |
+
- Quantification data
|
| 79 |
+
- Structure assignment
|
| 80 |
+
|
| 81 |
+
## Mass Spectrometry
|
| 82 |
+
|
| 83 |
+
### .mzML - Mass Spectrometry Markup Language
|
| 84 |
+
**Description:** Standard XML-based MS format
|
| 85 |
+
**Typical Data:** MS spectra, chromatograms, metadata
|
| 86 |
+
**Use Cases:** Proteomics, metabolomics, lipidomics
|
| 87 |
+
**Python Libraries:**
|
| 88 |
+
- `pymzml`: `pymzml.run.Reader('file.mzML')`
|
| 89 |
+
- `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
|
| 90 |
+
- `MSFileReader`: Various wrappers
|
| 91 |
+
**EDA Approach:**
|
| 92 |
+
- Scan count and MS level distribution
|
| 93 |
+
- Retention time range and TIC
|
| 94 |
+
- m/z range and resolution
|
| 95 |
+
- Precursor ion selection
|
| 96 |
+
- Fragmentation patterns
|
| 97 |
+
- Instrument configuration
|
| 98 |
+
- Quality control metrics
|
| 99 |
+
- Data completeness
|
| 100 |
+
|
| 101 |
+
### .mzXML - Mass Spectrometry XML
|
| 102 |
+
**Description:** Legacy XML MS format
|
| 103 |
+
**Typical Data:** Mass spectra and chromatograms
|
| 104 |
+
**Use Cases:** Proteomics workflows (older)
|
| 105 |
+
**Python Libraries:**
|
| 106 |
+
- `pyteomics.mzxml`
|
| 107 |
+
- `pymzml`: Can read mzXML
|
| 108 |
+
**EDA Approach:**
|
| 109 |
+
- Similar to mzML
|
| 110 |
+
- Version compatibility
|
| 111 |
+
- Conversion quality assessment
|
| 112 |
+
|
| 113 |
+
### .mzData - mzData Format
|
| 114 |
+
**Description:** Legacy PSI MS format
|
| 115 |
+
**Typical Data:** Mass spectrometry data
|
| 116 |
+
**Use Cases:** Legacy data archives
|
| 117 |
+
**Python Libraries:**
|
| 118 |
+
- `pyteomics`: Limited support
|
| 119 |
+
- Conversion to mzML recommended
|
| 120 |
+
**EDA Approach:**
|
| 121 |
+
- Format conversion validation
|
| 122 |
+
- Data completeness
|
| 123 |
+
- Metadata extraction
|
| 124 |
+
|
| 125 |
+
### .raw - Vendor Raw Files (Thermo, Agilent, Bruker)
|
| 126 |
+
**Description:** Proprietary instrument data
|
| 127 |
+
**Typical Data:** Raw mass spectra and metadata
|
| 128 |
+
**Use Cases:** Direct instrument output
|
| 129 |
+
**Python Libraries:**
|
| 130 |
+
- `pymsfilereader`: Thermo RAW files
|
| 131 |
+
- `ThermoRawFileParser`: CLI wrapper
|
| 132 |
+
- Vendor-specific APIs
|
| 133 |
+
**EDA Approach:**
|
| 134 |
+
- Method parameter extraction
|
| 135 |
+
- Instrument performance metrics
|
| 136 |
+
- Calibration status
|
| 137 |
+
- Scan function analysis
|
| 138 |
+
- MS/MS quality metrics
|
| 139 |
+
- Dynamic exclusion evaluation
|
| 140 |
+
|
| 141 |
+
### .d - Agilent Data Directory
|
| 142 |
+
**Description:** Agilent MS data folder
|
| 143 |
+
**Typical Data:** LC-MS, GC-MS with methods
|
| 144 |
+
**Use Cases:** Agilent MassHunter workflows
|
| 145 |
+
**Python Libraries:**
|
| 146 |
+
- Community parsers
|
| 147 |
+
- Chemstation integration
|
| 148 |
+
**EDA Approach:**
|
| 149 |
+
- Directory structure validation
|
| 150 |
+
- Method parameters
|
| 151 |
+
- Calibration curves
|
| 152 |
+
- Sequence metadata
|
| 153 |
+
- Signal quality metrics
|
| 154 |
+
|
| 155 |
+
### .wiff - AB SCIEX Data
|
| 156 |
+
**Description:** AB SCIEX/SCIEX instrument format
|
| 157 |
+
**Typical Data:** Mass spectrometry data
|
| 158 |
+
**Use Cases:** SCIEX instrument workflows
|
| 159 |
+
**Python Libraries:**
|
| 160 |
+
- Vendor SDKs (limited Python support)
|
| 161 |
+
- Conversion tools
|
| 162 |
+
**EDA Approach:**
|
| 163 |
+
- Experiment type identification
|
| 164 |
+
- Scan properties
|
| 165 |
+
- Quantitation data
|
| 166 |
+
- Multi-experiment structure
|
| 167 |
+
|
| 168 |
+
### .mgf - Mascot Generic Format
|
| 169 |
+
**Description:** Peak list format for MS/MS
|
| 170 |
+
**Typical Data:** Precursor and fragment masses
|
| 171 |
+
**Use Cases:** Peptide identification, database searches
|
| 172 |
+
**Python Libraries:**
|
| 173 |
+
- `pyteomics.mgf`: `pyteomics.mgf.read('file.mgf')`
|
| 174 |
+
- `pyopenms`: MGF support
|
| 175 |
+
**EDA Approach:**
|
| 176 |
+
- Spectrum count
|
| 177 |
+
- Charge state distribution
|
| 178 |
+
- Precursor m/z and intensity
|
| 179 |
+
- Fragment peak count
|
| 180 |
+
- Mass accuracy
|
| 181 |
+
- Title and metadata parsing
|
| 182 |
+
|
| 183 |
+
### .pkl - Peak List (Binary)
|
| 184 |
+
**Description:** Binary peak list format
|
| 185 |
+
**Typical Data:** Serialized MS/MS spectra
|
| 186 |
+
**Use Cases:** Software-specific storage
|
| 187 |
+
**Python Libraries:**
|
| 188 |
+
- `pickle`: Standard deserialization
|
| 189 |
+
- `pyteomics`: PKL support
|
| 190 |
+
**EDA Approach:**
|
| 191 |
+
- Data structure inspection
|
| 192 |
+
- Conversion to standard formats
|
| 193 |
+
- Metadata preservation
|
| 194 |
+
|
| 195 |
+
### .ms1 / .ms2 - MS1/MS2 Formats
|
| 196 |
+
**Description:** Simple text format for MS data
|
| 197 |
+
**Typical Data:** MS1 and MS2 scans
|
| 198 |
+
**Use Cases:** Database searching, proteomics
|
| 199 |
+
**Python Libraries:**
|
| 200 |
+
- `pyteomics.ms1` and `ms2`
|
| 201 |
+
- Simple text parsing
|
| 202 |
+
**EDA Approach:**
|
| 203 |
+
- Scan count by level
|
| 204 |
+
- Retention time series
|
| 205 |
+
- Charge state analysis
|
| 206 |
+
- m/z range coverage
|
| 207 |
+
|
| 208 |
+
### .pepXML - Peptide XML
|
| 209 |
+
**Description:** TPP peptide identification format
|
| 210 |
+
**Typical Data:** Peptide-spectrum matches
|
| 211 |
+
**Use Cases:** Proteomics search results
|
| 212 |
+
**Python Libraries:**
|
| 213 |
+
- `pyteomics.pepxml`
|
| 214 |
+
**EDA Approach:**
|
| 215 |
+
- Search result statistics
|
| 216 |
+
- Score distribution
|
| 217 |
+
- Modification analysis
|
| 218 |
+
- FDR assessment
|
| 219 |
+
- Enzyme specificity
|
| 220 |
+
|
| 221 |
+
### .protXML - Protein XML
|
| 222 |
+
**Description:** TPP protein inference format
|
| 223 |
+
**Typical Data:** Protein identifications
|
| 224 |
+
**Use Cases:** Proteomics protein-level results
|
| 225 |
+
**Python Libraries:**
|
| 226 |
+
- `pyteomics.protxml`
|
| 227 |
+
**EDA Approach:**
|
| 228 |
+
- Protein group analysis
|
| 229 |
+
- Coverage statistics
|
| 230 |
+
- Confidence scoring
|
| 231 |
+
- Parsimony analysis
|
| 232 |
+
|
| 233 |
+
### .msp - NIST MS Search Format
|
| 234 |
+
**Description:** NIST spectral library format
|
| 235 |
+
**Typical Data:** Reference mass spectra
|
| 236 |
+
**Use Cases:** Spectral library searching
|
| 237 |
+
**Python Libraries:**
|
| 238 |
+
- `matchms`: Spectral library handling
|
| 239 |
+
- Custom parsers
|
| 240 |
+
**EDA Approach:**
|
| 241 |
+
- Library size and coverage
|
| 242 |
+
- Metadata completeness
|
| 243 |
+
- Peak count statistics
|
| 244 |
+
- Compound annotation quality
|
| 245 |
+
|
| 246 |
+
## Infrared and Raman Spectroscopy
|
| 247 |
+
|
| 248 |
+
### .spc - Galactic SPC
|
| 249 |
+
**Description:** Thermo Galactic spectroscopy format
|
| 250 |
+
**Typical Data:** IR, Raman, UV-Vis spectra
|
| 251 |
+
**Use Cases:** Various spectroscopy instruments
|
| 252 |
+
**Python Libraries:**
|
| 253 |
+
- `spc`: `spc.File('file.spc')`
|
| 254 |
+
- `specio`: Multi-format reader
|
| 255 |
+
**EDA Approach:**
|
| 256 |
+
- Wavenumber/wavelength range
|
| 257 |
+
- Data point density
|
| 258 |
+
- Multi-spectrum handling
|
| 259 |
+
- Baseline characteristics
|
| 260 |
+
- Peak identification
|
| 261 |
+
- Absorbance/transmittance mode
|
| 262 |
+
- Instrument information
|
| 263 |
+
|
| 264 |
+
### .spa - Thermo Nicolet
|
| 265 |
+
**Description:** Thermo Fisher FTIR format
|
| 266 |
+
**Typical Data:** FTIR spectra
|
| 267 |
+
**Use Cases:** OMNIC software data
|
| 268 |
+
**Python Libraries:**
|
| 269 |
+
- Custom binary parsers
|
| 270 |
+
- Conversion to JCAMP or SPC
|
| 271 |
+
**EDA Approach:**
|
| 272 |
+
- Interferogram vs spectrum
|
| 273 |
+
- Background spectrum validation
|
| 274 |
+
- Atmospheric compensation
|
| 275 |
+
- Resolution and scan number
|
| 276 |
+
- Sample information
|
| 277 |
+
|
| 278 |
+
### .0 - Bruker OPUS
|
| 279 |
+
**Description:** Bruker OPUS FTIR format (numbered files)
|
| 280 |
+
**Typical Data:** FTIR spectra and metadata
|
| 281 |
+
**Use Cases:** Bruker FTIR instruments
|
| 282 |
+
**Python Libraries:**
|
| 283 |
+
- `brukeropusreader`: OPUS format parser
|
| 284 |
+
- `specio`: OPUS support
|
| 285 |
+
**EDA Approach:**
|
| 286 |
+
- Multiple block types (AB, ScSm, etc.)
|
| 287 |
+
- Sample and reference spectra
|
| 288 |
+
- Instrument parameters
|
| 289 |
+
- Optical path configuration
|
| 290 |
+
- Beam splitter and detector info
|
| 291 |
+
|
| 292 |
+
### .dpt - Data Point Table
|
| 293 |
+
**Description:** Simple XY data format
|
| 294 |
+
**Typical Data:** Generic spectroscopic data
|
| 295 |
+
**Use Cases:** Renishaw Raman, generic exports
|
| 296 |
+
**Python Libraries:**
|
| 297 |
+
- `pandas`: CSV-like reading
|
| 298 |
+
- Text parsing
|
| 299 |
+
**EDA Approach:**
|
| 300 |
+
- X-axis type (wavelength, wavenumber, Raman shift)
|
| 301 |
+
- Y-axis units (intensity, absorbance, etc.)
|
| 302 |
+
- Data point spacing
|
| 303 |
+
- Header information
|
| 304 |
+
- Multi-column data handling
|
| 305 |
+
|
| 306 |
+
### .wdf - Renishaw Raman
|
| 307 |
+
**Description:** Renishaw WiRE data format
|
| 308 |
+
**Typical Data:** Raman spectra and maps
|
| 309 |
+
**Use Cases:** Renishaw Raman microscopy
|
| 310 |
+
**Python Libraries:**
|
| 311 |
+
- `renishawWiRE`: WDF reader
|
| 312 |
+
- Custom parsers for WDF format
|
| 313 |
+
**EDA Approach:**
|
| 314 |
+
- Spectral vs mapping data
|
| 315 |
+
- Laser wavelength
|
| 316 |
+
- Accumulation and exposure time
|
| 317 |
+
- Spatial coordinates (mapping)
|
| 318 |
+
- Z-scan data
|
| 319 |
+
- Baseline and cosmic ray correction
|
| 320 |
+
|
| 321 |
+
### .txt (Spectroscopy)
|
| 322 |
+
**Description:** Generic text export from instruments
|
| 323 |
+
**Typical Data:** Wavelength/wavenumber and intensity
|
| 324 |
+
**Use Cases:** Universal data exchange
|
| 325 |
+
**Python Libraries:**
|
| 326 |
+
- `pandas`: Text file reading
|
| 327 |
+
- `numpy`: Simple array loading
|
| 328 |
+
**EDA Approach:**
|
| 329 |
+
- Delimiter and format detection
|
| 330 |
+
- Header parsing
|
| 331 |
+
- Units identification
|
| 332 |
+
- Multiple spectrum handling
|
| 333 |
+
- Metadata extraction from comments
|
| 334 |
+
|
| 335 |
+
## UV-Visible Spectroscopy
|
| 336 |
+
|
| 337 |
+
### .asd / .asc - ASD Binary/ASCII
|
| 338 |
+
**Description:** ASD FieldSpec spectroradiometer
|
| 339 |
+
**Typical Data:** Hyperspectral UV-Vis-NIR data
|
| 340 |
+
**Use Cases:** Remote sensing, reflectance spectroscopy
|
| 341 |
+
**Python Libraries:**
|
| 342 |
+
- `spectral.io.asd`: ASD format support
|
| 343 |
+
- Custom parsers
|
| 344 |
+
**EDA Approach:**
|
| 345 |
+
- Wavelength range (UV to NIR)
|
| 346 |
+
- Reference spectrum validation
|
| 347 |
+
- Dark current correction
|
| 348 |
+
- Integration time
|
| 349 |
+
- GPS metadata (if present)
|
| 350 |
+
- Reflectance vs radiance
|
| 351 |
+
|
| 352 |
+
### .sp - Perkin Elmer
|
| 353 |
+
**Description:** Perkin Elmer UV/Vis format
|
| 354 |
+
**Typical Data:** UV-Vis spectrophotometer data
|
| 355 |
+
**Use Cases:** PE Lambda instruments
|
| 356 |
+
**Python Libraries:**
|
| 357 |
+
- Custom parsers
|
| 358 |
+
- Conversion to standard formats
|
| 359 |
+
**EDA Approach:**
|
| 360 |
+
- Scan parameters
|
| 361 |
+
- Baseline correction
|
| 362 |
+
- Multi-wavelength scans
|
| 363 |
+
- Time-based measurements
|
| 364 |
+
- Sample/reference handling
|
| 365 |
+
|
| 366 |
+
### .csv (Spectroscopy)
|
| 367 |
+
**Description:** CSV export from UV-Vis instruments
|
| 368 |
+
**Typical Data:** Wavelength and absorbance/transmittance
|
| 369 |
+
**Use Cases:** Universal format for UV-Vis data
|
| 370 |
+
**Python Libraries:**
|
| 371 |
+
- `pandas`: Native CSV support
|
| 372 |
+
**EDA Approach:**
|
| 373 |
+
- Lambda max identification
|
| 374 |
+
- Beer's law compliance
|
| 375 |
+
- Baseline offset
|
| 376 |
+
- Path length correction
|
| 377 |
+
- Concentration calculations
|
| 378 |
+
|
| 379 |
+
## X-ray and Diffraction
|
| 380 |
+
|
| 381 |
+
### .cif - Crystallographic Information File
|
| 382 |
+
**Description:** Crystal structure and diffraction data
|
| 383 |
+
**Typical Data:** Unit cell, atomic positions, structure factors
|
| 384 |
+
**Use Cases:** Crystallography, materials science
|
| 385 |
+
**Python Libraries:**
|
| 386 |
+
- `gemmi`: `gemmi.cif.read_file('file.cif')`
|
| 387 |
+
- `PyCifRW`: CIF reading/writing
|
| 388 |
+
- `pymatgen`: Materials structure analysis
|
| 389 |
+
**EDA Approach:**
|
| 390 |
+
- Crystal system and space group
|
| 391 |
+
- Unit cell parameters
|
| 392 |
+
- Atomic positions and occupancy
|
| 393 |
+
- Thermal parameters
|
| 394 |
+
- R-factors and refinement quality
|
| 395 |
+
- Completeness and redundancy
|
| 396 |
+
- Structure validation
|
| 397 |
+
|
| 398 |
+
### .hkl - Reflection Data
|
| 399 |
+
**Description:** Miller indices and intensities
|
| 400 |
+
**Typical Data:** Integrated diffraction intensities
|
| 401 |
+
**Use Cases:** Crystallographic refinement
|
| 402 |
+
**Python Libraries:**
|
| 403 |
+
- Custom parsers (format dependent)
|
| 404 |
+
- Crystallography packages (CCP4, etc.)
|
| 405 |
+
**EDA Approach:**
|
| 406 |
+
- Resolution range
|
| 407 |
+
- Completeness by shell
|
| 408 |
+
- I/sigma distribution
|
| 409 |
+
- Systematic absences
|
| 410 |
+
- Twinning detection
|
| 411 |
+
- Wilson plot
|
| 412 |
+
|
| 413 |
+
### .mtz - MTZ Format (CCP4)
|
| 414 |
+
**Description:** Binary crystallographic data
|
| 415 |
+
**Typical Data:** Reflections, phases, structure factors
|
| 416 |
+
**Use Cases:** Macromolecular crystallography
|
| 417 |
+
**Python Libraries:**
|
| 418 |
+
- `gemmi`: MTZ support
|
| 419 |
+
- `cctbx`: Comprehensive crystallography
|
| 420 |
+
**EDA Approach:**
|
| 421 |
+
- Column types and data
|
| 422 |
+
- Resolution limits
|
| 423 |
+
- R-factors (Rwork, Rfree)
|
| 424 |
+
- Phase probability distribution
|
| 425 |
+
- Map coefficients
|
| 426 |
+
- Batch information
|
| 427 |
+
|
| 428 |
+
### .xy / .xye - Powder Diffraction
|
| 429 |
+
**Description:** 2-theta vs intensity data
|
| 430 |
+
**Typical Data:** Powder X-ray diffraction patterns
|
| 431 |
+
**Use Cases:** Phase identification, Rietveld refinement
|
| 432 |
+
**Python Libraries:**
|
| 433 |
+
- `pandas`: Simple XY reading
|
| 434 |
+
- `pymatgen`: XRD pattern analysis
|
| 435 |
+
**EDA Approach:**
|
| 436 |
+
- 2-theta range
|
| 437 |
+
- Peak positions and intensities
|
| 438 |
+
- Background modeling
|
| 439 |
+
- Peak width analysis (strain/size)
|
| 440 |
+
- Phase identification via matching
|
| 441 |
+
- Preferred orientation effects
|
| 442 |
+
|
| 443 |
+
### .raw (XRD)
|
| 444 |
+
**Description:** Vendor-specific XRD raw data
|
| 445 |
+
**Typical Data:** XRD patterns with metadata
|
| 446 |
+
**Use Cases:** Bruker, PANalytical, Rigaku instruments
|
| 447 |
+
**Python Libraries:**
|
| 448 |
+
- Vendor-specific parsers
|
| 449 |
+
- Conversion tools
|
| 450 |
+
**EDA Approach:**
|
| 451 |
+
- Scan parameters (step size, time)
|
| 452 |
+
- Sample alignment
|
| 453 |
+
- Incident beam setup
|
| 454 |
+
- Detector configuration
|
| 455 |
+
- Background scan validation
|
| 456 |
+
|
| 457 |
+
### .gsa / .gsas - GSAS Format
|
| 458 |
+
**Description:** General Structure Analysis System
|
| 459 |
+
**Typical Data:** Powder diffraction for Rietveld
|
| 460 |
+
**Use Cases:** Rietveld refinement
|
| 461 |
+
**Python Libraries:**
|
| 462 |
+
- GSAS-II Python interface
|
| 463 |
+
- Custom parsers
|
| 464 |
+
**EDA Approach:**
|
| 465 |
+
- Histogram data
|
| 466 |
+
- Instrument parameters
|
| 467 |
+
- Phase information
|
| 468 |
+
- Refinement constraints
|
| 469 |
+
- Profile function parameters
|
| 470 |
+
|
| 471 |
+
## Electron Spectroscopy
|
| 472 |
+
|
| 473 |
+
### .vms - VG Scienta
|
| 474 |
+
**Description:** VG Scienta spectrometer format
|
| 475 |
+
**Typical Data:** XPS, UPS, ARPES spectra
|
| 476 |
+
**Use Cases:** Photoelectron spectroscopy
|
| 477 |
+
**Python Libraries:**
|
| 478 |
+
- Custom parsers for VMS
|
| 479 |
+
- `specio`: Multi-format support
|
| 480 |
+
**EDA Approach:**
|
| 481 |
+
- Binding energy calibration
|
| 482 |
+
- Pass energy and resolution
|
| 483 |
+
- Photoelectron line identification
|
| 484 |
+
- Satellite peak analysis
|
| 485 |
+
- Background subtraction quality
|
| 486 |
+
- Fermi edge position
|
| 487 |
+
|
| 488 |
+
### .spe - WinSpec/SPE Format
|
| 489 |
+
**Description:** Princeton Instruments/Roper Scientific
|
| 490 |
+
**Typical Data:** CCD spectra, Raman, PL
|
| 491 |
+
**Use Cases:** Spectroscopy with CCD detectors
|
| 492 |
+
**Python Libraries:**
|
| 493 |
+
- `spe2py`: SPE file reader
|
| 494 |
+
- `spe_loader`: Alternative parser
|
| 495 |
+
**EDA Approach:**
|
| 496 |
+
- CCD frame analysis
|
| 497 |
+
- Wavelength calibration
|
| 498 |
+
- Dark frame subtraction
|
| 499 |
+
- Cosmic ray identification
|
| 500 |
+
- Readout noise
|
| 501 |
+
- Accumulation statistics
|
| 502 |
+
|
| 503 |
+
### .pxt - Princeton PTI
|
| 504 |
+
**Description:** Photon Technology International
|
| 505 |
+
**Typical Data:** Fluorescence, phosphorescence spectra
|
| 506 |
+
**Use Cases:** Fluorescence spectroscopy
|
| 507 |
+
**Python Libraries:**
|
| 508 |
+
- Custom parsers
|
| 509 |
+
- Text-based format variants
|
| 510 |
+
**EDA Approach:**
|
| 511 |
+
- Excitation and emission spectra
|
| 512 |
+
- Quantum yield calculations
|
| 513 |
+
- Time-resolved measurements
|
| 514 |
+
- Temperature-dependent data
|
| 515 |
+
- Correction factors applied
|
| 516 |
+
|
| 517 |
+
### .dat (Spectroscopy Generic)
|
| 518 |
+
**Description:** Generic binary or text spectroscopy data
|
| 519 |
+
**Typical Data:** Various spectroscopic measurements
|
| 520 |
+
**Use Cases:** Many instruments use .dat extension
|
| 521 |
+
**Python Libraries:**
|
| 522 |
+
- Format-specific identification needed
|
| 523 |
+
- `numpy`, `pandas` for known formats
|
| 524 |
+
**EDA Approach:**
|
| 525 |
+
- Format detection (binary vs text)
|
| 526 |
+
- Header identification
|
| 527 |
+
- Data structure inference
|
| 528 |
+
- Units and axis labels
|
| 529 |
+
- Instrument signature detection
|
| 530 |
+
|
| 531 |
+
## Chromatography
|
| 532 |
+
|
| 533 |
+
### .chrom - Chromatogram Data
|
| 534 |
+
**Description:** Generic chromatography format
|
| 535 |
+
**Typical Data:** Retention time vs signal
|
| 536 |
+
**Use Cases:** HPLC, GC, LC-MS
|
| 537 |
+
**Python Libraries:**
|
| 538 |
+
- Vendor-specific parsers
|
| 539 |
+
- `pandas` for text exports
|
| 540 |
+
**EDA Approach:**
|
| 541 |
+
- Retention time range
|
| 542 |
+
- Peak detection and integration
|
| 543 |
+
- Baseline drift
|
| 544 |
+
- Resolution between peaks
|
| 545 |
+
- Signal-to-noise ratio
|
| 546 |
+
- Tailing factor
|
| 547 |
+
|
| 548 |
+
### .ch - ChemStation
|
| 549 |
+
**Description:** Agilent ChemStation format
|
| 550 |
+
**Typical Data:** Chromatograms and method parameters
|
| 551 |
+
**Use Cases:** Agilent HPLC and GC systems
|
| 552 |
+
**Python Libraries:**
|
| 553 |
+
- `agilent-chemstation`: Community tools
|
| 554 |
+
- Binary format parsers
|
| 555 |
+
**EDA Approach:**
|
| 556 |
+
- Method validation
|
| 557 |
+
- Integration parameters
|
| 558 |
+
- Calibration curve
|
| 559 |
+
- Sample sequence information
|
| 560 |
+
- Instrument status
|
| 561 |
+
|
| 562 |
+
### .arw - Empower (Waters)
|
| 563 |
+
**Description:** Waters Empower format
|
| 564 |
+
**Typical Data:** UPLC/HPLC chromatograms
|
| 565 |
+
**Use Cases:** Waters instrument data
|
| 566 |
+
**Python Libraries:**
|
| 567 |
+
- Vendor tools (limited Python access)
|
| 568 |
+
- Database extraction tools
|
| 569 |
+
**EDA Approach:**
|
| 570 |
+
- Audit trail information
|
| 571 |
+
- Processing methods
|
| 572 |
+
- Compound identification
|
| 573 |
+
- Quantitation results
|
| 574 |
+
- System suitability tests
|
| 575 |
+
|
| 576 |
+
### .lcd - Shimadzu LabSolutions
|
| 577 |
+
**Description:** Shimadzu chromatography format
|
| 578 |
+
**Typical Data:** GC/HPLC data
|
| 579 |
+
**Use Cases:** Shimadzu instruments
|
| 580 |
+
**Python Libraries:**
|
| 581 |
+
- Vendor-specific parsers
|
| 582 |
+
**EDA Approach:**
|
| 583 |
+
- Method parameters
|
| 584 |
+
- Peak purity analysis
|
| 585 |
+
- Spectral data (if PDA)
|
| 586 |
+
- Quantitative results
|
| 587 |
+
|
| 588 |
+
## Other Analytical Techniques
|
| 589 |
+
|
| 590 |
+
### .dta - DSC/TGA Data
|
| 591 |
+
**Description:** Thermal analysis data (TA Instruments)
|
| 592 |
+
**Typical Data:** Temperature vs heat flow or mass
|
| 593 |
+
**Use Cases:** Differential scanning calorimetry, thermogravimetry
|
| 594 |
+
**Python Libraries:**
|
| 595 |
+
- Custom parsers for TA formats
|
| 596 |
+
- `pandas` for exported data
|
| 597 |
+
**EDA Approach:**
|
| 598 |
+
- Transition temperature identification
|
| 599 |
+
- Enthalpy calculations
|
| 600 |
+
- Mass loss steps
|
| 601 |
+
- Heating rate effects
|
| 602 |
+
- Baseline determination
|
| 603 |
+
- Purity assessment
|
| 604 |
+
|
| 605 |
+
### .run - ICP-MS/ICP-OES
|
| 606 |
+
**Description:** Elemental analysis data
|
| 607 |
+
**Typical Data:** Element concentrations or counts
|
| 608 |
+
**Use Cases:** Inductively coupled plasma MS/OES
|
| 609 |
+
**Python Libraries:**
|
| 610 |
+
- Vendor-specific tools
|
| 611 |
+
- Custom parsers
|
| 612 |
+
**EDA Approach:**
|
| 613 |
+
- Element detection and quantitation
|
| 614 |
+
- Internal standard performance
|
| 615 |
+
- Spike recovery
|
| 616 |
+
- Dilution factor corrections
|
| 617 |
+
- Isotope ratios
|
| 618 |
+
- LOD/LOQ calculations
|
| 619 |
+
|
| 620 |
+
### .exp - Electrochemistry Data
|
| 621 |
+
**Description:** Electrochemical experiment data
|
| 622 |
+
**Typical Data:** Potential vs current or charge
|
| 623 |
+
**Use Cases:** Cyclic voltammetry, chronoamperometry
|
| 624 |
+
**Python Libraries:**
|
| 625 |
+
- Custom parsers per instrument (CHI, Gamry, etc.)
|
| 626 |
+
- `galvani`: Biologic EC-Lab files
|
| 627 |
+
**EDA Approach:**
|
| 628 |
+
- Redox peak identification
|
| 629 |
+
- Peak potential and current
|
| 630 |
+
- Scan rate effects
|
| 631 |
+
- Electron transfer kinetics
|
| 632 |
+
- Background subtraction
|
| 633 |
+
- Capacitance calculations
|
.scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Exploratory Data Analysis Analyzer
|
| 4 |
+
Analyzes scientific data files and generates comprehensive markdown reports
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def detect_file_type(filepath):
    """
    Classify a file by its (case-insensitive) final extension.

    Only the last suffix of the path is inspected; the file is never opened,
    so the path does not need to exist. Multi-part names such as
    ``archive.tar.gz`` are therefore looked up as ``gz``.

    Args:
        filepath: Path to the file (anything accepted by ``pathlib.Path``).

    Returns:
        tuple: (extension, file_category, description) where ``extension`` is
        the lower-cased suffix without the leading dot. Unrecognized or
        missing extensions yield ``(extension, "unknown", "Unknown Format")``.
    """
    # Map extensions to (category, human-readable description)
    extension_map = {
        # Chemistry/Molecular
        "pdb": ("chemistry_molecular", "Protein Data Bank"),
        "cif": ("chemistry_molecular", "Crystallographic Information File"),
        "mol": ("chemistry_molecular", "MDL Molfile"),
        "mol2": ("chemistry_molecular", "Tripos Mol2"),
        "sdf": ("chemistry_molecular", "Structure Data File"),
        "xyz": ("chemistry_molecular", "XYZ Coordinates"),
        "smi": ("chemistry_molecular", "SMILES String"),
        "smiles": ("chemistry_molecular", "SMILES String"),
        "pdbqt": ("chemistry_molecular", "AutoDock PDBQT"),
        "mae": ("chemistry_molecular", "Maestro Format"),
        "gro": ("chemistry_molecular", "GROMACS Coordinate File"),
        "log": ("chemistry_molecular", "Gaussian Log File"),
        "out": ("chemistry_molecular", "Quantum Chemistry Output"),
        "wfn": ("chemistry_molecular", "Wavefunction Files"),
        "wfx": ("chemistry_molecular", "Wavefunction Files"),
        "fchk": ("chemistry_molecular", "Gaussian Formatted Checkpoint"),
        "cube": ("chemistry_molecular", "Gaussian Cube File"),
        "dcd": ("chemistry_molecular", "Binary Trajectory"),
        "xtc": ("chemistry_molecular", "Compressed Trajectory"),
        "trr": ("chemistry_molecular", "GROMACS Trajectory"),
        "nc": ("chemistry_molecular", "Amber NetCDF Trajectory"),
        "netcdf": ("chemistry_molecular", "Amber NetCDF Trajectory"),
        # Bioinformatics/Genomics
        "fasta": ("bioinformatics_genomics", "FASTA Format"),
        "fa": ("bioinformatics_genomics", "FASTA Format"),
        "fna": ("bioinformatics_genomics", "FASTA Format"),
        "fastq": ("bioinformatics_genomics", "FASTQ Format"),
        "fq": ("bioinformatics_genomics", "FASTQ Format"),
        "sam": ("bioinformatics_genomics", "Sequence Alignment/Map"),
        "bam": ("bioinformatics_genomics", "Binary Alignment/Map"),
        "cram": ("bioinformatics_genomics", "CRAM Format"),
        "bed": ("bioinformatics_genomics", "Browser Extensible Data"),
        "bedgraph": ("bioinformatics_genomics", "BED with Graph Data"),
        "bigwig": ("bioinformatics_genomics", "Binary BigWig"),
        "bw": ("bioinformatics_genomics", "Binary BigWig"),
        "bigbed": ("bioinformatics_genomics", "Binary BigBed"),
        "bb": ("bioinformatics_genomics", "Binary BigBed"),
        "gff": ("bioinformatics_genomics", "General Feature Format"),
        "gff3": ("bioinformatics_genomics", "General Feature Format"),
        "gtf": ("bioinformatics_genomics", "Gene Transfer Format"),
        "vcf": ("bioinformatics_genomics", "Variant Call Format"),
        "bcf": ("bioinformatics_genomics", "Binary VCF"),
        "gvcf": ("bioinformatics_genomics", "Genomic VCF"),
        # Microscopy/Imaging
        "tif": ("microscopy_imaging", "Tagged Image File Format"),
        "tiff": ("microscopy_imaging", "Tagged Image File Format"),
        "nd2": ("microscopy_imaging", "Nikon NIS-Elements"),
        "lif": ("microscopy_imaging", "Leica Image Format"),
        "czi": ("microscopy_imaging", "Carl Zeiss Image"),
        "oib": ("microscopy_imaging", "Olympus Image Format"),
        "oif": ("microscopy_imaging", "Olympus Image Format"),
        "vsi": ("microscopy_imaging", "Olympus VSI"),
        "ims": ("microscopy_imaging", "Imaris Format"),
        "lsm": ("microscopy_imaging", "Zeiss LSM"),
        "stk": ("microscopy_imaging", "MetaMorph Stack"),
        "dv": ("microscopy_imaging", "DeltaVision"),
        "mrc": ("microscopy_imaging", "Medical Research Council"),
        "dm3": ("microscopy_imaging", "Gatan Digital Micrograph"),
        "dm4": ("microscopy_imaging", "Gatan Digital Micrograph"),
        "dcm": ("microscopy_imaging", "DICOM"),
        "nii": ("microscopy_imaging", "NIfTI"),
        "nrrd": ("microscopy_imaging", "Nearly Raw Raster Data"),
        # Spectroscopy/Analytical
        "fid": ("spectroscopy_analytical", "NMR Free Induction Decay"),
        "mzml": ("spectroscopy_analytical", "Mass Spectrometry Markup Language"),
        "mzxml": ("spectroscopy_analytical", "Mass Spectrometry XML"),
        "raw": ("spectroscopy_analytical", "Vendor Raw Files"),
        "d": ("spectroscopy_analytical", "Agilent Data Directory"),
        "mgf": ("spectroscopy_analytical", "Mascot Generic Format"),
        "spc": ("spectroscopy_analytical", "Galactic SPC"),
        "jdx": ("spectroscopy_analytical", "JCAMP-DX"),
        "jcamp": ("spectroscopy_analytical", "JCAMP-DX"),
        # Proteomics/Metabolomics
        "pepxml": ("proteomics_metabolomics", "Trans-Proteomic Pipeline Peptide XML"),
        "protxml": ("proteomics_metabolomics", "Protein Inference Results"),
        "mzid": ("proteomics_metabolomics", "Peptide Identification Format"),
        "mztab": ("proteomics_metabolomics", "Proteomics/Metabolomics Tabular Format"),
        # General Scientific
        "npy": ("general_scientific", "NumPy Array"),
        "npz": ("general_scientific", "Compressed NumPy Archive"),
        "csv": ("general_scientific", "Comma-Separated Values"),
        "tsv": ("general_scientific", "Tab-Separated Values"),
        "xlsx": ("general_scientific", "Excel Spreadsheets"),
        "xls": ("general_scientific", "Excel Spreadsheets"),
        "json": ("general_scientific", "JavaScript Object Notation"),
        "xml": ("general_scientific", "Extensible Markup Language"),
        "hdf5": ("general_scientific", "Hierarchical Data Format 5"),
        "h5": ("general_scientific", "Hierarchical Data Format 5"),
        "h5ad": ("bioinformatics_genomics", "Anndata Format"),
        "zarr": ("general_scientific", "Chunked Array Storage"),
        "parquet": ("general_scientific", "Apache Parquet"),
        "mat": ("general_scientific", "MATLAB Data"),
        "fits": ("general_scientific", "Flexible Image Transport System"),
    }

    # Path.suffix keeps the leading dot; the map keys are bare and lower-case.
    ext_clean = Path(filepath).suffix.lower().lstrip(".")
    category, description = extension_map.get(
        ext_clean, ("unknown", "Unknown Format")
    )
    return ext_clean, category, description
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def get_file_basic_info(filepath):
    """Collect filesystem metadata for ``filepath``.

    Args:
        filepath: Path of the file to describe (anything ``Path`` accepts).

    Returns:
        dict: ``filename``, absolute ``path``, ``size_bytes``, ``size_human``
        (rendered by ``format_bytes``), ``modified`` (ISO-8601 string built
        from the file's mtime) and ``extension`` (the lower-cased path
        suffix).
    """
    target = Path(filepath)
    file_stat = target.stat()
    byte_count = file_stat.st_size
    modified_at = datetime.fromtimestamp(file_stat.st_mtime)

    info = {}
    info["filename"] = target.name
    info["path"] = str(target.absolute())
    info["size_bytes"] = byte_count
    info["size_human"] = format_bytes(byte_count)
    info["modified"] = modified_at.isoformat()
    info["extension"] = target.suffix.lower()
    return info
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def format_bytes(size):
    """Render a byte count as a string with two decimals and a unit.

    The value is divided by 1024 until it drops below 1024; units run from
    ``B`` through ``TB``, and anything larger is expressed in ``PB``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    index = 0
    while index < len(units):
        if size < 1024.0:
            return f"{size:.2f} {units[index]}"
        size = size / 1024.0
        index += 1
    # Fell through every unit up to TB: report whatever remains in PB.
    return f"{size:.2f} PB"
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _extract_format_section(content, extension):
    """Return the ``### .<extension>`` section of a reference document, or None."""
    import re

    token = re.escape(extension.lstrip("."))
    if not token:
        return None

    flags = re.IGNORECASE | re.DOTALL | re.MULTILINE
    # The section runs up to the next "###" (or end of text). Using ".*?"
    # instead of "[^#]*?" keeps the lookup working when the section body
    # itself contains a "#" character.
    body = r".*?(?=###|\Z)"
    heading_patterns = (
        # Heading that starts with this extension. "(?!\w)" stops a short
        # extension from matching a longer one (e.g. "h5" vs ".h5ad").
        rf"### \.{token}(?!\w){body}",
        # Otherwise accept a heading line that lists the extension as one
        # of several tokens on the same line.
        rf"^###[^\n]*?(?<![\w.])\.{token}(?!\w){body}",
    )
    for pattern in heading_patterns:
        match = re.search(pattern, content, flags)
        if match:
            return match.group(0)
    return None


def load_reference_info(category, extension):
    """
    Load reference information for the file type.

    Looks up the markdown reference file for ``category`` in the
    ``references`` directory next to this script's parent directory and
    extracts the section whose ``###`` heading names ``extension``.

    Args:
        category: File category (e.g., 'chemistry_molecular')
        extension: File extension (a leading dot is tolerated)

    Returns:
        dict: ``{"raw_section": <section text>, "reference_file": <file name>}``,
        or None when the category is unknown, the reference file is missing
        or unreadable, or no matching section exists.
    """
    # Map categories to reference files
    category_files = {
        "chemistry_molecular": "chemistry_molecular_formats.md",
        "bioinformatics_genomics": "bioinformatics_genomics_formats.md",
        "microscopy_imaging": "microscopy_imaging_formats.md",
        "spectroscopy_analytical": "spectroscopy_analytical_formats.md",
        "proteomics_metabolomics": "proteomics_metabolomics_formats.md",
        "general_scientific": "general_scientific_formats.md",
    }

    reference_name = category_files.get(category)
    if reference_name is None:
        return None

    # Get the reference file path
    ref_file = Path(__file__).parent.parent / "references" / reference_name
    if not ref_file.exists():
        return None

    try:
        # Explicit UTF-8 so the result does not depend on the platform locale.
        content = ref_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error loading reference: {e}", file=sys.stderr)
        return None

    section = _extract_format_section(content, extension)
    if section is None:
        return None
    return {"raw_section": section, "reference_file": reference_name}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def analyze_file(filepath):
    """
    Run the full analysis for one file and dispatch to a category analyzer.

    Gathers basic file information, the detected file type and any matching
    reference section, then calls the analyzer registered for the detected
    category (if there is one).

    Returns:
        dict: ``basic_info``, ``file_type``, ``reference_info`` and
        ``data_analysis``. An exception raised by the analyzer is recorded
        as ``data_analysis["error"]`` rather than propagated.
    """
    file_details = get_file_basic_info(filepath)
    extension, category, description = detect_file_type(filepath)
    reference = load_reference_info(category, extension)

    analysis = {
        "basic_info": file_details,
        "file_type": {
            "extension": extension,
            "category": category,
            "description": description,
        },
        "reference_info": reference,
        "data_analysis": {},
    }

    # Categories without an entry here keep the empty ``data_analysis`` dict.
    try:
        analyzers = {
            "general_scientific": analyze_general_scientific,
            "bioinformatics_genomics": analyze_bioinformatics,
            "microscopy_imaging": analyze_imaging,
        }
        analyzer = analyzers.get(category)
        if analyzer is not None:
            analysis["data_analysis"] = analyzer(filepath, extension)
    except Exception as exc:
        analysis["data_analysis"]["error"] = str(exc)

    return analysis
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def analyze_general_scientific(filepath, extension):
    """Analyze general scientific data formats.

    Handles ``npy``, ``npz``, ``csv``/``tsv``, ``json`` and ``h5``/``hdf5``.
    Any other extension yields an empty dict.

    Args:
        filepath: Path of the file to inspect.
        extension: File extension without the leading dot.

    Returns:
        dict: Format-specific summary. Failures are reported through an
        ``"error"`` key instead of being raised.
    """
    results = {}

    try:
        if extension in ["npy"]:
            import numpy as np

            data = np.load(filepath)
            # Reductions such as min/max raise on zero-size arrays, so only
            # summarize numeric arrays that actually hold values.
            summarizable = bool(np.issubdtype(data.dtype, np.number)) and data.size > 0

            def summarize(reducer):
                return float(reducer(data)) if summarizable else None

            results = {
                "shape": data.shape,
                "dtype": str(data.dtype),
                "size": data.size,
                "ndim": data.ndim,
                "statistics": {
                    "min": summarize(np.min),
                    "max": summarize(np.max),
                    "mean": summarize(np.mean),
                    "std": summarize(np.std),
                },
            }

        elif extension in ["npz"]:
            import numpy as np

            # The archive object holds an open file handle; close it when done.
            with np.load(filepath) as archive:
                names = list(archive.files)
                results = {
                    "arrays": names,
                    "array_count": len(names),
                    "array_shapes": {name: archive[name].shape for name in names},
                }

        elif extension in ["csv", "tsv"]:
            import pandas as pd

            sep = "\t" if extension == "tsv" else ","
            df = pd.read_csv(filepath, sep=sep, nrows=10000)  # Sample first 10k rows

            results = {
                "shape": df.shape,
                "columns": list(df.columns),
                "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
                "missing_values": df.isnull().sum().to_dict(),
                "summary_statistics": (
                    df.describe().to_dict()
                    if len(df.select_dtypes(include="number").columns) > 0
                    else {}
                ),
            }

        elif extension in ["json"]:
            # Read as UTF-8 explicitly rather than with the platform default.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            results = {
                "type": type(data).__name__,
                "keys": list(data.keys()) if isinstance(data, dict) else None,
                "length": len(data) if isinstance(data, (list, dict)) else None,
            }

        elif extension in ["h5", "hdf5"]:
            import h5py

            with h5py.File(filepath, "r") as f:

                def get_structure(group, prefix=""):
                    # Walk the group recursively, keyed by slash-joined path.
                    items = {}
                    for key in group.keys():
                        path = f"{prefix}/{key}"
                        if isinstance(group[key], h5py.Dataset):
                            items[path] = {
                                "type": "dataset",
                                "shape": group[key].shape,
                                "dtype": str(group[key].dtype),
                            }
                        elif isinstance(group[key], h5py.Group):
                            items[path] = {"type": "group"}
                            items.update(get_structure(group[key], path))
                    return items

                results = {"structure": get_structure(f), "attributes": dict(f.attrs)}

    except ImportError as e:
        results["error"] = f"Required library not installed: {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def analyze_bioinformatics(filepath, extension):
    """Analyze bioinformatics/genomics formats.

    Handles FASTA (``fasta``/``fa``/``fna``) and FASTQ (``fastq``/``fq``)
    through Biopython's ``SeqIO``. Any other extension yields an empty dict.

    Args:
        filepath: Path of the sequence file.
        extension: File extension without the leading dot.

    Returns:
        dict: Sequence/read counts and length statistics. FASTQ results
        cover at most the first 10,000 reads. Failures are reported through
        an ``"error"`` key instead of being raised.
    """
    results = {}

    try:
        if extension in ["fasta", "fa", "fna"]:
            from Bio import SeqIO

            # Stream the records: only lengths and the first ten IDs are
            # needed, so the sequences themselves are not kept in memory.
            lengths = []
            first_ids = []
            for record in SeqIO.parse(filepath, "fasta"):
                lengths.append(len(record))
                if len(first_ids) < 10:
                    first_ids.append(record.id)

            results = {
                "sequence_count": len(lengths),
                "total_length": sum(lengths),
                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
                "sequence_ids": first_ids,  # First 10
            }

        elif extension in ["fastq", "fq"]:
            from itertools import islice

            from Bio import SeqIO

            lengths = []
            qualities = []
            # Sample first 10k reads
            for record in islice(SeqIO.parse(filepath, "fastq"), 10000):
                read_length = len(record)
                lengths.append(read_length)
                # A zero-length read has no per-base qualities; skip it here
                # instead of dividing by zero and losing the whole analysis.
                if read_length:
                    qualities.append(
                        sum(record.letter_annotations["phred_quality"]) / read_length
                    )

            results = {
                "read_count_sampled": len(lengths),
                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
                "mean_quality": sum(qualities) / len(qualities) if qualities else 0,
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
            }

    except ImportError as e:
        results["error"] = f"Required library not installed (try: pip install biopython): {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def analyze_imaging(filepath, extension):
    """Analyze microscopy/imaging formats.

    Handles ``tif``/``tiff``/``png``/``jpg``/``jpeg`` through Pillow and
    NumPy. Any other extension yields an empty dict.

    Args:
        filepath: Path of the image file.
        extension: File extension without the leading dot.

    Returns:
        dict: Image size, mode, format, array shape/dtype, value range and
        mean intensity, plus ``page_count`` for TIFF files. Failures are
        reported through an ``"error"`` key instead of being raised.
    """
    results = {}

    try:
        if extension in ["tif", "tiff", "png", "jpg", "jpeg"]:
            import numpy as np
            from PIL import Image

            # Use the image as a context manager so the underlying file
            # handle is released even if the analysis fails part-way.
            with Image.open(filepath) as img:
                img_array = np.array(img)

                results = {
                    "size": img.size,
                    "mode": img.mode,
                    "format": img.format,
                    "shape": img_array.shape,
                    "dtype": str(img_array.dtype),
                    "value_range": [int(img_array.min()), int(img_array.max())],
                    "mean_intensity": float(img_array.mean()),
                }

                # Check for multi-page TIFF
                if extension in ["tif", "tiff"]:
                    try:
                        frame_count = 0
                        # Seek frame by frame; EOFError marks the end.
                        while True:
                            img.seek(frame_count)
                            frame_count += 1
                    except EOFError:
                        results["page_count"] = frame_count

    except ImportError as e:
        results["error"] = f"Required library not installed (try: pip install pillow): {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
def generate_markdown_report(analysis, output_path=None):
    """
    Generate a comprehensive markdown report from analysis results.

    Args:
        analysis: Analysis results dictionary with ``basic_info`` and
            ``file_type`` entries and optional ``reference_info`` and
            ``data_analysis`` entries.
        output_path: Path to save the report (if None, prints to stdout)

    Returns:
        str: The full markdown text of the report.
    """
    lines = []

    # Title
    filename = analysis["basic_info"]["filename"]
    lines.append(f"# Exploratory Data Analysis Report: {filename}\n")
    lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    lines.append("---\n")

    # Basic Information
    lines.append("## Basic Information\n")
    basic = analysis["basic_info"]
    lines.append(f"- **Filename:** `{basic['filename']}`")
    lines.append(f"- **Full Path:** `{basic['path']}`")
    lines.append(f"- **File Size:** {basic['size_human']} ({basic['size_bytes']:,} bytes)")
    lines.append(f"- **Last Modified:** {basic['modified']}")
    lines.append(f"- **Extension:** `.{analysis['file_type']['extension']}`\n")

    # File Type Information
    lines.append("## File Type\n")
    ft = analysis["file_type"]
    lines.append(f"- **Category:** {ft['category'].replace('_', ' ').title()}")
    lines.append(f"- **Description:** {ft['description']}\n")

    # Reference Information
    if analysis.get("reference_info"):
        lines.append("## Format Reference\n")
        ref = analysis["reference_info"]
        if "raw_section" in ref:
            lines.append(ref["raw_section"])
            lines.append(f"\n*Reference: {ref['reference_file']}*\n")

    # Data Analysis
    if analysis.get("data_analysis"):
        lines.append("## Data Analysis\n")
        data = analysis["data_analysis"]

        if "error" in data:
            lines.append(f"⚠️ **Analysis Error:** {data['error']}\n")
        else:
            # Dump whatever the analyzer produced as a JSON block;
            # default=str stringifies values json cannot serialize itself.
            lines.append("### Summary Statistics\n")
            lines.append("```json")
            lines.append(json.dumps(data, indent=2, default=str))
            lines.append("```\n")

    # Recommendations
    lines.append("## Recommendations for Further Analysis\n")
    lines.append(
        f"Based on the file type (`.{analysis['file_type']['extension']}`), consider the following analyses:\n"
    )

    # Category-specific recommendations; other categories get none.
    recommendations = {
        "general_scientific": (
            "- Statistical distribution analysis",
            "- Missing value imputation strategies",
            "- Correlation analysis between variables",
            "- Outlier detection and handling",
            "- Dimensionality reduction (PCA, t-SNE)",
        ),
        "bioinformatics_genomics": (
            "- Sequence quality control and filtering",
            "- GC content analysis",
            "- Read alignment and mapping statistics",
            "- Variant calling and annotation",
            "- Differential expression analysis",
        ),
        "microscopy_imaging": (
            "- Image quality assessment",
            "- Background correction and normalization",
            "- Segmentation and object detection",
            "- Colocalization analysis",
            "- Intensity measurements and quantification",
        ),
    }
    lines.extend(recommendations.get(analysis["file_type"]["category"], ()))

    lines.append("")

    # Footer
    lines.append("---")
    lines.append("*This report was generated by the exploratory-data-analysis skill.*")

    report = "\n".join(lines)

    if output_path:
        # The report can contain non-ASCII text (the warning emoji, file
        # names, reference sections), so write UTF-8 explicitly instead of
        # relying on the platform default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report)
        print(f"Report saved to: {output_path}")
    else:
        print(report)

    return report
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def main():
    """Command-line entry point: analyze one file and write its report.

    Expects the data file path as the first argument and an optional report
    path as the second. Exits with status 1 when no path is given or the
    file does not exist. Without an explicit report path the report is
    written next to the input as ``<stem>_eda_report.md``.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python eda_analyzer.py <filepath> [output.md]")
        print("  filepath: Path to the data file to analyze")
        print("  output.md: Optional output path for markdown report")
        sys.exit(1)

    filepath = cli_args[0]
    output_path = cli_args[1] if len(cli_args) > 1 else None

    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}")
        sys.exit(1)

    # Default the report location to a sibling of the input file.
    if output_path is None:
        source = Path(filepath)
        output_path = source.parent / f"{source.stem}_eda_report.md"

    print(f"Analyzing: {filepath}")
    analysis = analyze_file(filepath)

    print("\nGenerating report...")
    generate_markdown_report(analysis, output_path)

    print("\n✓ Analysis complete!")


if __name__ == "__main__":
    main()
|
.scider/skills/literature-review-agent/SKILL.md
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: literature-review-agent
|
| 3 |
+
description: Step 3 of the PaperOrchestra pipeline (arXiv:2604.05018). Execute the literature search strategy from outline.json — discover candidate papers via web search, verify them through Semantic Scholar (Levenshtein ≥ 70 fuzzy title match, temporal cutoff, dedup by paperId), build a BibTeX file, and draft Introduction + Related Work using ≥90% of the verified pool. Runs in parallel with the plotting-agent. TRIGGER when the orchestrator delegates Step 3 or when the user asks to "find citations for my paper", "draft the related work", or "build the bibliography".
|
| 4 |
+
allowed_agents: [writing]
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Literature Review Agent (Step 3)
|
| 8 |
+
|
| 9 |
+
Faithful implementation of the Hybrid Literature Agent from PaperOrchestra
|
| 10 |
+
(Song et al., 2026, arXiv:2604.05018, §4 Step 3, App. D.3, App. F.1 p.46).
|
| 11 |
+
|
| 12 |
+
**Cost: ~20–30 LLM calls.** This is one of the two longest steps (the other is
|
| 13 |
+
plotting). Wall-time floor is set by Semantic Scholar's 1 QPS verification
|
| 14 |
+
limit.
|
| 15 |
+
|
| 16 |
+
## Inputs
|
| 17 |
+
|
| 18 |
+
- `workspace/outline.json` — specifically `intro_related_work_plan` with the
|
| 19 |
+
Introduction search directions and the 2-4 Related Work methodology
|
| 20 |
+
clusters
|
| 21 |
+
- `workspace/inputs/conference_guidelines.md` — used to derive `cutoff_date`
|
| 22 |
+
- `workspace/inputs/idea.md`, `workspace/inputs/experimental_log.md` — for
|
| 23 |
+
framing the Intro and grounding the Related Work positioning
|
| 24 |
+
|
| 25 |
+
## Outputs
|
| 26 |
+
|
| 27 |
+
- `workspace/citation_pool.json` — verified Semantic Scholar metadata for
|
| 28 |
+
every paper that survived verification
|
| 29 |
+
- `workspace/refs.bib` — BibTeX file generated from the verified pool
|
| 30 |
+
- `workspace/drafts/intro_relwork.tex` — drafted Introduction and Related
|
| 31 |
+
Work sections, written into the template, with the rest of the template
|
| 32 |
+
preserved verbatim
|
| 33 |
+
|
| 34 |
+
## Two-phase pipeline (App. D.3)
|
| 35 |
+
|
| 36 |
+
```
|
| 37 |
+
PHASE 1 — Parallel Candidate Discovery
|
| 38 |
+
For each search direction in introduction_strategy.search_directions:
|
| 39 |
+
For each sota_investigation_mission and limitation_search_query in each related_work cluster:
|
| 40 |
+
- Use the host's web search tool to discover up to ~10 candidate papers.
|
| 41 |
+
- Run up to 10 discovery queries in parallel (host-permitting).
|
| 42 |
+
- Collect (title, snippet, url) tuples — no verification yet.
|
| 43 |
+
→ PRE-DEDUP before Phase 2 (see Step 1.5 below)
|
| 44 |
+
|
| 45 |
+
PHASE 2 — Sequential Citation Verification (1 QPS, with cache)
|
| 46 |
+
For each candidate (after pre-dedup), sequentially:
|
| 47 |
+
0. Check s2_cache.json first (scripts/s2_cache.py --check).
|
| 48 |
+
If HIT: use cached response, skip live S2 call. No throttle needed.
|
| 49 |
+
If MISS: proceed with live request below.
|
| 50 |
+
1. Query Semantic Scholar by title:
|
| 51 |
+
GET https://api.semanticscholar.org/graph/v1/paper/search?query=<title>
|
| 52 |
+
&fields=title,abstract,year,authors,venue,externalIds&limit=5
|
| 53 |
+
(Public endpoint; an API key is optional. Throttle to 1 QPS for live requests only.)
|
| 54 |
+
2. Store the S2 response in cache: s2_cache.py --store.
|
| 55 |
+
3. Pick the top hit. Check Levenshtein title ratio against the original
|
| 56 |
+
candidate title. If ratio < 70: discard.
|
| 57 |
+
4. Bonus: if year and venue exactly align with hints, add a +5 point
|
| 58 |
+
match-quality bonus.
|
| 59 |
+
5. Require: abstract is non-empty.
|
| 60 |
+
6. Require: paper.year (or month if known) strictly predates cutoff_date.
|
| 61 |
+
Months default to day-1: e.g., "October 2024" → 2024-10-01.
|
| 62 |
+
7. If all checks pass, add to verified pool.
|
| 63 |
+
After all candidates are verified, dedup by Semantic Scholar paperId.
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
The host agent does the LLM/web work; the deterministic helpers in `scripts/`
|
| 67 |
+
do the math.
|
| 68 |
+
|
| 69 |
+
## Step-by-step
|
| 70 |
+
|
| 71 |
+
### 0. Derive `cutoff_date`
|
| 72 |
+
|
| 73 |
+
Parse `conference_guidelines.md` for the submission deadline. The paper aligns
|
| 74 |
+
research cutoff with venue submission deadline (App. D.1):
|
| 75 |
+
|
| 76 |
+
| Venue | Cutoff |
|
| 77 |
+
|---|---|
|
| 78 |
+
| CVPR 2025 | Nov 2024 |
|
| 79 |
+
| ICLR 2025 | Oct 2024 |
|
| 80 |
+
| Other | One month before the stated submission deadline |
|
| 81 |
+
|
| 82 |
+
Encode as `YYYY-MM-DD`. Months default to day-1 (e.g., `2024-10-01`).
|
| 83 |
+
|
| 84 |
+
### 1. Phase 1: Parallel Candidate Discovery
|
| 85 |
+
|
| 86 |
+
From `outline.json`:
|
| 87 |
+
|
| 88 |
+
- All `introduction_strategy.search_directions` (3-5 queries)
|
| 89 |
+
- For each cluster in `related_work_strategy.subsections`:
|
| 90 |
+
- The cluster's `sota_investigation_mission` becomes a search query
|
| 91 |
+
- All `limitation_search_queries` (1-3 each)
|
| 92 |
+
|
| 93 |
+
For each query, **use your host's web search tool** (e.g., `WebSearch` in
|
| 94 |
+
Claude Code, `@web` in Cursor, the search tool in Antigravity). Collect the
|
| 95 |
+
top ~10 candidates per query: title, abstract snippet, source URL.
|
| 96 |
+
|
| 97 |
+
If your host supports parallel sub-tasks, fire up to 10 concurrent search
|
| 98 |
+
queries. If not, run sequentially — slower but functionally equivalent.
|
| 99 |
+
|
| 100 |
+
#### Optional: Exa as a Phase 1 backend
|
| 101 |
+
|
| 102 |
+
If your host has no native web search, OR you want a research-paper-focused
|
| 103 |
+
backend with better signal-to-noise, you can use [Exa](https://exa.ai) via
|
| 104 |
+
the bundled `scripts/exa_search.py` helper. It is **opt-in** and reads
|
| 105 |
+
`EXA_API_KEY` from the environment — the repo never commits a key.
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
export EXA_API_KEY="your-key-here" # get one at https://dashboard.exa.ai/
|
| 109 |
+
python skills/literature-review-agent/scripts/exa_search.py \
|
| 110 |
+
--query "Sparse attention long context transformers" \
|
| 111 |
+
--num-results 15 \
|
| 112 |
+
--discovered-for "related_work[2.1]"
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
Output is a normalized candidate list ready to merge into
|
| 116 |
+
`raw_candidates.json`. Phase 2 verification (Semantic Scholar fuzzy match,
|
| 117 |
+
cutoff, dedup) is unchanged. See `references/exa-search-cookbook.md` for
|
| 118 |
+
the full recipe, query patterns, cost estimates, and security notes.
|
| 119 |
+
|
| 120 |
+
Combine all discovered candidates into a single working list. Tag each with
|
| 121 |
+
the originating query ID so you can later attribute it to "intro" vs
|
| 122 |
+
"related_work[i]".
|
| 123 |
+
|
| 124 |
+
### 1.5. Pre-dedup before Phase 2
|
| 125 |
+
|
| 126 |
+
**Always run this before starting Phase 2.** Multiple search queries routinely
|
| 127 |
+
return the same papers (e.g., "Attention is All You Need" appears in almost
|
| 128 |
+
every NLP discovery query). Verifying duplicates wastes 30-40% of S2 quota
|
| 129 |
+
at 1 QPS.
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
python skills/literature-review-agent/scripts/pre_dedup_candidates.py \
|
| 133 |
+
--in workspace/raw_candidates.json \
|
| 134 |
+
--out workspace/deduped_candidates.json
|
| 135 |
+
# Prints: "150 candidates → 97 unique (53 duplicates removed)"
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
Use `workspace/deduped_candidates.json` as input to Phase 2.
|
| 139 |
+
|
| 140 |
+
### 2. Phase 2: Sequential Verification via Semantic Scholar (with cache)
|
| 141 |
+
|
| 142 |
+
For each candidate in `deduped_candidates.json`, in **sequential** order:
|
| 143 |
+
|
| 144 |
+
**Step A — check cache first** (no S2 call, no throttle needed):
|
| 145 |
+
```bash
|
| 146 |
+
python skills/literature-review-agent/scripts/s2_cache.py \
|
| 147 |
+
--cache workspace/cache/s2_cache.json \
|
| 148 |
+
--check "<candidate title>"
|
| 149 |
+
# exit 0 + prints JSON → use cached response, skip Step B
|
| 150 |
+
# exit 1 → proceed to Step B
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
**Step B — live S2 request** (cache MISS only, throttle to 1 QPS):
|
| 154 |
+
|
| 155 |
+
**Preferred:** use the bundled `scripts/s2_search.py` helper — it handles
|
| 156 |
+
auth, retries, and 429 back-off automatically:
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
python skills/literature-review-agent/scripts/s2_search.py \
|
| 160 |
+
--query "<URL-decoded candidate title>" --limit 5
|
| 161 |
+
# If SEMANTIC_SCHOLAR_API_KEY is set the key is forwarded automatically.
|
| 162 |
+
# If not, the public unauthenticated endpoint is used (≤1 QPS, still works).
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
Check whether the key is configured before starting Phase 2:
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
python skills/literature-review-agent/scripts/s2_search.py --check-key
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
**Fallback:** if you prefer your host's URL fetch tool, GET:
|
| 172 |
+
```
|
| 173 |
+
https://api.semanticscholar.org/graph/v1/paper/search?query=<URL-encoded title>&limit=5&fields=title,abstract,year,authors,venue,externalIds
|
| 174 |
+
```
|
| 175 |
+
Add header `x-api-key: <SEMANTIC_SCHOLAR_API_KEY>` if the env var is set.
|
| 176 |
+
Be polite: ≤1 request per second for live requests. Cache hits are free.
|
| 177 |
+
|
| 178 |
+
**Step C — store in cache** (after every successful live request):
|
| 179 |
+
```bash
|
| 180 |
+
python skills/literature-review-agent/scripts/s2_cache.py \
|
| 181 |
+
--cache workspace/cache/s2_cache.json \
|
| 182 |
+
--store "<candidate title>" \
|
| 183 |
+
--response '<full S2 JSON response>'
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
For the top hit:
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
python skills/literature-review-agent/scripts/levenshtein_match.py \
|
| 190 |
+
--candidate "Original candidate title" \
|
| 191 |
+
--found "S2 returned title"
|
| 192 |
+
# prints integer 0-100. Discard if < 70.
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
Then check the temporal cutoff:
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
python skills/literature-review-agent/scripts/check_cutoff.py \
|
| 199 |
+
--paper-year 2024 \
|
| 200 |
+
--paper-month 9 \
|
| 201 |
+
--cutoff 2024-10-01
|
| 202 |
+
# exit 0 if strictly predates, exit 1 if not
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
If both checks pass AND the abstract is non-empty, append the paper's full
|
| 206 |
+
S2 metadata to the verified pool.
|
| 207 |
+
|
| 208 |
+
### 3. Dedup and assemble the pool
|
| 209 |
+
|
| 210 |
+
After all candidates are verified:
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
python skills/literature-review-agent/scripts/dedupe_by_id.py \
|
| 214 |
+
--in raw_pool.json \
|
| 215 |
+
--out workspace/citation_pool.json
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
The dedupe script keys on `paperId` (Semantic Scholar's internal unique ID),
|
| 219 |
+
falling back to `externalIds.DOI`, then `externalIds.ArXiv`, then a
|
| 220 |
+
normalized title.
|
| 221 |
+
|
| 222 |
+
The script also computes and writes `min_cite_paper_count` =
|
| 223 |
+
`floor(0.9 * len(papers))` — the minimum number of papers the writing step
|
| 224 |
+
must cite (the paper's ≥90% integration rule, App. D.3).
|
| 225 |
+
|
| 226 |
+
**Immediately after dedupe_by_id.py**, validate and auto-fix the pool schema:
|
| 227 |
+
|
| 228 |
+
```bash
|
| 229 |
+
python skills/literature-review-agent/scripts/validate_pool.py \
|
| 230 |
+
--pool workspace/citation_pool.json --fix
|
| 231 |
+
# Catches and fixes authors-as-strings, reports missing required fields.
|
| 232 |
+
# Must pass before proceeding to Step 4.
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### 4. Build the BibTeX file
|
| 236 |
+
|
| 237 |
+
```bash
|
| 238 |
+
python skills/literature-review-agent/scripts/bibtex_format.py \
|
| 239 |
+
--pool workspace/citation_pool.json \
|
| 240 |
+
--out workspace/refs.bib
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
The script generates citation keys deterministically from `firstauthor + year
|
| 244 |
+
+ first significant word of title` (e.g., `vaswani2017attention`). It writes
|
| 245 |
+
out only `@article` / `@inproceedings` / `@misc` entries — never invents
|
| 246 |
+
fields. It also writes the canonical `bibtex_key` back into each paper record
|
| 247 |
+
in `citation_pool.json`.
|
| 248 |
+
|
| 249 |
+
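**Immediately after bibtex_format.py**, sync keys in `intro_relwork.tex` (this needs an existing draft; on a first pass the file is only produced in Step 5, so run the sync once it exists):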
|
| 250 |
+
|
| 251 |
+
```bash
|
| 252 |
+
python skills/literature-review-agent/scripts/sync_keys.py \
|
| 253 |
+
--pool workspace/citation_pool.json \
|
| 254 |
+
--tex workspace/drafts/intro_relwork.tex \
|
| 255 |
+
--inplace
|
| 256 |
+
# Replaces every \cite{agent_key} with \cite{canonical_bibtex_key}.
|
| 257 |
+
# Eliminates citation_coverage gate failures caused by key mismatch.
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
These two steps replace the manual Python snippets that were previously
|
| 261 |
+
required. The pipeline is now:
|
| 262 |
+
|
| 263 |
+
```
|
| 264 |
+
dedupe_by_id → validate_pool --fix → bibtex_format → sync_keys
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
### 5. Draft Introduction + Related Work
|
| 268 |
+
|
| 269 |
+
This is where you (the host agent) actually write text. Load the
|
| 270 |
+
**verbatim Literature Review Agent prompt** at `references/prompt.md`.
|
| 271 |
+
Substitute the template placeholders:
|
| 272 |
+
|
| 273 |
+
| Placeholder | Value |
|
| 274 |
+
|---|---|
|
| 275 |
+
| `intro_related_work_plan` | full JSON object from `outline.json` |
|
| 276 |
+
| `project_idea` | contents of `idea.md` |
|
| 277 |
+
| `project_experimental_log` | contents of `experimental_log.md` |
|
| 278 |
+
| `citation_checklist` | the BibTeX keys from `refs.bib` |
|
| 279 |
+
| `collected_papers` | list of `{key, title, abstract}` from `citation_pool.json` |
|
| 280 |
+
| `paper_count` | `len(citation_pool.papers)` |
|
| 281 |
+
| `min_cite_paper_count` | from `citation_pool.json` |
|
| 282 |
+
| `cutoff_date` | the date you derived in Step 0 |
|
| 283 |
+
|
| 284 |
+
**Also prepend the Anti-Leakage Prompt** from
|
| 285 |
+
`../paper-orchestra/references/anti-leakage-prompt.md`.
|
| 286 |
+
|
| 287 |
+
Run your LLM with the combined prompt against `template.tex`. The agent's
|
| 288 |
+
job is to fill in the empty Introduction and Related Work sections of the
|
| 289 |
+
template **and leave everything else untouched**. Output: the full
|
| 290 |
+
`template.tex` with those two sections filled. Save to
|
| 291 |
+
`workspace/drafts/intro_relwork.tex`.
|
| 292 |
+
|
| 293 |
+
### 6. Verify ≥90% citation coverage
|
| 294 |
+
|
| 295 |
+
```bash
|
| 296 |
+
python skills/literature-review-agent/scripts/citation_coverage.py \
|
| 297 |
+
--tex workspace/drafts/intro_relwork.tex \
|
| 298 |
+
--pool workspace/citation_pool.json
|
| 299 |
+
# exit 0 if ≥90% of pool is cited; exit 1 otherwise
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
If the gate fails, re-prompt the writing step explicitly listing the missing
|
| 303 |
+
keys and asking the agent to integrate them where contextually appropriate.
|
| 304 |
+
|
| 305 |
+
## Critical rules from the prompt
|
| 306 |
+
|
| 307 |
+
These are excerpted from `references/prompt.md`. The host agent MUST honor
|
| 308 |
+
them on the writing call:
|
| 309 |
+
|
| 310 |
+
- **Cite ONLY from `collected_papers`.** Never invent BibTeX keys, never
|
| 311 |
+
reference papers not in the pool.
|
| 312 |
+
- **Cite at least `min_cite_paper_count` of them** in Intro + Related Work
|
| 313 |
+
combined.
|
| 314 |
+
- **TIMELINE RULE**: Do not treat any papers published after `cutoff_date`
|
| 315 |
+
as prior baselines to beat. They are concurrent work only.
|
| 316 |
+
- **EVALUATION RULE**: Do not claim our method beats / achieves SOTA over a
|
| 317 |
+
specific cited paper UNLESS that paper is explicitly evaluated against in
|
| 318 |
+
`experimental_log.md`. Frame other recent papers strictly as concurrent,
|
| 319 |
+
orthogonal, or conceptual work.
|
| 320 |
+
- **Output format**: return the full code for the updated `template.tex`,
|
| 321 |
+
with the two empty sections (Introduction and Related Work) filled in,
|
| 322 |
+
and **all the other code** (packages, styles, other sections) **identical
|
| 323 |
+
to the original** template.tex.
|
| 324 |
+
- Wrap output in ```` ```latex ... ``` ```` fences.
|
| 325 |
+
- Do not change `\usepackage[capitalize]{cleveref}` to `cleverref` (there is
|
| 326 |
+
no `cleverref.sty`).
|
| 327 |
+
|
| 328 |
+
## Degraded mode (no web search)
|
| 329 |
+
|
| 330 |
+
If your host has no web search tool, switch to degraded mode:
|
| 331 |
+
|
| 332 |
+
1. If the user has placed a pre-built `workspace/inputs/refs.bib` in the
|
| 333 |
+
workspace, load it directly into `workspace/refs.bib` and skip Phase 1
|
| 334 |
+
and Phase 2.
|
| 335 |
+
2. Otherwise, emit `workspace/drafts/intro_relwork.tex` containing the
|
| 336 |
+
template with two TODO markers in the Intro and Related Work sections,
|
| 337 |
+
and tell the user the pipeline cannot complete Step 3 without web search.
|
| 338 |
+
|
| 339 |
+
## Resources
|
| 340 |
+
|
| 341 |
+
- `references/prompt.md` — verbatim Literature Review Agent prompt from App. F.1
|
| 342 |
+
- `references/discovery-pipeline.md` — Phase 1 + Phase 2 explained in detail
|
| 343 |
+
- `references/verification-rules.md` — Levenshtein cutoff, year alignment, dedup
|
| 344 |
+
- `references/citation-density-rule.md` — the ≥90% integration rule
|
| 345 |
+
- `references/s2-api-cookbook.md` — Semantic Scholar URLs, fields, rate limits
|
| 346 |
+
- `references/exa-search-cookbook.md` — optional Exa backend for Phase 1 (research-paper-focused web search)
|
| 347 |
+
- `scripts/pre_dedup_candidates.py` — **NEW** dedup Phase 1 candidates before Phase 2 (saves 30-40% S2 quota)
|
| 348 |
+
- `scripts/s2_cache.py` — **NEW** persistent S2 response cache (eliminates re-verification on re-runs)
|
| 349 |
+
- `scripts/validate_pool.py` — **NEW** validate & auto-fix citation_pool.json schema (authors format)
|
| 350 |
+
- `scripts/sync_keys.py` — **NEW** sync cite keys in .tex with canonical bibtex_keys after bibtex_format.py
|
| 351 |
+
- `scripts/levenshtein_match.py` — fuzzy title match (ratio > 70)
|
| 352 |
+
- `scripts/check_cutoff.py` — date comparison against the cutoff (month-only dates default to day 1)
|
| 353 |
+
- `scripts/dedupe_by_id.py` — dedup verified pool by S2 paperId
|
| 354 |
+
- `scripts/bibtex_format.py` — build refs.bib from JSON pool
|
| 355 |
+
- `scripts/citation_coverage.py` — ≥90% citation coverage gate
|
| 356 |
+
- `scripts/s2_search.py` — **NEW** Semantic Scholar title-search helper; reads `SEMANTIC_SCHOLAR_API_KEY` from env (optional — falls back to unauthenticated)
|
| 357 |
+
- `scripts/exa_search.py` — optional Exa Phase 1 backend (reads `EXA_API_KEY` from env)
|
.scider/skills/literature-review-agent/references/citation-density-rule.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Citation Density Rule
|
| 2 |
+
|
| 3 |
+
Source: arXiv:2604.05018, App. D.3.
|
| 4 |
+
|
| 5 |
+
## The 90% rule
|
| 6 |
+
|
| 7 |
+
> ...the system strictly constrains the model to cite only the provided
|
| 8 |
+
> verified papers, explicitly mandating that at least 90% of the gathered
|
| 9 |
+
> literature pool must be actively integrated and cited when synthesizing
|
| 10 |
+
> the Introduction and Related Work sections.
|
| 11 |
+
|
| 12 |
+
Why: this is the paper's core defense against citation inflation. The
|
| 13 |
+
literature review pool is built once via the rigorous discovery →
|
| 14 |
+
verification → dedup pipeline. The writing step must then *use* almost all
|
| 15 |
+
of it. This prevents the agent from gathering 50 papers and citing only the
|
| 16 |
+
3 most famous ones, which would defeat the entire literature search.
|
| 17 |
+
|
| 18 |
+
## Implementation
|
| 19 |
+
|
| 20 |
+
After the Lit Review writing call produces `intro_relwork.tex`:
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
python scripts/citation_coverage.py \
|
| 24 |
+
--tex workspace/drafts/intro_relwork.tex \
|
| 25 |
+
--pool workspace/citation_pool.json \
|
| 26 |
+
--threshold 0.90
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
The script:
|
| 30 |
+
|
| 31 |
+
1. Reads `citation_pool.json` and counts `papers[]` (= N).
|
| 32 |
+
2. Computes `min_required = floor(0.90 * N)`.
|
| 33 |
+
3. Greps `intro_relwork.tex` for all `\cite{KEY}`, `\citep{KEY}`, `\citet{KEY}`,
|
| 34 |
+
`\autocite{KEY}`, `\citeauthor{KEY}`, etc.
|
| 35 |
+
4. Counts the **unique** keys actually cited.
|
| 36 |
+
5. Reports `cited / N` and exits non-zero if `cited < min_required`.
|
| 37 |
+
|
| 38 |
+
## What to do on failure
|
| 39 |
+
|
| 40 |
+
The script prints the missing keys grouped by `discovered_for` cluster:
|
| 41 |
+
|
| 42 |
+
```
|
| 43 |
+
FAIL: 17/22 papers cited (77.3%, need ≥90%)
|
| 44 |
+
Uncited papers (5):
|
| 45 |
+
- vaswani2017attention [discovered_for: intro] (Attention Is All You Need)
|
| 46 |
+
- he2016deep [discovered_for: intro] (Deep Residual Learning ...)
|
| 47 |
+
- liu2024video [discovered_for: related_work[2.1]] (Long Video Generation ...)
|
| 48 |
+
- chen2024sparse [discovered_for: related_work[2.2]] (Sparse Attention Surveys ...)
|
| 49 |
+
- kim2024transformer [discovered_for: related_work[2.2]] (Transformer Scaling Laws ...)
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
The host agent should then re-call the Lit Review writing step with an
|
| 53 |
+
appended instruction:
|
| 54 |
+
|
| 55 |
+
```
|
| 56 |
+
The previous draft cited only 17 out of 22 verified papers (77.3%, threshold
|
| 57 |
+
is 90%). You MUST integrate the following 5 papers into the appropriate
|
| 58 |
+
sections:
|
| 59 |
+
- vaswani2017attention (intro): foundational attention reference
|
| 60 |
+
- he2016deep (intro): foundational ResNet reference
|
| 61 |
+
- liu2024video (related work 2.1): direct competing approach for long video
|
| 62 |
+
- chen2024sparse (related work 2.2): sparse attention survey, group with [...]
|
| 63 |
+
- kim2024transformer (related work 2.2): scaling-laws context
|
| 64 |
+
|
| 65 |
+
Do not remove any existing citations. Add new ones where contextually
|
| 66 |
+
appropriate. Re-emit the full template.tex with both sections updated.
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
After 2-3 re-prompts, if coverage still falls short, the pipeline should
|
| 70 |
+
emit a warning and proceed — the paper does not specify a hard halt on this,
|
| 71 |
+
only a strong constraint.
|
.scider/skills/literature-review-agent/references/discovery-pipeline.md
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Discovery Pipeline (Phase 1 + Phase 2)
|
| 2 |
+
|
| 3 |
+
Source: arXiv:2604.05018, App. D.3 ("Citation Verification") and App. B
|
| 4 |
+
(LLM-call distribution).
|
| 5 |
+
|
| 6 |
+
## Phase 1 — Parallel Candidate Discovery
|
| 7 |
+
|
| 8 |
+
The paper uses 10 concurrent workers to fan out search-grounded LLM calls
|
| 9 |
+
("Gemini-3-Flash with Google Search grounding"). For our host-agent
|
| 10 |
+
implementation, the equivalent is: spawn up to 10 concurrent search queries
|
| 11 |
+
using the host's native web search tool.
|
| 12 |
+
|
| 13 |
+
### Inputs
|
| 14 |
+
|
| 15 |
+
From `outline.json`:
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
introduction_strategy:
|
| 19 |
+
search_directions: [q1, q2, q3] # 3-5 queries
|
| 20 |
+
related_work_strategy:
|
| 21 |
+
subsections:
|
| 22 |
+
- methodology_cluster: "..."
|
| 23 |
+
sota_investigation_mission: "..." # 1 derived query
|
| 24 |
+
limitation_search_queries: [q4, q5] # 1-3 queries
|
| 25 |
+
- ...
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Total query budget: typically 10-20 queries per paper.
|
| 29 |
+
|
| 30 |
+
### Per-query procedure
|
| 31 |
+
|
| 32 |
+
For each search query, instruct your host's search tool:
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
search("<query>", num_results=10)
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Or, if you've enabled the optional Exa backend (see `exa-search-cookbook.md`):
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
python scripts/exa_search.py --query "<query>" --num-results 10
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Both paths produce the same normalized candidate format. Collect the top
|
| 45 |
+
10 results per query. Each result should yield:
|
| 46 |
+
|
| 47 |
+
- `title` — the paper's title from the search snippet
|
| 48 |
+
- `snippet` — the abstract preview from the search snippet
|
| 49 |
+
- `source_url` — the result URL (often the arXiv abstract page)
|
| 50 |
+
|
| 51 |
+
Tag each result with `discovered_for: ["intro"]` or
|
| 52 |
+
`discovered_for: ["related_work[2.1]"]` so you can later trace which cluster
|
| 53 |
+
each citation supports.
|
| 54 |
+
|
| 55 |
+
Combine all results across all queries into a single `raw_candidates.json`:
|
| 56 |
+
|
| 57 |
+
```json
|
| 58 |
+
{
|
| 59 |
+
"candidates": [
|
| 60 |
+
{
|
| 61 |
+
"title": "Attention Is All You Need",
|
| 62 |
+
"snippet": "The dominant sequence transduction models...",
|
| 63 |
+
"source_url": "https://arxiv.org/abs/1706.03762",
|
| 64 |
+
"discovered_for": ["intro"]
|
| 65 |
+
},
|
| 66 |
+
...
|
| 67 |
+
]
|
| 68 |
+
}
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Phase 2 — Sequential Verification via Semantic Scholar
|
| 72 |
+
|
| 73 |
+
The paper enforces strict sequential verification at ≤1 QPS via the public
|
| 74 |
+
Semantic Scholar API. We follow the same constraint.
|
| 75 |
+
|
| 76 |
+
### Per-candidate procedure
|
| 77 |
+
|
| 78 |
+
1. **Search S2 by title**. Use the host's URL fetch tool:
|
| 79 |
+
```
|
| 80 |
+
GET https://api.semanticscholar.org/graph/v1/paper/search
|
| 81 |
+
?query=<URL-encoded(title)>
|
| 82 |
+
&limit=5
|
| 83 |
+
&fields=title,abstract,year,authors,venue,externalIds
|
| 84 |
+
```
|
| 85 |
+
No API key required for the public endpoint. Be polite: 1 QPS.
|
| 86 |
+
|
| 87 |
+
2. **Take the top hit**. Compare `title` to the candidate `title` via the
|
| 88 |
+
helper:
|
| 89 |
+
```bash
|
| 90 |
+
python scripts/levenshtein_match.py --candidate "..." --found "..."
|
| 91 |
+
```
|
| 92 |
+
The helper prints an integer 0-100 (the Levenshtein ratio).
|
| 93 |
+
- **< 70 → discard the candidate.** Move on.
|
| 94 |
+
- **≥ 70 → continue to checks 3-5.**
|
| 95 |
+
|
| 96 |
+
3. **Check abstract presence**. If `abstract` is null or empty → discard.
|
| 97 |
+
The paper requires every cited entity to have a retrievable abstract for
|
| 98 |
+
downstream context enrichment in the Section Writing Agent.
|
| 99 |
+
|
| 100 |
+
4. **Check temporal cutoff**:
|
| 101 |
+
```bash
|
| 102 |
+
python scripts/check_cutoff.py \
|
| 103 |
+
--paper-year <year> \
|
| 104 |
+
--paper-month <month or omit> \
|
| 105 |
+
--cutoff <YYYY-MM-DD>
|
| 106 |
+
```
|
| 107 |
+
Exit 0 if the paper strictly predates the cutoff; exit 1 otherwise. Discard on exit 1.
|
| 108 |
+
|
| 109 |
+
5. **Year-alignment bonus**. If the search query that surfaced the candidate
|
| 110 |
+
mentioned a specific year and the S2 hit's year matches exactly, record
|
| 111 |
+
`match_score = ratio + 5`. (This is a soft bonus used for tie-breaking
|
| 112 |
+
when two candidates dedup to similar entries.)
|
| 113 |
+
|
| 114 |
+
6. **Append to verified pool** if all checks pass. Record:
|
| 115 |
+
```json
|
| 116 |
+
{
|
| 117 |
+
"paperId": "abc123...",
|
| 118 |
+
"title": "...",
|
| 119 |
+
"abstract": "...",
|
| 120 |
+
"year": 2017,
|
| 121 |
+
"venue": "NeurIPS",
|
| 122 |
+
"authors": [{"name": "A. Vaswani"}, ...],
|
| 123 |
+
"externalIds": {"DOI": "...", "ArXiv": "1706.03762"},
|
| 124 |
+
"match_score": 100,
|
| 125 |
+
"discovered_for": ["intro"]
|
| 126 |
+
}
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
### Rate-limit etiquette
|
| 130 |
+
|
| 131 |
+
The S2 public endpoint enforces ~1 QPS without an API key. If you receive
|
| 132 |
+
HTTP 429, sleep 5 seconds and retry. Do not parallelize Phase 2 — verification
|
| 133 |
+
must be strictly sequential.
|
| 134 |
+
|
| 135 |
+
For planning purposes: the paper measures ~20-30 LLM/API calls
|
| 136 |
+
total per Lit Review Agent invocation. With ~30 candidates that's roughly
|
| 137 |
+
30 seconds of verification wall-time. With 100 candidates it's ~100 seconds.
|
| 138 |
+
|
| 139 |
+
## Why two phases
|
| 140 |
+
|
| 141 |
+
The split exists because:
|
| 142 |
+
|
| 143 |
+
- **Discovery is high-throughput, low-stakes**. You want to cast a wide net
|
| 144 |
+
fast. Search APIs accept high concurrency.
|
| 145 |
+
- **Verification is low-throughput, high-stakes**. The S2 API protects
|
| 146 |
+
itself with QPS limits, and the verification step is what keeps the paper
|
| 147 |
+
honest. Faking a citation is trivially easy without it.
|
| 148 |
+
|
| 149 |
+
The paper's design "successfully combines the high-concurrency tolerance of
|
| 150 |
+
the LLM API with the strict throughput limits of the Semantic Scholar API to
|
| 151 |
+
prevent quota-induced latency" (App. B).
|
.scider/skills/literature-review-agent/references/exa-search-cookbook.md
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Exa Search Cookbook (optional Phase 1 backend)
|
| 2 |
+
|
| 3 |
+
[Exa](https://exa.ai) is a search engine optimized for finding academic
|
| 4 |
+
papers and other high-quality content. The `literature-review-agent` can
|
| 5 |
+
use Exa as an **OPTIONAL** backend for Phase 1 candidate discovery — useful
|
| 6 |
+
when your host coding agent has no native web search tool, or when you
|
| 7 |
+
want a research-paper-focused search backend with better signal-to-noise
|
| 8 |
+
than general web search.
|
| 9 |
+
|
| 10 |
+
> **Exa is opt-in.** The literature-review-agent's default Phase 1 path is
|
| 11 |
+
> "use your host agent's native web search tool" (`WebSearch` in Claude
|
| 12 |
+
> Code, `@web` in Cursor, the search tool in Antigravity, etc.). That
|
| 13 |
+
> requires zero configuration and no API key. Use Exa only if you want
|
| 14 |
+
> to.
|
| 15 |
+
|
| 16 |
+
## Why use it
|
| 17 |
+
|
| 18 |
+
Exa fills three gaps:
|
| 19 |
+
|
| 20 |
+
1. **Hosts with no built-in search.** Aider, OpenCode, and generic CLI
|
| 21 |
+
agents often lack a native web search tool. Exa gives them one.
|
| 22 |
+
2. **Research-paper-focused results.** Exa's `category: "research paper"`
|
| 23 |
+
filter returns higher signal-to-noise than general web search for
|
| 24 |
+
academic queries. The example response (e.g., for the query
|
| 25 |
+
"PaperOrchestra") returns arXiv pages, conference proceedings, and
|
| 26 |
+
academic tools rather than general SEO content.
|
| 27 |
+
3. **Batch / non-interactive runs.** When you want a deterministic,
|
| 28 |
+
scriptable backend rather than going through the host agent's tool
|
| 29 |
+
interface.
|
| 30 |
+
|
| 31 |
+
Exa returns up to 20 results per call (the helper clamps `--num-results` to 1–20), and
|
| 32 |
+
each result includes a `title`, `url`, optional `publishedDate`, and a
|
| 33 |
+
list of `highlights` (snippets) which the helper joins into a `snippet`
|
| 34 |
+
field consumable by the rest of the Phase 1 pipeline.
|
| 35 |
+
|
| 36 |
+
## Get a key
|
| 37 |
+
|
| 38 |
+
1. Sign up at <https://dashboard.exa.ai/>.
|
| 39 |
+
2. Copy your API key (format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`).
|
| 40 |
+
3. Set it in your environment:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
export EXA_API_KEY="paste-key-here"
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Or put it in a `.env` file (which is gitignored — the repo `.gitignore`
|
| 47 |
+
blocks `*.env` and `.env*` patterns) and source it:
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
set -a; source .env; set +a
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
**This repo never commits a key.** The helper reads `EXA_API_KEY` from the
|
| 54 |
+
environment at runtime. The key is your responsibility to provision and
|
| 55 |
+
secure.
|
| 56 |
+
|
| 57 |
+
## Run the helper
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
python skills/literature-review-agent/scripts/exa_search.py \
|
| 61 |
+
--query "Sparse attention long context transformers" \
|
| 62 |
+
--num-results 15 \
|
| 63 |
+
--discovered-for "related_work[2.1]"
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Output (default — normalized to the literature-review-agent candidate
|
| 67 |
+
format):
|
| 68 |
+
|
| 69 |
+
```json
|
| 70 |
+
{
|
| 71 |
+
"candidates": [
|
| 72 |
+
{
|
| 73 |
+
"title": "Longformer: The Long-Document Transformer",
|
| 74 |
+
"snippet": "We present the Longformer, a self-attention mechanism that scales linearly with sequence length...",
|
| 75 |
+
"source_url": "https://arxiv.org/abs/2004.05150",
|
| 76 |
+
"discovered_for": ["related_work[2.1]"],
|
| 77 |
+
"_exa_id": "https://arxiv.org/abs/2004.05150",
|
| 78 |
+
"_exa_published_date": "2020-04-10T00:00:00.000Z"
|
| 79 |
+
},
|
| 80 |
+
...
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
This JSON can be merged directly into `workspace/raw_candidates.json`
|
| 86 |
+
before the Phase 2 sequential verification step.
|
| 87 |
+
|
| 88 |
+
### Useful flags
|
| 89 |
+
|
| 90 |
+
| Flag | Default | Purpose |
|
| 91 |
+
|---|---|---|
|
| 92 |
+
| `--query` | (required) | Search query string |
|
| 93 |
+
| `--num-results` | `10` | 1–20; the helper clamps to this range |
|
| 94 |
+
| `--category` | `"research paper"` | Pass `""` to disable category filtering for broader results |
|
| 95 |
+
| `--highlight-chars` | `4000` | Max characters per highlight (Exa parameter) |
|
| 96 |
+
| `--discovered-for` | `"intro"` | Tag attached to each candidate; use `"related_work[2.1]"` for cluster queries |
|
| 97 |
+
| `--raw` | off | Print the full Exa response JSON instead of normalized candidates |
|
| 98 |
+
|
| 99 |
+
## Direct curl recipe
|
| 100 |
+
|
| 101 |
+
If you'd rather not use the Python helper (for one-off testing, or to
|
| 102 |
+
invoke from a host agent's `Bash` / `WebFetch` tool directly):
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
curl -X POST https://api.exa.ai/search \
|
| 106 |
+
--header "content-type: application/json" \
|
| 107 |
+
--header "x-api-key: $EXA_API_KEY" \
|
| 108 |
+
--data '{
|
| 109 |
+
"query": "PaperOrchestra automated paper writing",
|
| 110 |
+
"category": "research paper",
|
| 111 |
+
"numResults": 10,
|
| 112 |
+
"type": "auto",
|
| 113 |
+
"contents": {
|
| 114 |
+
"highlights": {
|
| 115 |
+
"maxCharacters": 4000
|
| 116 |
+
}
|
| 117 |
+
}
|
| 118 |
+
}'
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
The `$EXA_API_KEY` reference assumes the key is in your shell env. **Do
|
| 122 |
+
not** paste the literal key into the curl command in shell history or
|
| 123 |
+
chat — use the env var.
|
| 124 |
+
|
| 125 |
+
## Response shape
|
| 126 |
+
|
| 127 |
+
```json
|
| 128 |
+
{
|
| 129 |
+
"requestId": "52fcb70256224863b33f356fdae37c7f",
|
| 130 |
+
"resolvedSearchType": "neural",
|
| 131 |
+
"results": [
|
| 132 |
+
{
|
| 133 |
+
"id": "https://arxiv.org/abs/2604.05018",
|
| 134 |
+
"title": "PaperOrchestra: A Multi-Agent Framework for ...",
|
| 135 |
+
"url": "https://arxiv.org/abs/2604.05018",
|
| 136 |
+
"publishedDate": "2026-04-06T00:00:00.000Z",
|
| 137 |
+
"highlights": ["...", "..."],
|
| 138 |
+
"highlightScores": [0.4, 0.3],
|
| 139 |
+
"image": "https://...",
|
| 140 |
+
"favicon": "https://..."
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"searchTime": 975.2,
|
| 144 |
+
"costDollars": {
|
| 145 |
+
"total": 0.007,
|
| 146 |
+
"search": {"neural": 0.007}
|
| 147 |
+
}
|
| 148 |
+
}
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
## Mapping Exa → literature-review-agent candidate format
|
| 152 |
+
|
| 153 |
+
Phase 2 verification (Semantic Scholar fuzzy match → cutoff check → dedup)
|
| 154 |
+
expects candidates in this shape:
|
| 155 |
+
|
| 156 |
+
```json
|
| 157 |
+
{
|
| 158 |
+
"title": "...",
|
| 159 |
+
"snippet": "...",
|
| 160 |
+
"source_url": "...",
|
| 161 |
+
"discovered_for": ["intro"]
|
| 162 |
+
}
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
`exa_search.py --normalize` (the default mode) does this mapping:
|
| 166 |
+
|
| 167 |
+
| Exa field | Candidate field |
|
| 168 |
+
|---|---|
|
| 169 |
+
| `result.title` | `title` |
|
| 170 |
+
| `result.url` (fallback `result.id`) | `source_url` |
|
| 171 |
+
| `result.highlights` joined and capped at 1500 chars | `snippet` |
|
| 172 |
+
| `--discovered-for` flag | `discovered_for` |
|
| 173 |
+
| `result.id` | `_exa_id` (preserved for debugging) |
|
| 174 |
+
| `result.publishedDate` | `_exa_published_date` (preserved for tie-breaking) |
|
| 175 |
+
|
| 176 |
+
Phase 2 verification still goes through Semantic Scholar regardless of
|
| 177 |
+
whether the candidate came from Exa or from the host's native search.
|
| 178 |
+
Exa is ONLY a discovery backend; the verification chain
|
| 179 |
+
(`levenshtein_match.py` → `check_cutoff.py` → `dedupe_by_id.py` →
|
| 180 |
+
`bibtex_format.py` → `citation_coverage.py`) is unchanged.
|
| 181 |
+
|
| 182 |
+
## Query patterns
|
| 183 |
+
|
| 184 |
+
Match the literature-review-agent's outline-driven query design. Run one
|
| 185 |
+
Exa call per query, then merge all candidate lists:
|
| 186 |
+
|
| 187 |
+
| Query type | Source in `outline.json` | Example query | `--discovered-for` |
|
| 188 |
+
|---|---|---|---|
|
| 189 |
+
| Macro context | `introduction_strategy.search_directions[i]` | `"Survey of long-context attention mechanisms 2020-2024"` | `"intro"` |
|
| 190 |
+
| Foundational | same | `"Foundational papers transformer self-attention scaling laws"` | `"intro"` |
|
| 191 |
+
| SOTA scan | `related_work_strategy.subsections[i].sota_investigation_mission` | `"Recent SOTA sparse attention transformers 2024"` | `"related_work[2.1]"` |
|
| 192 |
+
| Limitation hunt | `related_work_strategy.subsections[i].limitation_search_queries[j]` | `"Block-sparse attention failure modes long sequences"` | `"related_work[2.1]"` |
|
| 193 |
+
|
| 194 |
+
For the related-work cluster queries, the `--discovered-for` tag matters
|
| 195 |
+
— the downstream `citation_coverage.py` gate uses it to attribute each
|
| 196 |
+
citation to the right cluster when reporting which papers were not yet
|
| 197 |
+
integrated.
|
| 198 |
+
|
| 199 |
+
## Cost and rate limits
|
| 200 |
+
|
| 201 |
+
Exa pricing is per-query (~$0.007 per neural search at the time of
|
| 202 |
+
writing). For a typical paper with ~15-20 search queries (3-5 intro
|
| 203 |
+
queries + 10-15 related-work queries), one full Lit Review Agent run
|
| 204 |
+
costs ~$0.10-$0.15. Check <https://exa.ai/pricing> for current rates.
|
| 205 |
+
|
| 206 |
+
Exa's rate limits are generous; the paper's 10-worker parallel discovery
|
| 207 |
+
pattern is well within them. The pipeline's wall-time floor is still set
|
| 208 |
+
by Semantic Scholar's 1 QPS verification limit, not by Exa.
|
| 209 |
+
|
| 210 |
+
## Security
|
| 211 |
+
|
| 212 |
+
- **NEVER commit `EXA_API_KEY` to git.** The repo's `.gitignore` blocks
|
| 213 |
+
`.env`, `*.env`, and `secrets.json` patterns. Keep your key in your
|
| 214 |
+
shell environment or your secrets manager (the 1Password CLI `op`, Doppler,
|
| 215 |
+
etc.).
|
| 216 |
+
- The helper reads the key from the environment only. It does NOT accept
|
| 217 |
+
the key as a command-line argument (which would expose it in shell
|
| 218 |
+
history).
|
| 219 |
+
- Exa logs requests for billing and quality. Assume your queries are
|
| 220 |
+
visible to Exa itself. Don't include sensitive draft text in
|
| 221 |
+
queries.
|
| 222 |
+
|
| 223 |
+
## Troubleshooting
|
| 224 |
+
|
| 225 |
+
| Symptom | Likely cause | Fix |
|
| 226 |
+
|---|---|---|
|
| 227 |
+
| `ERROR: EXA_API_KEY environment variable not set` | env var missing | `export EXA_API_KEY="..."` |
|
| 228 |
+
| `ERROR: Exa HTTP 401` | invalid or expired key | check the dashboard for the current key |
|
| 229 |
+
| `ERROR: Exa HTTP 429` | rate-limited | back off, lower concurrency |
|
| 230 |
+
| `WARN: Exa returned 0 results` | query too narrow or odd category | broaden the query or try `--category ""` |
|
| 231 |
+
| `Exa network error` | no internet, DNS issue | check your connection; the helper uses urllib stdlib only, no proxy support |
|
| 232 |
+
|
| 233 |
+
## When to prefer Exa vs the host's native search
|
| 234 |
+
|
| 235 |
+
| Use case | Recommended backend |
|
| 236 |
+
|---|---|
|
| 237 |
+
| Claude Code, Cursor, Antigravity (have native web search) | host's native search (free, integrated) |
|
| 238 |
+
| Aider, OpenCode, generic CLI agents | Exa (gives them search) |
|
| 239 |
+
| Batch reproducible runs | Exa (deterministic backend) |
|
| 240 |
+
| Research-paper-heavy queries | Exa (better academic signal) |
|
| 241 |
+
| One-off interactive runs | host's native search (less friction) |
|
| 242 |
+
|
| 243 |
+
You can also mix: use the host's web search for the broad intro queries
|
| 244 |
+
and Exa for the narrow limitation-search queries where the
|
| 245 |
+
research-paper-category filter helps the most.
|
.scider/skills/literature-review-agent/references/prompt.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Literature Review Agent — verbatim prompt
|
| 2 |
+
|
| 3 |
+
**Source: arXiv:2604.05018, Appendix F.1, page 46 (verbatim).**
|
| 4 |
+
|
| 5 |
+
This is the exact prompt used by the Literature Review Agent in the paper.
|
| 6 |
+
Use it as your system message when drafting Introduction and Related Work.
|
| 7 |
+
Substitute the placeholders before sending. The Anti-Leakage Prompt
|
| 8 |
+
(`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be prepended.
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
Role: Senior AI Researcher.
|
| 14 |
+
|
| 15 |
+
Task: Write the introduction and related work section of a paper.
|
| 16 |
+
|
| 17 |
+
You will be given a template.tex, this is the initial skeleton we outlined for
|
| 18 |
+
you. Your job is to fill in two sections: Introduction and Related Work.
|
| 19 |
+
Leave all the other sections untouched.
|
| 20 |
+
|
| 21 |
+
Inputs:
|
| 22 |
+
- intro_related_work_plan: This is your PRIMARY guide for structure and
|
| 23 |
+
arguments.
|
| 24 |
+
- project_idea and project_experimental_log: Use them to ensure the Intro
|
| 25 |
+
accurately frames the technical contribution and results.
|
| 26 |
+
- citation_checklist: This includes the citation keys that you should use
|
| 27 |
+
when citing relevant papers.
|
| 28 |
+
- collected_papers: These are all the relevant papers we collect for you for
|
| 29 |
+
citation purpose.
|
| 30 |
+
|
| 31 |
+
YOU MUST ONLY CITE THE GIVEN collected_papers, DO NOT cite new papers other
|
| 32 |
+
than the given papers.
|
| 33 |
+
|
| 34 |
+
Citation Requirements:
|
| 35 |
+
- You have access to the abstract of {paper_count} collected papers.
|
| 36 |
+
- You MUST cite at least {min_cite_paper_count} of them across the
|
| 37 |
+
introduction and related work sections.
|
| 38 |
+
- Introduction: Cite key statistics, foundational models (CLIP, etc.), and
|
| 39 |
+
broad problem statements.
|
| 40 |
+
- Related Work: Do deep comparative citations. Group distinct works (e.g.,
|
| 41 |
+
"Several methods [A, B, C]...").
|
| 42 |
+
- Ensure every \cite{{key}} corresponds exactly to a key in
|
| 43 |
+
citation_checklist.
|
| 44 |
+
- CRITICAL TIMELINE RULE: Do not treat any papers published after
|
| 45 |
+
{cutoff_date} as prior baselines to beat. Treat them strictly as
|
| 46 |
+
concurrent work.
|
| 47 |
+
- CRITICAL EVALUATION RULE: Do not claim our method beats or achieves
|
| 48 |
+
State-of-the-Art over a specific cited paper UNLESS that paper is
|
| 49 |
+
explicitly evaluated against in project_experimental_log. Frame other
|
| 50 |
+
recent papers strictly as concurrent, orthogonal, or conceptual work.
|
| 51 |
+
- You need to return the full code for the new template.tex, where the two
|
| 52 |
+
empty sections (Introduction and Related Work) are now filled in, while
|
| 53 |
+
all the other code (packages, styles, and other sections) are identical
|
| 54 |
+
to the original template.tex.
|
| 55 |
+
|
| 56 |
+
Important Note:
|
| 57 |
+
DO NOT change \usepackage[capitalize]{{cleveref}} into
|
| 58 |
+
\usepackage[capitalize]{{cleverref}}, as there's no cleverref.sty.
|
| 59 |
+
|
| 60 |
+
Output Format:
|
| 61 |
+
You must return the code for the updated template.tex. Make sure to wrap the
|
| 62 |
+
code with ```latex content ```.
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## Placeholder substitution table
|
| 68 |
+
|
| 69 |
+
| Placeholder | Source |
|
| 70 |
+
|---|---|
|
| 71 |
+
| `{paper_count}` | `len(citation_pool.papers)` from `workspace/citation_pool.json` |
|
| 72 |
+
| `{min_cite_paper_count}` | `floor(0.9 * paper_count)` — the ≥90% rule |
|
| 73 |
+
| `{cutoff_date}` | Derived from `conference_guidelines.md` — see App. D.1 of the paper |
|
| 74 |
+
|
| 75 |
+
The other placeholders (`intro_related_work_plan`, `project_idea`,
|
| 76 |
+
`project_experimental_log`, `citation_checklist`, `collected_papers`) are
|
| 77 |
+
substituted by passing their full file/JSON contents into the user message.
|
.scider/skills/literature-review-agent/references/s2-api-cookbook.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Semantic Scholar API Cookbook
|
| 2 |
+
|
| 3 |
+
How to verify a candidate paper via the Semantic Scholar Graph API.
|
| 4 |
+
|
| 5 |
+
Base: `https://api.semanticscholar.org/graph/v1`
|
| 6 |
+
|
| 7 |
+
Reference: <https://api.semanticscholar.org/api-docs/graph>
|
| 8 |
+
|
| 9 |
+
## API key (optional)
|
| 10 |
+
|
| 11 |
+
The pipeline uses the **public, unauthenticated endpoint** by default — no key
|
| 12 |
+
required. If you have a Semantic Scholar API key you can pass it via the
|
| 13 |
+
`x-api-key` header to get higher rate limits (useful for large batches).
|
| 14 |
+
|
| 15 |
+
Get a free key at <https://api.semanticscholar.org/> then export it once:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
The bundled `scripts/s2_search.py` helper picks this up automatically. If the
|
| 22 |
+
variable is not set the script falls back to the unauthenticated endpoint — the
|
| 23 |
+
pipeline works fine either way; just keep to ≤1 QPS on live requests.
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
# check whether the key is configured
|
| 27 |
+
python skills/literature-review-agent/scripts/s2_search.py --check-key
|
| 28 |
+
|
| 29 |
+
# search by title (key used automatically if set)
|
| 30 |
+
python skills/literature-review-agent/scripts/s2_search.py \
|
| 31 |
+
--query "Attention is All You Need" --limit 5
|
| 32 |
+
|
| 33 |
+
# print the raw S2 JSON
|
| 34 |
+
python skills/literature-review-agent/scripts/s2_search.py \
|
| 35 |
+
--query "BERT pre-training" --raw
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
The repo never commits a key. Key management is your responsibility (shell
|
| 39 |
+
environment, 1Password, doppler, etc.).
|
| 40 |
+
|
| 41 |
+
## Endpoint 1 — Search by title
|
| 42 |
+
|
| 43 |
+
```
|
| 44 |
+
GET /paper/search
|
| 45 |
+
?query=<URL-encoded title>
|
| 46 |
+
&limit=5
|
| 47 |
+
&fields=title,abstract,year,authors,venue,externalIds
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Example:
|
| 51 |
+
|
| 52 |
+
```
|
| 53 |
+
GET https://api.semanticscholar.org/graph/v1/paper/search?query=Attention%20Is%20All%20You%20Need&limit=5&fields=title,abstract,year,authors,venue,externalIds
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
Response (truncated):
|
| 57 |
+
|
| 58 |
+
```json
|
| 59 |
+
{
|
| 60 |
+
"total": 12345,
|
| 61 |
+
"data": [
|
| 62 |
+
{
|
| 63 |
+
"paperId": "204e3073870fae3d05bcbc2f6a8e263d9b72e776",
|
| 64 |
+
"title": "Attention is All you Need",
|
| 65 |
+
"abstract": "The dominant sequence transduction models are based on...",
|
| 66 |
+
"year": 2017,
|
| 67 |
+
"venue": "NeurIPS",
|
| 68 |
+
"authors": [{"name": "Ashish Vaswani"}, ...],
|
| 69 |
+
"externalIds": {
|
| 70 |
+
"DBLP": "conf/nips/VaswaniSPUJGKP17",
|
| 71 |
+
"ArXiv": "1706.03762",
|
| 72 |
+
"DOI": "10.5555/3295222.3295349"
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
...
|
| 76 |
+
]
|
| 77 |
+
}
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Endpoint 2 — Get a specific paper by ID
|
| 81 |
+
|
| 82 |
+
```
|
| 83 |
+
GET /paper/<paperId>?fields=title,abstract,year,authors,venue,externalIds,citationCount
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Useful identifiers
|
| 87 |
+
|
| 88 |
+
You can pass these as `<paperId>`:
|
| 89 |
+
|
| 90 |
+
- S2 internal: `204e3073870fae3d05bcbc2f6a8e263d9b72e776`
|
| 91 |
+
- DOI: `DOI:10.18653/v1/N18-3011`
|
| 92 |
+
- ArXiv: `ARXIV:1706.03762`
|
| 93 |
+
- Corpus ID: `CorpusId:13756489`
|
| 94 |
+
- URL: `URL:https://arxiv.org/abs/1706.03762`
|
| 95 |
+
|
| 96 |
+
## Rate limits
|
| 97 |
+
|
| 98 |
+
- Unauthenticated: ~1 QPS sustained. Bursts will get 429.
|
| 99 |
+
- Per the paper, "the strict throughput limits of the Semantic Scholar API
|
| 100 |
+
(1 query per second)" — App. B.
|
| 101 |
+
|
| 102 |
+
If you get HTTP 429, sleep 5 seconds before retrying. Don't loop tightly.
|
| 103 |
+
|
| 104 |
+
## Fields cheat sheet
|
| 105 |
+
|
| 106 |
+
| Field | Type | Required by our pipeline? |
|
| 107 |
+
|---|---|---|
|
| 108 |
+
| `paperId` | string | yes (dedup key) |
|
| 109 |
+
| `title` | string | yes (Levenshtein match) |
|
| 110 |
+
| `abstract` | string | yes (rule 2: must exist) |
|
| 111 |
+
| `year` | int | yes (cutoff check) |
|
| 112 |
+
| `authors[].name` | string | yes (BibTeX author field) |
|
| 113 |
+
| `venue` | string | recommended (BibTeX journal/booktitle) |
|
| 114 |
+
| `externalIds.DOI` | string | recommended (dedup fallback, BibTeX doi) |
|
| 115 |
+
| `externalIds.ArXiv` | string | recommended (dedup fallback) |
|
| 116 |
+
| `publicationDate` | string `YYYY-MM-DD` | optional (more precise cutoff check) |
|
| 117 |
+
| `citationCount` | int | optional (could inform tie-breaking) |
|
| 118 |
+
|
| 119 |
+
Always pass `fields=...` explicitly — the default response is minimal and
|
| 120 |
+
will not include the abstract.
|
| 121 |
+
|
| 122 |
+
## Error handling
|
| 123 |
+
|
| 124 |
+
| Status | Meaning | What to do |
|
| 125 |
+
|---|---|---|
|
| 126 |
+
| 200 | OK | proceed |
|
| 127 |
+
| 400 | bad query syntax | URL-encode the title properly; retry once |
|
| 128 |
+
| 404 | not found | discard the candidate |
|
| 129 |
+
| 429 | rate limited | sleep 5s, retry |
|
| 130 |
+
| 500-503 | S2 down | sleep 30s, retry up to 3 times, then give up |
|
| 131 |
+
|
| 132 |
+
## Polite use
|
| 133 |
+
|
| 134 |
+
The S2 API is a public service. Do not hammer it. If you have many candidates:
|
| 135 |
+
|
| 136 |
+
- Throttle to 1 QPS.
|
| 137 |
+
- Cache hits (the dedup script already serves as a deduplication cache).
|
| 138 |
+
- Do not parallelize. Verification is sequential by design.
|
.scider/skills/literature-review-agent/references/verification-rules.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Verification Rules
|
| 2 |
+
|
| 3 |
+
Source: arXiv:2604.05018, App. D.3 ("Citation Verification"), verbatim
|
| 4 |
+
specifications below.
|
| 5 |
+
|
| 6 |
+
## Rule 1 — Fuzzy title match (Levenshtein > 70)
|
| 7 |
+
|
| 8 |
+
> Each candidate must resolve to a valid Semantic Scholar entity via a fuzzy
|
| 9 |
+
> title match (Levenshtein distance ratio > 70 [Levenshtein, 1965]),
|
| 10 |
+
> augmented by a point bonus for exact year alignment.
|
| 11 |
+
|
| 12 |
+
Implementation: `scripts/levenshtein_match.py` uses
|
| 13 |
+
`Levenshtein.ratio(a, b) * 100` from the `python-Levenshtein` package and
|
| 14 |
+
returns the integer ratio. Threshold: **strictly greater than 70**.
|
| 15 |
+
|
| 16 |
+
Examples:
|
| 17 |
+
|
| 18 |
+
| Candidate title | S2 title | Ratio | Verdict |
|
| 19 |
+
|---|---|---|---|
|
| 20 |
+
| "Attention Is All You Need" | "Attention Is All You Need" | 100 | accept |
|
| 21 |
+
| "Attention Is All You Need" | "Attention is All You Need." | 96 | accept |
|
| 22 |
+
| "Sparse Attention for Transformers" | "Sparse Attention in Transformers" | 88 | accept |
|
| 23 |
+
| "Self-Attention" | "Attention Is All You Need" | 47 | reject |
|
| 24 |
+
| "Linformer" | "Linformer: Self-Attention with Linear Complexity" | 28 | reject |
|
| 25 |
+
|
| 26 |
+
The Linformer case is the canonical false-negative: a short query against
|
| 27 |
+
a long title. Workaround: when the candidate title looks abbreviated
|
| 28 |
+
(< 4 words) and the S2 hit's title contains the candidate as a substring,
|
| 29 |
+
override the ratio check. The paper does not specify this workaround
|
| 30 |
+
explicitly; we add it as a soft safety net to avoid losing legitimate
|
| 31 |
+
short-title hits. See `levenshtein_match.py --substring-bypass`.
|
| 32 |
+
|
| 33 |
+
## Rule 2 — Abstract must exist
|
| 34 |
+
|
| 35 |
+
> To enter the final context pool, the entity must possess a retrievable
|
| 36 |
+
> abstract...
|
| 37 |
+
|
| 38 |
+
Discard any verified hit where `abstract` is null, empty, or `"N/A"`. The
|
| 39 |
+
Section Writing Agent uses the abstract to ground its citations contextually
|
| 40 |
+
(per the Section Writing Agent prompt: "Read the abstract provided in
|
| 41 |
+
citation_map.json for the papers you are citing. Use this context to write
|
| 42 |
+
accurate, specific sentences about those works.").
|
| 43 |
+
|
| 44 |
+
## Rule 3 — Strict temporal cutoff
|
| 45 |
+
|
| 46 |
+
> ...and strictly predate the research cutoff (when specified down to the
|
| 47 |
+
> month, the system defaults to the first day of that month).
|
| 48 |
+
|
| 49 |
+
Implementation: `scripts/check_cutoff.py`. Comparison rules:
|
| 50 |
+
|
| 51 |
+
- Cutoff is given as `YYYY-MM-DD`. The paper aligns it to venue submission
|
| 52 |
+
deadline (Nov 2024 for CVPR 2025, Oct 2024 for ICLR 2025 — App. D.1).
|
| 53 |
+
- Paper year is required. Paper month is optional.
|
| 54 |
+
- If paper has only year: assume month=12, day=31 (worst case for the paper —
|
| 55 |
+
must still be < cutoff).
|
| 56 |
+
- If paper has year + month: assume day=1 of that month.
|
| 57 |
+
- "Strictly predate" means `paper_date < cutoff_date`. Equality fails.
|
| 58 |
+
|
| 59 |
+
Examples (cutoff = 2024-10-01):
|
| 60 |
+
|
| 61 |
+
| Paper year | Paper month | Verdict |
|
| 62 |
+
|---|---|---|
|
| 63 |
+
| 2017 | — | accept |
|
| 64 |
+
| 2024 | 9 | accept (2024-09-01 < 2024-10-01) |
|
| 65 |
+
| 2024 | 10 | reject (2024-10-01 not strictly < 2024-10-01) |
|
| 66 |
+
| 2024 | — (only year) | reject (2024-12-31 ≥ 2024-10-01) |
|
| 67 |
+
|
| 68 |
+
The strict comparison is intentional: it prevents leakage of papers from
|
| 69 |
+
the same submission cycle as the target venue.
|
| 70 |
+
|
| 71 |
+
## Rule 4 — Dedup by Semantic Scholar paperId
|
| 72 |
+
|
| 73 |
+
> Finally, gathered citations are deduplicated using unique paper ID keys.
|
| 74 |
+
|
| 75 |
+
Implementation: `scripts/dedupe_by_id.py`. Key precedence:
|
| 76 |
+
|
| 77 |
+
1. `paperId` (S2's internal unique ID, always present on a verified hit)
|
| 78 |
+
2. `externalIds.DOI` (lowercased)
|
| 79 |
+
3. `externalIds.ArXiv` (without version suffix)
|
| 80 |
+
4. Normalized title (lowercased, alphanumeric only) — fallback only
|
| 81 |
+
|
| 82 |
+
When two candidates collide, keep the one with the higher `match_score`.
|
| 83 |
+
|
| 84 |
+
## Rule 5 — ≥90% citation integration
|
| 85 |
+
|
| 86 |
+
> The system constrains the model to cite only the provided verified papers,
|
| 87 |
+
> explicitly mandating that at least 90% of the gathered literature pool must
|
| 88 |
+
> be actively integrated and cited when synthesizing the Introduction and
|
| 89 |
+
> Related Work sections.
|
| 90 |
+
|
| 91 |
+
Implementation: `scripts/citation_coverage.py`. After the Lit Review writing
|
| 92 |
+
call produces `intro_relwork.tex`, this script:
|
| 93 |
+
|
| 94 |
+
1. Extracts every `\cite{KEY}` and `\citep{KEY}` (and variants) from the
|
| 95 |
+
`.tex` file.
|
| 96 |
+
2. Counts unique cited keys against `len(citation_pool.papers)`.
|
| 97 |
+
3. Requires `cited ≥ floor(0.90 × total)` — the same bound as `min_cite_paper_count`, so small pools round down (e.g. 4 of 5 passes). Exits non-zero if not.
|
| 98 |
+
|
| 99 |
+
If the gate fails, the host agent must re-prompt the writing step,
|
| 100 |
+
explicitly listing the un-cited keys and asking the agent to integrate them.
|
.scider/skills/literature-review-agent/scripts/bibtex_format.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
bibtex_format.py — Generate refs.bib from a verified citation pool.
|
| 4 |
+
|
| 5 |
+
Reads citation_pool.json (output of dedupe_by_id.py) and emits a BibTeX file
|
| 6 |
+
with deterministic citation keys derived from the first author + year +
|
| 7 |
+
first significant title word.
|
| 8 |
+
|
| 9 |
+
Never invents fields. Only writes fields that are actually present in the
|
| 10 |
+
S2 metadata. Writes one of:
|
| 11 |
+
@article{ ... } — when venue looks like a journal
|
| 12 |
+
@inproceedings{ ... }— when venue looks like a conference
|
| 13 |
+
@misc{ ... } — fallback (e.g., arXiv-only papers)
|
| 14 |
+
|
| 15 |
+
Usage:
|
| 16 |
+
python bibtex_format.py --pool citation_pool.json --out refs.bib
|
| 17 |
+
"""
|
| 18 |
+
import argparse
|
| 19 |
+
import json
|
| 20 |
+
import re
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
# Lower-case venue abbreviations; is_conference() treats a venue string that
# contains any of them as a conference (BibTeX @inproceedings).
CONFERENCE_HINTS = set(
    "neurips nips icml iclr cvpr iccv eccv aaai ijcai acl emnlp naacl "
    "kdd www sigir uai siggraph interspeech icassp miccai wacv bmvc "
    "coling conll".split()
)
|
| 49 |
+
# Lower-case words that first_significant_word() skips when choosing the
# title word that goes into a citation key.
STOPWORDS = set(
    "a an and the of for to with on in by from as is are be via into "
    "their our we this that using use about at or if".split()
)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def normalize(s: str) -> str:
    """Reduce *s* to lower-case ASCII letters for use in a citation key.

    Accented characters are decomposed (NFKD) first so that their base
    letter survives ("Müller" -> "muller") instead of being dropped; every
    remaining character outside ``a-z`` is then removed.
    """
    import unicodedata  # local import keeps this block self-contained

    folded = unicodedata.normalize("NFKD", s).lower()
    return re.sub(r"[^a-z]", "", folded)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def first_significant_word(title: str) -> str:
    """Pick the title word that becomes the last part of a citation key.

    Words are runs of ASCII letters (hyphens allowed after the first
    letter), scanned left to right. The first one that is not a stopword
    and is longer than two characters is returned through ``normalize``.
    Returns ``"paper"`` when no word qualifies.
    """
    lowered = (token.lower() for token in re.findall(r"[A-Za-z][A-Za-z\-]*", title))
    chosen = next(
        (word for word in lowered if word not in STOPWORDS and len(word) > 2),
        None,
    )
    if chosen is None:
        return "paper"
    return normalize(chosen)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def first_author_lastname(authors: list[dict]) -> str:
    """Return the normalized surname of the first author, or ``"anon"``.

    Handles both "First Last" and "Last, First" spellings: when the name
    contains a comma, the surname is taken from the text before the first
    comma; otherwise it is the last whitespace-separated token.

    ``"anon"`` is returned when the list is empty, the first entry has no
    usable ``name`` (missing, null, blank, punctuation only), or the surname
    normalizes to an empty string.
    """
    if not authors:
        return "anon"
    first = authors[0]
    raw = first.get("name") if isinstance(first, dict) else None
    name = (raw or "").strip()  # `or ""` covers an explicit null name
    # Prefer the part before a comma; fall back to all tokens when that
    # part is empty (e.g. ", Smith").
    tokens = name.split(",", 1)[0].split() or name.replace(",", " ").split()
    if not tokens:
        return "anon"
    return normalize(tokens[-1]) or "anon"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def make_key(paper: dict) -> str:
    """Build the base citation key ``<lastname><year><titleword>`` for *paper*.

    Missing pieces fall back to ``"anon"`` (no author), ``"0000"`` (no year)
    and ``"paper"`` (no usable title word). Collisions between papers are
    not resolved here.
    """
    last = first_author_lastname(paper.get("authors") or [])
    year = paper.get("year") or "0000"
    # `or ""` rather than a .get() default: the key may be present with a
    # null value, which would otherwise reach re.findall and raise TypeError.
    word = first_significant_word(paper.get("title") or "")
    return f"{last}{year}{word}"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def is_conference(venue: str) -> bool:
    """Heuristically decide whether *venue* names a conference.

    Returns True when the lower-cased venue contains a generic conference
    word ("conference", "symposium", "workshop") or any abbreviation from
    ``CONFERENCE_HINTS``. The generic words catch spelled-out venue names
    such as "International Conference on Machine Learning", which contain
    none of the abbreviations. Empty or missing venues return False.
    """
    if not venue:
        return False
    v = venue.lower()
    if any(word in v for word in ("conference", "symposium", "workshop")):
        return True
    return any(h in v for h in CONFERENCE_HINTS)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def escape_bibtex(s: str) -> str:
    """Backslash-escape characters that are unsafe inside a BibTeX field.

    Escapes ``{``, ``}``, ``&``, ``%`` and ``#`` in a single pass. ``%`` and
    ``#`` are included because, left raw, they break the typeset
    bibliography (``%`` starts a LaTeX comment, ``#`` is a macro-parameter
    character). Returns ``""`` for empty or ``None`` input.
    """
    if not s:
        return ""
    specials = {"{": "\\{", "}": "\\}", "&": "\\&", "%": "\\%", "#": "\\#"}
    return s.translate(str.maketrans(specials))
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def author_field(authors: list[dict]) -> str:
    """Build the BibTeX ``author`` value from a list of author dicts.

    Each non-empty ``name`` is stripped, escaped with ``escape_bibtex`` and
    the results are joined with ``" and "``. Entries whose ``name`` is
    missing or empty are skipped.
    """
    escaped = []
    for entry in authors:
        if not entry.get("name"):
            continue
        stripped = entry.get("name", "").strip()
        escaped.append(escape_bibtex(stripped))
    return " and ".join(escaped)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def format_entry(paper: dict, key: str) -> str:
    """Render one paper as a BibTeX entry string under citation key *key*.

    Entry type: ``@inproceedings`` (venue stored as ``booktitle``) when
    ``is_conference`` accepts the venue, ``@article`` (venue stored as
    ``journal``) for any other non-empty venue, ``@misc`` with no venue
    field otherwise. Only fields present in *paper* are written: title,
    author, year, venue, doi, and eprint/archivePrefix for arXiv IDs.
    """
    venue = paper.get("venue") or ""
    if is_conference(venue):
        kind, venue_key = "inproceedings", "booktitle"
    elif venue:
        kind, venue_key = "article", "journal"
    else:
        kind, venue_key = "misc", None

    out = [f"@{kind}{{{key},"]

    title = paper.get("title")
    if title:
        out.append(f" title = {{{escape_bibtex(title)}}},")

    authors = paper.get("authors")
    if authors:
        out.append(f" author = {{{author_field(authors)}}},")

    year = paper.get("year")
    if year:
        out.append(f" year = {{{year}}},")

    # venue_key is only set when venue is non-empty
    if venue_key:
        out.append(f" {venue_key:8s} = {{{escape_bibtex(venue)}}},")

    external = paper.get("externalIds") or {}
    doi = external.get("DOI")
    if doi:
        out.append(f" doi = {{{doi}}},")
    arxiv = external.get("ArXiv")
    if arxiv:
        out.append(f" eprint = {{{arxiv}}},")
        out.append(" archivePrefix = {arXiv},")

    # Drop the trailing comma from whichever line ended up last
    out[-1] = out[-1].rstrip(",")
    out.append("}")
    return "\n".join(out)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def main(argv=None) -> int:
    """Generate a BibTeX file from a citation pool and record the keys.

    Reads the pool JSON given by ``--pool``, assigns every paper a unique
    citation key, writes all entries to ``--out``, then rewrites the pool
    file with a ``bibtex_key`` stored on each paper.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        0 on success, 1 when the pool contains no papers.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--out", required=True, help="output refs.bib")
    args = p.parse_args(argv)

    # Explicit UTF-8: the pool is written back with ensure_ascii=False, which
    # fails or corrupts text under a non-UTF-8 locale default.
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    papers = pool.get("papers", [])
    if not papers:
        print("ERROR: pool contains no papers", file=sys.stderr)
        return 1

    taken: set[str] = set()  # every key handed out so far
    collisions: dict[str, int] = {}  # base key -> last suffix index used
    entries: list[str] = []
    paper_keys: list[str] = []

    for paper in papers:
        base_key = make_key(paper)
        key = base_key
        n = collisions.get(base_key, 0)
        # Disambiguate with "b", "c", ... "z", then "-27", "-28", ...
        # Checking `taken` also guards against a suffixed key coinciding
        # with another paper's base key.
        while key in taken:
            n += 1
            key = base_key + (chr(ord("a") + n) if n < 26 else f"-{n + 1}")
        collisions[base_key] = n
        taken.add(key)
        paper["bibtex_key"] = key
        paper_keys.append(key)
        entries.append(format_entry(paper, key))

    with open(args.out, "w", encoding="utf-8") as f:
        f.write("% Generated by paper-orchestra literature-review-agent/bibtex_format.py\n")
        f.write(f"% {len(entries)} entries from citation_pool.json\n\n")
        f.write("\n\n".join(entries))
        f.write("\n")

    # Write the keys back into the pool so the writing step has the
    # citation_checklist mapping. (Idempotent — overwrites with same data.)
    with open(args.pool, "w", encoding="utf-8") as f:
        json.dump(pool, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(entries)} BibTeX entries → {args.out}")
    print(f" keys: {', '.join(paper_keys[:5])}{'...' if len(paper_keys) > 5 else ''}")
    return 0
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
if __name__ == "__main__":
|
| 211 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/check_cutoff.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
check_cutoff.py — Strict temporal cutoff check for citation verification.
|
| 4 |
+
|
| 5 |
+
Implements the paper's Rule 3 (App. D.3): a paper passes only if its
|
| 6 |
+
publication date strictly predates the research cutoff. When only the year
|
| 7 |
+
is known, assume the worst case (Dec 31). When year + month are known,
|
| 8 |
+
assume day-1 of that month (per the paper's "first day of that month"
|
| 9 |
+
default).
|
| 10 |
+
|
| 11 |
+
Exit codes:
|
| 12 |
+
0 paper strictly predates cutoff (PASS)
|
| 13 |
+
1 paper does not strictly predate cutoff (FAIL)
|
| 14 |
+
2 argument error
|
| 15 |
+
|
| 16 |
+
Usage:
|
| 17 |
+
python check_cutoff.py --paper-year 2024 --paper-month 9 --cutoff 2024-10-01
|
| 18 |
+
python check_cutoff.py --paper-year 2024 --cutoff 2024-10-01
|
| 19 |
+
python check_cutoff.py --paper-date 2024-09-15 --cutoff 2024-10-01
|
| 20 |
+
"""
|
| 21 |
+
import argparse
|
| 22 |
+
import datetime as dt
|
| 23 |
+
import sys
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def main(argv=None) -> int:
    """Check whether a paper's date strictly predates the research cutoff.

    The paper date comes from ``--paper-date`` when given; otherwise from
    ``--paper-year`` plus optional ``--paper-month``. Year-only input is
    taken as Dec 31 of that year; year+month as day 1 of that month.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        0 if ``paper_date < cutoff`` (PASS), 1 otherwise (FAIL), 2 when an
        argument is missing or does not form a valid date.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--paper-year", type=int, help="Paper publication year")
    p.add_argument("--paper-month", type=int, help="Paper publication month (1-12), optional")
    p.add_argument("--paper-date", help="Full paper date YYYY-MM-DD, overrides year/month")
    p.add_argument("--cutoff", required=True, help="Research cutoff date YYYY-MM-DD")
    args = p.parse_args(argv)

    try:
        cutoff = dt.date.fromisoformat(args.cutoff)
    except ValueError:
        print(f"ERROR: --cutoff must be YYYY-MM-DD, got {args.cutoff}", file=sys.stderr)
        return 2

    if args.paper_date:
        try:
            paper_date = dt.date.fromisoformat(args.paper_date)
        except ValueError:
            print(f"ERROR: --paper-date must be YYYY-MM-DD, got {args.paper_date}", file=sys.stderr)
            return 2
    elif args.paper_year is not None:
        # `is not None` so that month 0 is rejected as invalid instead of
        # silently being treated as "no month given".
        try:
            if args.paper_month is not None:
                paper_date = dt.date(args.paper_year, args.paper_month, 1)
            else:
                paper_date = dt.date(args.paper_year, 12, 31)
        except ValueError as exc:
            # Out-of-range year/month is an argument error, not a crash
            print(f"ERROR: invalid --paper-year/--paper-month: {exc}", file=sys.stderr)
            return 2
    else:
        print("ERROR: must provide --paper-date OR --paper-year", file=sys.stderr)
        return 2

    if paper_date < cutoff:
        print(f"PASS paper={paper_date} < cutoff={cutoff}")
        return 0
    print(f"FAIL paper={paper_date} not strictly before cutoff={cutoff}")
    return 1
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/citation_coverage.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
citation_coverage.py — Enforce the paper's ≥90% citation integration rule
|
| 4 |
+
(App. D.3).
|
| 5 |
+
|
| 6 |
+
Greps a generated .tex file for all citation commands, counts the unique
|
| 7 |
+
keys actually cited, and compares against the verified citation pool.
|
| 8 |
+
Exits non-zero if coverage < 90%.
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json
|
| 12 |
+
python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json --threshold 0.85
|
| 13 |
+
"""
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
import re
|
| 17 |
+
import sys
|
| 18 |
+
|
| 19 |
+
# Citation commands whose brace argument holds one or more keys. Covers the
# natbib/biblatex forms \cite, \citep, \citet, \citealp, \citealt,
# \citeauthor, \citeyear, \autocite, \parencite, \textcite, their
# capitalized and starred variants, and up to two optional [..] arguments
# (e.g. \citep[see][p.~3]{key}).
CITE_RE = re.compile(
    r"\\(?:[Cc]ite(?:p|t|alp|alt|author|year)?|[Aa]utocite|[Pp]arencite|[Tt]extcite)"
    r"\*?"
    r"(?:\[[^\]]*\]){0,2}"
    r"\{([^}]+)\}"
)

# An unescaped % starts a LaTeX comment that runs to the end of the line.
_LATEX_COMMENT_RE = re.compile(r"(?<!\\)%.*")


def extract_cited_keys(tex: str) -> set[str]:
    """Return the set of citation keys used in LaTeX source *tex*.

    LaTeX comments are removed first so that commented-out citations do not
    count as cited. Each command matched by ``CITE_RE`` may hold several
    comma-separated keys; keys are stripped of whitespace and empty ones
    are ignored.
    """
    uncommented = _LATEX_COMMENT_RE.sub("", tex)
    return {
        key.strip()
        for match in CITE_RE.finditer(uncommented)
        for key in match.group(1).split(",")
        if key.strip()
    }
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def main(argv=None) -> int:
    """Gate a generated .tex file on citation-pool coverage.

    Loads the LaTeX file (``--tex``) and the citation pool (``--pool``),
    counts how many pool ``bibtex_key`` values are cited, and compares that
    count with ``int(threshold * pool_size)``. Cited keys that are not in
    the pool are reported as a warning; they do not affect the exit code.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        0 when coverage meets the threshold, 1 when it does not or when the
        pool has no ``bibtex_key`` values.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tex", required=True, help="LaTeX file to inspect")
    parser.add_argument("--pool", required=True, help="citation_pool.json")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.90,
        help="Minimum integration ratio (default 0.90 per paper)",
    )
    args = parser.parse_args(argv)

    # Explicit UTF-8 so the result does not depend on the platform locale
    with open(args.tex, encoding="utf-8") as f:
        tex = f.read()
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)

    pool_papers = pool.get("papers", [])
    keyed_papers = [paper for paper in pool_papers if paper.get("bibtex_key")]
    pool_keys = {paper.get("bibtex_key") for paper in keyed_papers}
    if not pool_keys:
        print("ERROR: pool has no bibtex_keys. Run bibtex_format.py first.", file=sys.stderr)
        return 1

    cited = extract_cited_keys(tex)
    cited_in_pool = cited & pool_keys
    n_pool = len(pool_keys)
    n_cited = len(cited_in_pool)
    ratio = n_cited / n_pool if n_pool else 0.0
    # Truncation toward zero: the required count is floor(threshold * n_pool)
    threshold_n = int(args.threshold * n_pool)

    print(
        f"Coverage: {n_cited}/{n_pool} = {ratio*100:.1f}% "
        f"(threshold {args.threshold*100:.0f}% = {threshold_n})"
    )

    # report keys cited but NOT in pool — those are forbidden by the prompt
    foreign = cited - pool_keys
    if foreign:
        print(
            f"\nWARNING: {len(foreign)} cited keys NOT in citation pool "
            f"(violates 'cite ONLY collected_papers' rule):"
        )
        for k in sorted(foreign):
            print(f" - {k}")

    if n_cited < threshold_n:
        uncited = pool_keys - cited
        print(f"\nFAIL: missing {len(uncited)} pool papers from .tex:")
        # show with title for actionable re-prompting; `or` guards cover
        # fields that are present but null
        title_by_key = {
            paper.get("bibtex_key"): paper.get("title") or "" for paper in keyed_papers
        }
        discovered_by_key = {
            paper.get("bibtex_key"): paper.get("discovered_for") or []
            for paper in keyed_papers
        }
        for k in sorted(uncited):
            tag = ",".join(str(d) for d in discovered_by_key.get(k, [])) or "?"
            t = title_by_key.get(k, "")
            print(f" - {k:40s} [{tag}] {t[:60]}")
        return 1

    print("OK: citation coverage meets threshold")
    return 0
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
|
| 104 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/dedupe_by_id.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
dedupe_by_id.py — Deduplicate a verified citation pool using Semantic Scholar
|
| 4 |
+
unique paperId, with DOI / ArXiv / normalized-title fallbacks.
|
| 5 |
+
|
| 6 |
+
Implements the paper's Rule 4 (App. D.3): "gathered citations are
|
| 7 |
+
deduplicated using unique paper ID keys".
|
| 8 |
+
|
| 9 |
+
Also computes `min_cite_paper_count = floor(0.9 * len(papers))` for the
|
| 10 |
+
≥90% citation integration rule.
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python dedupe_by_id.py --in raw_pool.json --out citation_pool.json [--cutoff 2024-10-01]
|
| 14 |
+
"""
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import math
|
| 18 |
+
import re
|
| 19 |
+
import sys
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def norm_title(t: str) -> str:
    """Collapse a title to a case-insensitive alphanumeric dedup key.

    The title is NFKD-decomposed and case-folded, then every character that
    is not alphanumeric is dropped. ASCII titles give the same result as
    lower-casing and keeping ``a-z0-9``; accented letters keep their base
    letter and non-Latin scripts are preserved, so two different non-ASCII
    titles no longer collapse to the same (possibly empty) key. ``None`` or
    empty input yields ``""``.
    """
    import unicodedata  # local import keeps this block self-contained

    folded = unicodedata.normalize("NFKD", t or "").casefold()
    return "".join(ch for ch in folded if ch.isalnum())
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def make_key(paper: dict) -> str:
    """Return the dedup key for *paper*, using the first identifier present.

    Precedence:
        1. ``paperId``            -> ``"s2:<id>"``
        2. ``externalIds.DOI``    -> ``"doi:<lower-cased doi>"``
        3. ``externalIds.ArXiv``  -> ``"arxiv:<id without version suffix>"``
        4. title                  -> ``"title:<norm_title(title)>"``
    """
    if paper.get("paperId"):
        return f"s2:{paper['paperId']}"
    ext = paper.get("externalIds") or {}
    if ext.get("DOI"):
        return f"doi:{ext['DOI'].lower()}"
    if ext.get("ArXiv"):
        # Strip only a trailing "v<digits>" version marker. Splitting on the
        # first "v" would truncate IDs that contain the letter elsewhere,
        # e.g. "solv-int/9901001v2".
        arxiv_id = re.sub(r"v\d+$", "", ext["ArXiv"].strip(), flags=re.IGNORECASE)
        return f"arxiv:{arxiv_id.lower()}"
    # `or ""` covers a title key that is present but null
    return f"title:{norm_title(paper.get('title') or '')}"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> int:
    """CLI entry point: deduplicate a citation pool and write the result.

    Reads the JSON file given by ``--in`` (an object with a ``papers`` or
    ``candidates`` list, or a bare list), collapses records that share a
    ``make_key`` key, and writes the survivors to ``--out`` together with
    ``min_cite_paper_count = floor(0.9 * n)``.

    When two records collide, the one with the strictly higher
    ``match_score`` is kept and the ``discovered_for`` tags of both are
    merged (order preserved, duplicates removed).

    Returns:
        0 on success, 1 when the input contains no candidate records.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--in", dest="inp", required=True, help="Raw verified pool JSON")
    p.add_argument("--out", required=True, help="Deduped citation_pool.json")
    p.add_argument("--cutoff", help="Cutoff date YYYY-MM-DD (recorded in output)")
    args = p.parse_args()

    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # platform's locale encoding must not be relied on for either direction.
    with open(args.inp, encoding="utf-8") as f:
        raw = json.load(f)

    if isinstance(raw, list):
        candidates = raw
    elif isinstance(raw, dict):
        candidates = raw.get("papers") or raw.get("candidates") or []
    else:
        candidates = []
    if not candidates:
        print("ERROR: input has neither 'papers' nor 'candidates' key", file=sys.stderr)
        return 1

    by_key: dict[str, dict] = {}
    collisions: list[tuple[str, str]] = []
    for c in candidates:
        key = make_key(c)
        existing = by_key.get(key)
        if existing is None:
            by_key[key] = c
            continue

        # ``or`` guards treat explicit nulls like missing values.
        merged = (existing.get("discovered_for") or []) + (c.get("discovered_for") or [])
        score_new = c.get("match_score") or 0
        score_old = existing.get("match_score") or 0
        winner = c if score_new > score_old else existing
        winner["discovered_for"] = list(dict.fromkeys(merged))  # preserve order, dedupe
        by_key[key] = winner
        collisions.append((key, c.get("title", "")))

    deduped = list(by_key.values())
    n = len(deduped)
    min_cite = math.floor(0.9 * n)

    out = {
        "papers": deduped,
        "min_cite_paper_count": min_cite,
        "n_total": n,
        "n_collisions_merged": len(collisions),
    }
    if args.cutoff:
        out["cutoff_date"] = args.cutoff

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(candidates)} candidates → {n} unique papers")
    print(f" {len(collisions)} duplicates merged")
    print(f" min_cite_paper_count (≥90%): {min_cite}")
    return 0
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
if __name__ == "__main__":
|
| 98 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/exa_search.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
exa_search.py — Optional Exa (https://exa.ai) backend for the literature
|
| 4 |
+
review agent's Phase 1 (parallel candidate discovery) step.
|
| 5 |
+
|
| 6 |
+
Exa is a search engine optimized for finding academic papers and other
|
| 7 |
+
high-quality content. It is OPTIONAL — the literature-review-agent works
|
| 8 |
+
fine with any host coding agent's native web search tool. Use Exa only if:
|
| 9 |
+
|
| 10 |
+
- Your host has no built-in web search (e.g., Aider, OpenCode, generic
|
| 11 |
+
CLI agents).
|
| 12 |
+
- You want a research-paper-focused search backend with better
|
| 13 |
+
signal-to-noise than general web search.
|
| 14 |
+
- You're running the pipeline in batch / non-interactive mode and want
|
| 15 |
+
a deterministic, scriptable backend.
|
| 16 |
+
|
| 17 |
+
This helper reads EXA_API_KEY from the environment. The key is YOUR
|
| 18 |
+
responsibility to provide; this repo never commits one. Get a key at
|
| 19 |
+
https://dashboard.exa.ai/.
|
| 20 |
+
|
| 21 |
+
Usage:
|
| 22 |
+
export EXA_API_KEY="your-key-here"
|
| 23 |
+
python exa_search.py --query "Sparse attention long context" --num-results 15
|
| 24 |
+
python exa_search.py --query "..." --raw # full JSON
|
| 25 |
+
python exa_search.py --query "..." --discovered-for "related_work[2.1]"
|
| 26 |
+
|
| 27 |
+
Default output: JSON candidates in the literature-review-agent format, ready
|
| 28 |
+
to be merged into raw_candidates.json before Phase 2 verification.
|
| 29 |
+
|
| 30 |
+
Exit codes:
|
| 31 |
+
0 query succeeded
|
| 32 |
+
1 EXA_API_KEY missing, HTTP error, network error, or empty results
|
| 33 |
+
"""
|
| 34 |
+
import argparse
|
| 35 |
+
import json
|
| 36 |
+
import os
|
| 37 |
+
import sys
|
| 38 |
+
import urllib.error
|
| 39 |
+
import urllib.request
|
| 40 |
+
|
| 41 |
+
EXA_ENDPOINT = "https://api.exa.ai/search"
|
| 42 |
+
DEFAULT_NUM = 10
|
| 43 |
+
MAX_NUM = 20 # the user explicitly asked for a 10-20 range
|
| 44 |
+
SNIPPET_CAP = 1500
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def search(query: str, num_results: int, category: str | None, highlight_max_chars: int) -> dict:
|
| 48 |
+
api_key = os.environ.get("EXA_API_KEY")
|
| 49 |
+
if not api_key:
|
| 50 |
+
print(
|
| 51 |
+
"ERROR: EXA_API_KEY environment variable not set.\n"
|
| 52 |
+
"Get a key at https://dashboard.exa.ai/ and run:\n"
|
| 53 |
+
' export EXA_API_KEY="your-key-here"\n'
|
| 54 |
+
"Then retry. The literature-review-agent also works without\n"
|
| 55 |
+
"Exa — see references/discovery-pipeline.md for the default\n"
|
| 56 |
+
"host-native web search path.",
|
| 57 |
+
file=sys.stderr,
|
| 58 |
+
)
|
| 59 |
+
sys.exit(1)
|
| 60 |
+
|
| 61 |
+
body: dict = {
|
| 62 |
+
"query": query,
|
| 63 |
+
"numResults": num_results,
|
| 64 |
+
"type": "auto",
|
| 65 |
+
"contents": {"highlights": {"maxCharacters": highlight_max_chars}},
|
| 66 |
+
}
|
| 67 |
+
if category:
|
| 68 |
+
body["category"] = category
|
| 69 |
+
|
| 70 |
+
req = urllib.request.Request(
|
| 71 |
+
EXA_ENDPOINT,
|
| 72 |
+
data=json.dumps(body).encode("utf-8"),
|
| 73 |
+
headers={
|
| 74 |
+
"content-type": "application/json",
|
| 75 |
+
"x-api-key": api_key,
|
| 76 |
+
},
|
| 77 |
+
method="POST",
|
| 78 |
+
)
|
| 79 |
+
try:
|
| 80 |
+
with urllib.request.urlopen(req, timeout=30) as resp:
|
| 81 |
+
return json.loads(resp.read().decode("utf-8"))
|
| 82 |
+
except urllib.error.HTTPError as e:
|
| 83 |
+
body_text = e.read().decode("utf-8", errors="replace")[:500]
|
| 84 |
+
print(f"ERROR: Exa HTTP {e.code}: {body_text}", file=sys.stderr)
|
| 85 |
+
sys.exit(1)
|
| 86 |
+
except urllib.error.URLError as e:
|
| 87 |
+
print(f"ERROR: Exa network error: {e.reason}", file=sys.stderr)
|
| 88 |
+
sys.exit(1)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def normalize(exa_response: dict, discovered_for: list[str]) -> list[dict]:
    """Map every entry of ``exa_response["results"]`` to a candidate dict.

    Each candidate carries the stripped title, the highlights joined with
    single spaces and cut to ``SNIPPET_CAP`` characters, the result URL
    (falling back to the Exa id, then ""), a fresh copy of
    ``discovered_for``, and the raw Exa id / published date.
    """

    def to_candidate(hit: dict) -> dict:
        title = (hit.get("title") or "").strip()
        link = hit.get("url") or hit.get("id") or ""
        pieces = [part.strip() for part in (hit.get("highlights") or [])]
        return {
            "title": title,
            "snippet": " ".join(pieces)[:SNIPPET_CAP],
            "source_url": link,
            # copy so candidates never share one list object
            "discovered_for": list(discovered_for),
            "_exa_id": hit.get("id"),
            "_exa_published_date": hit.get("publishedDate"),
        }

    return [to_candidate(hit) for hit in exa_response.get("results", [])]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main() -> int:
    """Parse CLI options, run one Exa query and print the result as JSON.

    The requested result count is clamped to ``[1, MAX_NUM]`` and an empty
    ``--category`` disables the category filter. With ``--raw`` the Exa
    response is printed unchanged; otherwise it is converted with
    ``normalize`` and wrapped as ``{"candidates": [...]}``.

    Returns:
        0 after printing results, 1 when the response has no results
        (a warning is written to stderr).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument("--query", required=True, help="Search query")
    parser.add_argument(
        "--num-results",
        type=int,
        default=DEFAULT_NUM,
        help=f"Number of results to fetch (default {DEFAULT_NUM}, clamped to [1, {MAX_NUM}])",
    )
    parser.add_argument(
        "--category",
        default="research paper",
        help='Exa category filter (default "research paper"; pass an empty string to disable)',
    )
    parser.add_argument(
        "--highlight-chars",
        type=int,
        default=4000,
        help="Max characters per highlight (default 4000)",
    )
    parser.add_argument(
        "--discovered-for",
        default="intro",
        help=(
            'Tag to attach to each candidate (default "intro"). Use "related_work[2.1]" or '
            "similar for cluster-specific queries so the downstream citation_coverage gate "
            "can attribute the citation to the right section."
        ),
    )
    parser.add_argument(
        "--raw",
        action="store_true",
        help="Print the full Exa response JSON unmodified instead of normalized candidates",
    )
    opts = parser.parse_args()

    wanted = min(max(opts.num_results, 1), MAX_NUM)
    category = opts.category if opts.category else None

    response = search(opts.query, wanted, category, opts.highlight_chars)
    if not response.get("results"):
        print(f"WARN: Exa returned 0 results for query: {opts.query!r}", file=sys.stderr)
        return 1

    if opts.raw:
        payload = response
    else:
        payload = {"candidates": normalize(response, [opts.discovered_for])}
    json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
|
| 169 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/levenshtein_match.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
levenshtein_match.py — Fuzzy title match for citation verification.
|
| 4 |
+
|
| 5 |
+
Implements the paper's Rule 1 (App. D.3): a candidate paper passes only if
|
| 6 |
+
its title's Levenshtein ratio against the Semantic Scholar hit's title is
|
| 7 |
+
strictly greater than 70.
|
| 8 |
+
|
| 9 |
+
Includes a substring-bypass safety net for short candidate titles (the
|
| 10 |
+
Linformer false-negative case): if the candidate is < 4 words and is
|
| 11 |
+
contained as a substring in the S2 hit's title, raise the ratio to at least 95.
|
| 12 |
+
|
| 13 |
+
Exit code is always 0; the integer ratio is printed to stdout. The caller
|
| 14 |
+
parses it and decides whether to discard.
|
| 15 |
+
|
| 16 |
+
Usage:
|
| 17 |
+
python levenshtein_match.py --candidate "..." --found "..."
|
| 18 |
+
python levenshtein_match.py --candidate "..." --found "..." --substring-bypass
|
| 19 |
+
"""
|
| 20 |
+
import argparse
|
| 21 |
+
import re
|
| 22 |
+
import sys
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
import Levenshtein
|
| 26 |
+
except ImportError:
|
| 27 |
+
print(
|
| 28 |
+
"ERROR: python-Levenshtein required. Install with: pip install python-Levenshtein",
|
| 29 |
+
file=sys.stderr,
|
| 30 |
+
)
|
| 31 |
+
sys.exit(2)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def normalize(s: str) -> str:
    """Return a canonical form of a title for fuzzy comparison.

    The text is lowercased, every character that is not ``a-z``, ``0-9`` or
    whitespace becomes a space, runs of whitespace collapse to one space,
    and leading/trailing whitespace is removed.
    """
    s = re.sub(r"[^a-z0-9\s]", " ", s.lower())
    # Strip AFTER punctuation has been turned into spaces; stripping first
    # left a stray space behind for titles such as "Linformer:".
    return re.sub(r"\s+", " ", s).strip()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def ratio(a: str, b: str, substring_bypass: bool = False) -> int:
    """Return the Levenshtein ratio of two titles as an integer percentage.

    Both titles are passed through ``normalize`` first.

    Args:
        a: Candidate title.
        b: Title to compare against.
        substring_bypass: When true, a candidate of one to three words that
            occurs verbatim inside ``b`` (after normalization) scores at
            least 95.

    Returns:
        ``round(Levenshtein.ratio(...) * 100)``, possibly raised to 95 by
        the substring bypass.
    """
    na, nb = normalize(a), normalize(b)
    r = int(round(Levenshtein.ratio(na, nb) * 100))
    words = na.split()
    # Require at least one word: an empty candidate is a substring of every
    # title and must not be bumped to a passing score.
    if substring_bypass and 0 < len(words) < 4 and na in nb:
        return max(r, 95)
    return r
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main() -> int:
    """Compare two titles from the command line and print the verdict.

    Prints ``"<ratio> PASS"`` when the ratio is strictly greater than
    ``--threshold`` and ``"<ratio> FAIL"`` otherwise. Always returns 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--candidate",
        required=True,
        help="The original candidate title (from web search)",
    )
    parser.add_argument(
        "--found",
        required=True,
        help="The title returned by Semantic Scholar",
    )
    parser.add_argument(
        "--substring-bypass",
        action="store_true",
        help="Bump short-candidate substring matches to 95",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=70,
        help="Print PASS/FAIL alongside the ratio (default 70)",
    )
    opts = parser.parse_args()

    score = ratio(opts.candidate, opts.found, opts.substring_bypass)
    if score > opts.threshold:
        verdict = "PASS"
    else:
        verdict = "FAIL"
    print(f"{score} {verdict}")
    return 0
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
pre_dedup_candidates.py — Deduplicate Phase 1 raw candidates by normalized
|
| 4 |
+
title before Phase 2 Semantic Scholar verification.
|
| 5 |
+
|
| 6 |
+
Multiple search queries in Phase 1 often return the same papers. Verifying
|
| 7 |
+
duplicates wastes S2 quota (1 QPS hard cap) and adds 30-40% unnecessary
|
| 8 |
+
wall-time. This script removes obvious duplicates — same paper found via
|
| 9 |
+
multiple queries — before the sequential verification loop begins.
|
| 10 |
+
|
| 11 |
+
Dedup strategy (in order of preference):
|
| 12 |
+
1. Exact arXiv ID match extracted from source URL or snippet.
|
| 13 |
+
2. Levenshtein ratio >= 92 on normalized titles (high threshold to avoid
|
| 14 |
+
false collisions between similarly-named papers).
|
| 15 |
+
|
| 16 |
+
When two candidates are considered the same, we keep the one that appeared
|
| 17 |
+
earlier in the list and merge their `discovered_for` attribution tags so
|
| 18 |
+
the surviving entry is credited to all originating queries.
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
python pre_dedup_candidates.py \\
|
| 22 |
+
--in workspace/raw_candidates.json \\
|
| 23 |
+
--out workspace/deduped_candidates.json
|
| 24 |
+
|
| 25 |
+
Input JSON shape:
|
| 26 |
+
{"candidates": [{"title": "...", "url": "...", "snippet": "...",
|
| 27 |
+
"discovered_for": ["intro.1"]}, ...]}
|
| 28 |
+
OR a bare list.
|
| 29 |
+
"""
|
| 30 |
+
import argparse
|
| 31 |
+
import json
|
| 32 |
+
import re
|
| 33 |
+
import sys
|
| 34 |
+
|
| 35 |
+
ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})", re.IGNORECASE)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def norm_title(t: str) -> str:
    """Lowercase a title and keep only ``a-z`` / ``0-9`` words.

    Every character other than a lowercase ASCII letter, a digit or a plain
    space is replaced by a space; the remaining words are then joined with
    single spaces.
    """
    spaced = re.sub(r"[^a-z0-9 ]", " ", t.lower())
    words = spaced.split()
    return " ".join(words)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def levenshtein_ratio(a: str, b: str) -> float:
    """Return a 0-100 similarity score based on Levenshtein edit distance.

    The score is ``(1 - distance / len(longer string)) * 100``. Two empty
    strings score 100.0; exactly one empty string scores 0.0.
    """
    if not (a or b):
        return 100.0
    if not (a and b):
        return 0.0

    # The longer string drives the outer loop so each DP row has the
    # length of the shorter one.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    width = len(shorter)

    row = list(range(width + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        nxt = [row_no]
        for col, ch_short in enumerate(shorter):
            substitute = row[col] + (0 if ch_long == ch_short else 1)
            nxt.append(min(row[col + 1] + 1, nxt[col] + 1, substitute))
        row = nxt

    distance = row[width]
    return (1.0 - distance / len(longer)) * 100.0
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def extract_arxiv_id(candidate: dict) -> str | None:
    """Return the first arXiv id matched by ``ARXIV_RE`` in a candidate.

    The ``url``, ``source_url`` and ``snippet`` fields are searched in that
    order. ``source_url`` is included because the Exa helper in this skill
    stores the result link under that key rather than ``url``. Missing or
    ``None`` fields are treated as empty text.

    Returns:
        The captured id, or ``None`` when no field matches.
    """
    for field in ("url", "source_url", "snippet"):
        m = ARXIV_RE.search(candidate.get(field) or "")
        if m:
            return m.group(1)
    return None
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def make_exact_key(candidate: dict) -> str:
    """Canonical key: arXiv ID if extractable, else normalized title.

    Returns ``"arxiv:<id>"`` when ``extract_arxiv_id`` finds an id and
    ``"title:<norm_title(title)>"`` otherwise. A missing or ``None`` title
    is treated as the empty string.
    """
    aid = extract_arxiv_id(candidate)
    if aid:
        return f"arxiv:{aid}"
    # ``or ""`` also covers an explicit JSON null, which ``.get(..., "")``
    # would pass straight through to norm_title.
    return f"title:{norm_title(candidate.get('title') or '')}"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def merge_discovered_for(a: dict, b: dict) -> list:
    """Combine the ``discovered_for`` tags of two candidates.

    Tags of ``a`` come first, then those of ``b``; repeated tags keep only
    their first occurrence. A missing or falsy ``discovered_for`` counts
    as an empty list.
    """
    tags = (a.get("discovered_for") or []) + (b.get("discovered_for") or [])
    first_seen: dict = {}
    for tag in tags:
        # dicts keep insertion order, so this is an ordered de-duplication
        first_seen.setdefault(tag, None)
    return list(first_seen)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def dedup(candidates: list[dict], title_ratio_threshold: float = 92.0) -> list[dict]:
    """Remove duplicate candidates, keeping the earliest occurrence.

    Pass 1 merges candidates that share a ``make_exact_key`` key. Pass 2
    merges candidates whose normalized titles reach
    ``title_ratio_threshold`` under ``levenshtein_ratio``. Whenever two
    candidates merge, the survivor's ``discovered_for`` becomes the ordered
    union of both tag lists.

    Candidates whose title normalizes to the empty string (missing title,
    or a title with no ASCII letters or digits) carry no usable identity:
    they are never merged with each other on the basis of that empty title.

    Args:
        candidates: Candidate dicts; the input dicts are not mutated
            (survivors are shallow copies).
        title_ratio_threshold: Minimum ratio (0-100) for a fuzzy match.

    Returns:
        The surviving candidates in first-seen order.
    """
    # Pass 1: exact key dedup (arXiv ID or identical normalized title)
    by_key: dict[str, dict] = {}
    for idx, c in enumerate(candidates):
        key = make_exact_key(c)
        if key == "title:":
            # No arXiv id and an empty normalized title: give the candidate
            # a unique slot instead of lumping all such entries together.
            key = f"untitled:{idx}"
        if key in by_key:
            by_key[key]["discovered_for"] = merge_discovered_for(by_key[key], c)
        else:
            by_key[key] = dict(c)

    deduped = list(by_key.values())

    # Pass 2: fuzzy title dedup — O(n²) but n is ~50-100 candidates max
    normed = [norm_title(c.get("title") or "") for c in deduped]
    drop: set[int] = set()
    for i, title_i in enumerate(normed):
        if i in drop or not title_i:
            continue
        for j in range(i + 1, len(deduped)):
            # Two empty titles would score 100 and merge unrelated entries.
            if j in drop or not normed[j]:
                continue
            if levenshtein_ratio(title_i, normed[j]) >= title_ratio_threshold:
                deduped[i]["discovered_for"] = merge_discovered_for(deduped[i], deduped[j])
                drop.add(j)

    return [c for idx, c in enumerate(deduped) if idx not in drop]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def main() -> int:
    """CLI entry point: deduplicate Phase 1 candidates and write the result.

    Reads ``--in`` (a JSON array, or an object with a ``candidates`` or
    ``papers`` list), runs ``dedup`` with the ``--title-ratio`` threshold
    and writes the survivors plus before/after counts to ``--out``.

    Returns:
        0 on success, 1 when the input is not a list or such an object.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--in", dest="inp", required=True, help="Raw Phase 1 candidates JSON")
    p.add_argument("--out", required=True, help="Deduped candidates JSON")
    p.add_argument(
        "--title-ratio",
        type=float,
        default=92.0,
        help="Levenshtein ratio threshold for fuzzy title match (default: 92)",
    )
    args = p.parse_args()

    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # platform's locale encoding must not be relied on for either direction.
    with open(args.inp, encoding="utf-8") as f:
        raw = json.load(f)

    if isinstance(raw, list):
        candidates = raw
    elif isinstance(raw, dict):
        candidates = raw.get("candidates") or raw.get("papers") or []
    else:
        # Scalar JSON roots have no .get(); report them as bad input below.
        candidates = None

    if not isinstance(candidates, list):
        print("ERROR: input must be a JSON array or object with 'candidates' key", file=sys.stderr)
        return 1

    before = len(candidates)
    result = dedup(candidates, title_ratio_threshold=args.title_ratio)
    after = len(result)
    removed = before - after

    out_obj = {
        "candidates": result,
        "n_before_dedup": before,
        "n_after_dedup": after,
        "n_removed": removed,
    }
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out_obj, f, indent=2, ensure_ascii=False)

    print(f"OK: {before} candidates → {after} unique ({removed} duplicates removed)")
    return 0
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
|
| 156 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/s2_cache.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
s2_cache.py — Persistent Semantic Scholar verification cache.
|
| 4 |
+
|
| 5 |
+
Problem: Phase 2 verification is throttled to 1 QPS. If a pipeline run
|
| 6 |
+
fails partway through (gate error, network timeout, interrupted session),
|
| 7 |
+
re-running wastes the full S2 wait time again on already-verified papers.
|
| 8 |
+
|
| 9 |
+
Solution: a flat JSON cache at workspace/cache/s2_cache.json. On a cache
|
| 10 |
+
HIT the script emits the stored response and exits 0 so the caller can skip
|
| 11 |
+
the live S2 request. On a cache MISS it exits 1. After a live request the
|
| 12 |
+
caller stores the result with --store.
|
| 13 |
+
|
| 14 |
+
The cache key is derived from the normalized query title (lowercase,
|
| 15 |
+
alphanumeric only) so minor whitespace differences still hit.
|
| 16 |
+
|
| 17 |
+
Usage:
|
| 18 |
+
|
| 19 |
+
CHECK mode — exits 0 + prints JSON if cached, else exits 1:
|
| 20 |
+
python s2_cache.py --cache workspace/cache/s2_cache.json \\
|
| 21 |
+
--check "Attention Is All You Need"
|
| 22 |
+
|
| 23 |
+
STORE mode — write a response into the cache:
|
| 24 |
+
python s2_cache.py --cache workspace/cache/s2_cache.json \\
|
| 25 |
+
--store "Attention Is All You Need" \\
|
| 26 |
+
--response '{"paperId": "...", "title": "..."}'
|
| 27 |
+
|
| 28 |
+
STATS mode — print the cache size and a sample of cached keys:
|
| 29 |
+
python s2_cache.py --cache workspace/cache/s2_cache.json --stats
|
| 30 |
+
"""
|
| 31 |
+
import argparse
|
| 32 |
+
import json
|
| 33 |
+
import os
|
| 34 |
+
import re
|
| 35 |
+
import sys
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def norm_key(title: str) -> str:
    """Build the cache key for a title: lowercase ASCII letters and digits only."""
    lowered = title.lower()
    key = re.sub(r"[^a-z0-9]", "", lowered)
    return key
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_cache(path: str) -> dict:
    """Load the cache mapping stored at ``path``.

    Returns:
        The decoded JSON object, or an empty dict when ``path`` is not an
        existing file, cannot be decoded as UTF-8 JSON, or holds a JSON
        value that is not an object.
    """
    if not os.path.isfile(path):
        return {}
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; either way the file is unusable as a cache.
        return {}
    # Callers index the cache by key, so only a JSON object is acceptable.
    return data if isinstance(data, dict) else {}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def save_cache(path: str, cache: dict) -> None:
    """Write ``cache`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created if needed. The data is written to a
    sibling ``<path>.tmp`` file and then moved over ``path`` with
    ``os.replace``, so an interrupted run leaves the previous cache file
    untouched instead of a truncated one.
    """
    target = os.path.abspath(path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a half-written temp file behind; re-raise unchanged.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def main() -> int:
    """CLI entry point for the cache: ``--check``, ``--store`` or ``--stats``.

    Returns:
        * ``--stats``: 0 after printing the entry count and up to five keys.
        * ``--check TITLE``: 0 and the cached JSON on a hit, 1 on a miss.
        * ``--store TITLE``: 0 after saving; 2 when ``--response`` is
          missing or not valid JSON, or when TITLE yields an empty key.

    A title whose ``norm_key`` is empty (no ASCII letters or digits) is
    never looked up or stored: all such titles would share one cache slot
    and return each other's responses.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--cache", required=True, help="Path to cache JSON file")

    mode = p.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--check",
        metavar="TITLE",
        help="Check for title; exit 0 + print JSON if found, else exit 1",
    )
    mode.add_argument(
        "--store", metavar="TITLE", help="Store a response for TITLE (requires --response)"
    )
    mode.add_argument("--stats", action="store_true", help="Print cache statistics")

    p.add_argument(
        "--response", metavar="JSON", help="S2 response JSON to store (used with --store)"
    )
    args = p.parse_args()

    cache = load_cache(args.cache)

    if args.stats:
        print(f"Cache file : {args.cache}")
        print(f"Entries : {len(cache)}")
        if cache:
            print("Sample keys:", list(cache.keys())[:5])
        return 0

    # ``is not None`` so that ``--check ""`` stays in check mode instead of
    # falling through to the store branch.
    if args.check is not None:
        key = norm_key(args.check)
        if key and key in cache:
            print(json.dumps(cache[key]))
            return 0  # HIT
        return 1  # MISS

    # --store mode
    if not args.response:
        print("ERROR: --store requires --response", file=sys.stderr)
        return 2
    try:
        response = json.loads(args.response)
    except json.JSONDecodeError as e:
        print(f"ERROR: invalid JSON in --response: {e}", file=sys.stderr)
        return 2

    key = norm_key(args.store)
    if not key:
        print(
            f"ERROR: title {args.store!r} has no ASCII letters or digits; not cached",
            file=sys.stderr,
        )
        return 2
    cache[key] = response
    save_cache(args.cache, cache)
    print(f"OK: cached '{args.store}' → key '{key}' ({len(cache)} total entries)")
    return 0
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if __name__ == "__main__":
|
| 113 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/s2_search.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
s2_search.py — Semantic Scholar title-search helper for Phase 2 verification.
|
| 4 |
+
|
| 5 |
+
Queries the Semantic Scholar Graph API for a paper by title and returns the
|
| 6 |
+
top candidate hits as JSON. Used by the literature-review-agent to verify
|
| 7 |
+
each candidate from Phase 1 before adding it to citation_pool.json.
|
| 8 |
+
|
| 9 |
+
API key (optional):
|
| 10 |
+
If SEMANTIC_SCHOLAR_API_KEY is set in the environment the key is forwarded
|
| 11 |
+
via the ``x-api-key`` header, which raises the rate limit from ~100 req/5 min
|
| 12 |
+
(unauthenticated) to 1 req/s sustained with higher burst headroom.
|
| 13 |
+
If the variable is absent the script falls back to the public unauthenticated
|
| 14 |
+
endpoint — the pipeline works fine without a key; just keep to ≤1 QPS.
|
| 15 |
+
|
| 16 |
+
Get a free key at: https://api.semanticscholar.org/
|
| 17 |
+
Then export it once before running the pipeline:
|
| 18 |
+
export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# check for key and search
|
| 22 |
+
python s2_search.py --query "Attention is All You Need"
|
| 23 |
+
|
| 24 |
+
# request more hits and extra fields
|
| 25 |
+
python s2_search.py --query "BERT pre-training" --limit 10 \\
|
| 26 |
+
--fields title,abstract,year,authors,venue,externalIds,citationCount
|
| 27 |
+
|
| 28 |
+
# pretty-print raw S2 JSON
|
| 29 |
+
python s2_search.py --query "GPT-4 technical report" --raw
|
| 30 |
+
|
| 31 |
+
Exit codes:
|
| 32 |
+
0 at least one result returned
|
| 33 |
+
1 HTTP error, network error, or zero results
|
| 34 |
+
2 usage error (bad arguments)
|
| 35 |
+
"""
|
| 36 |
+
import argparse
|
| 37 |
+
import json
|
| 38 |
+
import os
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
import urllib.error
|
| 42 |
+
import urllib.parse
|
| 43 |
+
import urllib.request
|
| 44 |
+
|
| 45 |
+
S2_BASE = "https://api.semanticscholar.org/graph/v1"
|
| 46 |
+
DEFAULT_FIELDS = "title,abstract,year,authors,venue,externalIds"
|
| 47 |
+
DEFAULT_LIMIT = 5
|
| 48 |
+
MAX_LIMIT = 100
|
| 49 |
+
_RETRY_SLEEP = 5 # seconds to wait after a 429 before retrying
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _build_headers() -> dict:
    """Return the HTTP headers for a Semantic Scholar request.

    ``Accept: application/json`` is always present. ``x-api-key`` is added
    when ``SEMANTIC_SCHOLAR_API_KEY`` is set to a non-blank value
    (surrounding whitespace is stripped).
    """
    token = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
    if not token:
        return {"Accept": "application/json"}
    return {"Accept": "application/json", "x-api-key": token}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def search(query: str, limit: int, fields: str, retries: int = 3) -> dict:
    """
    Call /paper/search and return the parsed JSON response.

    Args:
        query: Title or free-text query.
        limit: Value of the ``limit`` query parameter.
        fields: Comma-separated value of the ``fields`` query parameter.
        retries: Maximum number of attempts. HTTP 429 is retried after
            ``_RETRY_SLEEP`` seconds; HTTP 500/502/503 after 30 seconds.

    Returns:
        The decoded JSON body. HTTP 404 is reported as the empty result
        ``{"total": 0, "data": []}``.

    Raises SystemExit (code 1, message on stderr) on unrecoverable errors
    so the caller (or CLI) gets a clean non-zero exit code: exhausted
    retries, any other HTTP status, a network error, or a response body
    that is not valid UTF-8 JSON.
    """
    params = urllib.parse.urlencode(
        {
            "query": query,
            "limit": limit,
            "fields": fields,
        }
    )
    url = f"{S2_BASE}/paper/search?{params}"
    headers = _build_headers()

    for attempt in range(1, retries + 1):
        req = urllib.request.Request(url, headers=headers, method="GET")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                raw_body = resp.read()
        except urllib.error.HTTPError as exc:
            if exc.code == 429:
                if attempt < retries:
                    print(
                        f"WARN: S2 rate-limited (429). Sleeping {_RETRY_SLEEP}s "
                        f"before retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(_RETRY_SLEEP)
                    continue
                print(
                    "ERROR: S2 rate-limited (429) and retries exhausted.\n"
                    "Tip: set SEMANTIC_SCHOLAR_API_KEY to get a higher rate limit.\n"
                    " See https://api.semanticscholar.org/ for a free key.",
                    file=sys.stderr,
                )
                sys.exit(1)
            if exc.code == 404:
                # not found — return an empty result set (caller handles this)
                return {"total": 0, "data": []}
            if exc.code in (500, 502, 503):
                if attempt < retries:
                    print(
                        f"WARN: S2 server error ({exc.code}). Sleeping 30s before "
                        f"retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(30)
                    continue
                print(
                    f"ERROR: S2 server error ({exc.code}) after {retries} attempts.",
                    file=sys.stderr,
                )
                sys.exit(1)
            body = exc.read().decode("utf-8", errors="replace")[:400]
            print(f"ERROR: S2 HTTP {exc.code}: {body}", file=sys.stderr)
            sys.exit(1)
        except urllib.error.URLError as exc:
            print(f"ERROR: Network error reaching Semantic Scholar: {exc.reason}", file=sys.stderr)
            sys.exit(1)
        except OSError as exc:
            # Timeouts or resets while waiting for / reading the response
            # are raised as plain OSError subclasses, not as URLError.
            print(f"ERROR: Network error reaching Semantic Scholar: {exc}", file=sys.stderr)
            sys.exit(1)

        try:
            return json.loads(raw_body.decode("utf-8"))
        except ValueError as exc:
            # Covers both UnicodeDecodeError and json.JSONDecodeError.
            print(f"ERROR: S2 returned an unparseable response: {exc}", file=sys.stderr)
            sys.exit(1)

    # Only reached when ``retries`` is less than 1 (the loop never runs).
    sys.exit(1)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def main() -> int:
    """Command-line entry point for the Semantic Scholar lookup script.

    Parses the CLI flags, then does one of:

    * ``--check-key``: report whether SEMANTIC_SCHOLAR_API_KEY is set (the key
      is masked in the output) and return without calling ``search``.
    * otherwise: call ``search(query, limit, fields)`` and print either the
      raw response (``--raw``) or a normalized ``{total, authenticated, data}``
      JSON object on stdout.

    Returns:
        0 on success (including ``--check-key``), 1 when the response holds no
        results. A missing ``--query`` is reported through ``parser.error``,
        which exits with argparse's usual status 2.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # NOTE: --query is deliberately not ``required=True``. argparse enforces
    # required options before any of our code runs, which would make
    # ``--check-key`` unusable on its own. It is validated below instead.
    p.add_argument(
        "--query",
        default=None,
        help="Paper title (or search query) to look up on Semantic Scholar",
    )
    p.add_argument(
        "--limit",
        type=int,
        default=DEFAULT_LIMIT,
        help=f"Max hits to return (default {DEFAULT_LIMIT}, max {MAX_LIMIT})",
    )
    p.add_argument(
        "--fields",
        default=DEFAULT_FIELDS,
        help=f"Comma-separated S2 fields to request (default: {DEFAULT_FIELDS})",
    )
    p.add_argument(
        "--raw",
        action="store_true",
        help="Print the full S2 JSON response unmodified instead of normalized output",
    )
    p.add_argument(
        "--check-key",
        action="store_true",
        help="Print whether SEMANTIC_SCHOLAR_API_KEY is set and exit (no network call)",
    )
    args = p.parse_args()

    if args.check_key:
        key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
        if key:
            # Never echo the full secret: show only the first/last 4 chars,
            # and nothing at all for keys too short to mask meaningfully.
            masked = key[:4] + "..." + key[-4:] if len(key) > 8 else "****"
            print(
                f"SEMANTIC_SCHOLAR_API_KEY is set ({masked}). "
                "Authenticated mode: higher rate limits."
            )
        else:
            print(
                "SEMANTIC_SCHOLAR_API_KEY is NOT set. "
                "Unauthenticated mode: ~100 req/5 min, keep to ≤1 QPS.\n"
                "To enable higher rate limits:\n"
                " 1. Get a free key at https://api.semanticscholar.org/\n"
                ' 2. export SEMANTIC_SCHOLAR_API_KEY="your-key-here"'
            )
        return 0

    # Every mode other than --check-key needs a query; mirror argparse's own
    # wording and exit status for a missing required option.
    if args.query is None:
        p.error("the following arguments are required: --query")

    # Clamp the requested limit into [1, MAX_LIMIT].
    limit = max(1, min(MAX_LIMIT, args.limit))
    response = search(args.query, limit, args.fields)

    if args.raw:
        json.dump(response, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write("\n")
        return 0

    data = response.get("data") or []
    if not data:
        print(
            f"WARN: Semantic Scholar returned 0 results for query: {args.query!r}",
            file=sys.stderr,
        )
        # Still emit a well-formed empty result so downstream JSON parsing works.
        json.dump({"total": 0, "data": []}, sys.stdout, indent=2)
        sys.stdout.write("\n")
        return 1

    # Emit normalized output (subset of fields used by pipeline)
    out = {
        "total": response.get("total", len(data)),
        "authenticated": bool(os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()),
        "data": data,
    }
    json.dump(out, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
if __name__ == "__main__":
|
| 208 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/sync_keys.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
r"""
|
| 3 |
+
sync_keys.py — Synchronize citation keys in a .tex file with the canonical
|
| 4 |
+
bibtex_key values stored in citation_pool.json.
|
| 5 |
+
|
| 6 |
+
Problem: The Literature Review Agent writes cite keys in its own format
|
| 7 |
+
(e.g. 'lewis2020rag'), while bibtex_format.py generates canonical keys from
|
| 8 |
+
author + year + first-significant-title-word (e.g. 'lewis2020retrievalaugmented').
|
| 9 |
+
After running bibtex_format.py these two sources are out of sync, causing the
|
| 10 |
+
citation_coverage gate to fail (it looks for \cite{canonical_key} in the .tex).
|
| 11 |
+
|
| 12 |
+
This script reads the 'key' -> 'bibtex_key' mapping from citation_pool.json
|
| 13 |
+
and performs a targeted substitution inside \cite{}, \citep{}, \citet{}
|
| 14 |
+
commands in the target .tex file. It handles multi-key citations like
|
| 15 |
+
\cite{a,b,c} correctly.
|
| 16 |
+
|
| 17 |
+
Run this immediately after bibtex_format.py, before Step 4 (Section Writing).
|
| 18 |
+
|
| 19 |
+
Usage:
|
| 20 |
+
python sync_keys.py \
|
| 21 |
+
--pool workspace/citation_pool.json \
|
| 22 |
+
--tex workspace/drafts/intro_relwork.tex \
|
| 23 |
+
--inplace
|
| 24 |
+
|
| 25 |
+
# Without --inplace: prints updated content to stdout (safe preview mode).
|
| 26 |
+
"""
|
| 27 |
+
import argparse
|
| 28 |
+
import json
|
| 29 |
+
import re
|
| 30 |
+
import sys
|
| 31 |
+
|
| 32 |
+
# Matches \cite, \citep, \citet, \citealt, \citealp, \citeauthor, \citeyear,
|
| 33 |
+
# starred variants like \cite*, and the optional [prenote][postnote] args.
|
| 34 |
+
CITE_RE = re.compile(
    r"""
    (\\cite[a-zA-Z*]*)   # group 1: the cite command, including any letter or star suffix
    (?:\[[^\]]*\])*      # any number of optional square-bracket note arguments
    \{([^}]+)\}          # group 2: the comma-separated key list inside the braces
    """,
    re.VERBOSE,
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def build_key_map(pool: dict) -> dict[str, str]:
    """Collect the agent-key -> canonical-key renames recorded in the pool.

    Each entry of ``pool["papers"]`` contributes a mapping from its ``key`` to
    its ``bibtex_key`` when both are present, non-empty, and different.
    Entries whose two keys already agree are left out.
    """
    key_pairs = (
        (entry.get("key"), entry.get("bibtex_key")) for entry in pool.get("papers", [])
    )
    return {
        agent_key: canonical
        for agent_key, canonical in key_pairs
        if agent_key and canonical and agent_key != canonical
    }
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def replace_keys(content: str, key_map: dict[str, str]) -> tuple[str, int]:
    """Rewrite citation keys inside LaTeX cite commands.

    Every match of the module-level ``CITE_RE`` in ``content`` has its
    comma-separated key list split, each key stripped of surrounding
    whitespace and looked up in ``key_map``. Mapped keys are replaced,
    unmapped keys are kept, and the list is re-joined with ", ".

    Args:
        content: Text of the .tex file.
        key_map: Mapping of old citation key -> new citation key.

    Returns:
        ``(updated_content, n_replaced)`` where ``n_replaced`` counts the
        individual keys that were substituted. ``content`` is returned
        unchanged with a count of 0 when ``key_map`` is empty.
    """
    if not key_map:
        return content, 0

    n_replaced = 0

    def replacer(m: re.Match) -> str:
        nonlocal n_replaced
        new_keys: list[str] = []
        for raw_key in m.group(2).split(","):
            k = raw_key.strip()
            if k in key_map:
                new_keys.append(key_map[k])
                n_replaced += 1
            else:
                new_keys.append(k)
        # Copy the command, any bracket notes, and the opening brace verbatim.
        # Slice by the offset of the key group rather than searching for the
        # first "{": a note such as [see {\em x}] contains a brace of its own,
        # and cutting there would drop the rest of the note.
        prefix = m.group(0)[: m.start(2) - m.start(0)]
        return f"{prefix}{', '.join(new_keys)}}}"

    updated = CITE_RE.sub(replacer, content)
    return updated, n_replaced
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def main() -> int:
    """Command-line entry point: sync cite keys in a .tex file with the pool.

    Loads ``--pool`` as JSON, derives the key map with ``build_key_map``, and
    applies it to the contents of ``--tex`` with ``replace_keys``. With
    ``--inplace`` the file is overwritten; otherwise the updated text goes to
    stdout and a one-line summary to stderr.

    Returns:
        0 in all non-exceptional cases, including when there is nothing to sync.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage example in the module docstring readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--tex", required=True, help="Target .tex file to update")
    p.add_argument(
        "--inplace", action="store_true", help="Overwrite --tex in place (default: print to stdout)"
    )
    args = p.parse_args()

    # Read JSON as UTF-8 explicitly rather than relying on the platform's
    # locale encoding.
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    key_map = build_key_map(pool)

    if not key_map:
        print("OK: no key differences in citation_pool.json — nothing to sync")
        return 0

    print(f"Key map ({len(key_map)} substitutions):")
    for old, new in key_map.items():
        print(f" {old} → {new}")

    # newline="" disables newline translation so the file's existing line
    # endings survive the read/write round trip unchanged.
    with open(args.tex, encoding="utf-8", newline="") as f:
        content = f.read()

    updated, n = replace_keys(content, key_map)

    if args.inplace:
        with open(args.tex, "w", encoding="utf-8", newline="") as f:
            f.write(updated)
        print(f"OK: {n} citation key(s) updated in {args.tex}")
    else:
        sys.stdout.write(updated)
        print(f"\n# sync_keys: {n} substitution(s) would be made", file=sys.stderr)

    return 0
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
sys.exit(main())
|
.scider/skills/literature-review-agent/scripts/validate_pool.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
validate_pool.py — Validate and auto-fix citation_pool.json before it is
|
| 4 |
+
passed to bibtex_format.py or the Section Writing Agent.
|
| 5 |
+
|
| 6 |
+
Catches the two most common schema errors produced by the Literature Review
|
| 7 |
+
Agent and fixes them in place with --fix.
|
| 8 |
+
|
| 9 |
+
Error 1 — Authors as plain strings (WRONG format for bibtex_format.py):
|
| 10 |
+
WRONG: "authors": ["Alice Smith", "Bob Jones"]
|
| 11 |
+
CORRECT: "authors": [{"name": "Alice Smith"}, {"name": "Bob Jones"}]
|
| 12 |
+
|
| 13 |
+
Error 2 — Missing required fields (title, year). These cause bibtex_format.py
|
| 14 |
+
to emit incomplete entries. Reported as errors, not auto-fixed.
|
| 15 |
+
|
| 16 |
+
Also checks that the pool has the top-level keys that downstream scripts
|
| 17 |
+
expect: "papers", "min_cite_paper_count".
|
| 18 |
+
|
| 19 |
+
Exit codes:
|
| 20 |
+
0 Pool is valid (or was fully fixed with --fix)
|
| 21 |
+
1 Unrecoverable errors remain (missing required fields, no papers)
|
| 22 |
+
|
| 23 |
+
Usage:
|
| 24 |
+
python validate_pool.py --pool workspace/citation_pool.json
|
| 25 |
+
python validate_pool.py --pool workspace/citation_pool.json --fix
|
| 26 |
+
"""
|
| 27 |
+
import argparse
|
| 28 |
+
import json
|
| 29 |
+
import sys
|
| 30 |
+
|
| 31 |
+
REQUIRED_PAPER_FIELDS = ["title", "year"]
RECOMMENDED_PAPER_FIELDS = ["paperId", "abstract", "venue", "authors"]
REQUIRED_TOP_FIELDS = ["papers", "min_cite_paper_count"]


def validate_and_fix(pool: dict, fix: bool) -> tuple[list[str], list[str], int]:
    """Validate a citation pool and optionally repair the authors format.

    Checks performed:

    * each name in ``REQUIRED_TOP_FIELDS`` is present at the top level
      (a warning if not);
    * ``pool["papers"]`` is non-empty (an error, and validation stops);
    * per paper: ``authors`` is a list whose entries are dicts. Plain-string
      entries are converted to ``{"name": <string>}`` when ``fix`` is true and
      reported as an error otherwise. Entries that are neither ``str`` nor
      ``dict`` are always reported as errors;
    * per paper: every ``REQUIRED_PAPER_FIELDS`` entry is truthy (an error)
      and every ``RECOMMENDED_PAPER_FIELDS`` entry is truthy (a warning).

    Args:
        pool: Parsed citation_pool.json content. Mutated in place when
            ``fix`` is true and a paper has plain-string authors.
        fix: Whether to auto-convert plain-string authors.

    Returns:
        ``(errors, warnings, n_fixed)`` where ``n_fixed`` is the number of
        papers whose ``authors`` list was rewritten.
    """
    errors: list[str] = []
    warnings: list[str] = []
    n_fixed = 0

    # Top-level structure
    for field in REQUIRED_TOP_FIELDS:
        if field not in pool:
            warnings.append(f"top-level field '{field}' missing — was dedupe_by_id.py run?")

    papers = pool.get("papers", [])
    if not papers:
        errors.append("pool['papers'] is empty or missing")
        return errors, warnings, n_fixed

    for i, paper in enumerate(papers):
        # A non-dict entry has no fields to inspect; report it instead of
        # crashing on ``.get``.
        if not isinstance(paper, dict):
            errors.append(f"[paper #{i}] entry must be an object, got {type(paper).__name__}")
            continue

        label = paper.get("title") or f"paper #{i}"

        # --- Authors format check ---
        authors = paper.get("authors")
        if authors is not None:
            if not isinstance(authors, list):
                errors.append(f"[{label}] 'authors' must be a list, got {type(authors).__name__}")
            elif authors:
                # Inspect every entry, not just the first: lists can mix
                # plain strings with already-correct dicts.
                for idx, author in enumerate(authors):
                    if not isinstance(author, (str, dict)):
                        errors.append(
                            f"[{label}] authors[{idx}] is {type(author).__name__}, "
                            f"expected dict with 'name' key"
                        )
                plain = [a for a in authors if isinstance(a, str)]
                if plain:
                    if fix:
                        # Wrap only the plain strings; dict entries are kept
                        # as they are so they are not double-wrapped.
                        paper["authors"] = [
                            {"name": a} if isinstance(a, str) else a for a in authors
                        ]
                        n_fixed += 1
                    else:
                        errors.append(
                            f"[{label}] authors are plain strings "
                            f'(e.g. "{plain[0]}") — run with --fix to auto-convert'
                        )

        # --- Required fields ---
        for field in REQUIRED_PAPER_FIELDS:
            if not paper.get(field):
                errors.append(f"[{label}] missing required field '{field}'")

        # --- Recommended fields ---
        for field in RECOMMENDED_PAPER_FIELDS:
            if not paper.get(field):
                warnings.append(f"[{label}] missing recommended field '{field}'")

    return errors, warnings, n_fixed
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def main() -> int:
    """Command-line entry point: validate (and optionally fix) a citation pool.

    Loads ``--pool`` as JSON, runs ``validate_and_fix``, prints warnings to
    stdout (unless ``--quiet``) and errors to stderr, and writes the pool back
    to the same path when ``--fix`` changed at least one paper.

    Returns:
        0 when no errors remain, 1 otherwise.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the examples in the module docstring readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json path")
    p.add_argument(
        "--fix",
        action="store_true",
        help="Auto-fix recoverable errors (authors format) and write back",
    )
    p.add_argument("--quiet", action="store_true", help="Suppress warnings, only show errors")
    args = p.parse_args()

    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)

    errors, warnings, n_fixed = validate_and_fix(pool, fix=args.fix)

    if not args.quiet:
        for w in warnings:
            print(f"WARN: {w}")

    for e in errors:
        print(f"ERROR: {e}", file=sys.stderr)

    if errors and not args.fix:
        print(
            "\nTip: re-run with --fix to auto-correct recoverable issues (authors format).",
            file=sys.stderr,
        )
        return 1

    if n_fixed > 0:
        # ensure_ascii=False writes non-ASCII characters verbatim, so the
        # file encoding must be pinned rather than left to the locale.
        with open(args.pool, "w", encoding="utf-8") as f:
            json.dump(pool, f, indent=2, ensure_ascii=False)
        print(f"OK: {n_fixed} paper(s) auto-fixed and written back to {args.pool}")

    n = len(pool.get("papers", []))
    if not errors:
        if n_fixed == 0:
            print(f"OK: {n} papers validated — no errors")
        else:
            print(f"OK: {n} papers validated after auto-fix")

    # With --fix, recoverable problems are repaired rather than reported, so
    # any error still listed here is unrecoverable and must fail the run,
    # whether or not something else was fixed along the way.
    return 1 if errors else 0
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
sys.exit(main())
|
.scider/skills/matplotlib/SKILL.md
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: matplotlib
|
| 3 |
+
description: Low-level plotting library for full customization. Use when you need fine-grained control over every plot element, novel plot types, or publication-quality PNG/PDF/SVG export. For quick statistical plots use seaborn.
|
| 4 |
+
allowed_agents: [experiment, native_coding]
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Matplotlib
|
| 8 |
+
|
| 9 |
+
## Overview
|
| 10 |
+
|
| 11 |
+
Matplotlib is Python's foundational visualization library for creating static, animated, and interactive plots. This skill provides guidance on using matplotlib effectively, covering both the pyplot interface (MATLAB-style) and the object-oriented API (Figure/Axes), along with best practices for creating publication-quality visualizations.
|
| 12 |
+
|
| 13 |
+
## When to Use This Skill
|
| 14 |
+
|
| 15 |
+
This skill should be used when:
|
| 16 |
+
- Creating any type of plot or chart (line, scatter, bar, histogram, heatmap, contour, etc.)
|
| 17 |
+
- Generating scientific or statistical visualizations
|
| 18 |
+
- Customizing plot appearance (colors, styles, labels, legends)
|
| 19 |
+
- Creating multi-panel figures with subplots
|
| 20 |
+
- Exporting visualizations to various formats (PNG, PDF, SVG, etc.)
|
| 21 |
+
- Building interactive plots or animations
|
| 22 |
+
- Working with 3D visualizations
|
| 23 |
+
- Integrating plots into Jupyter notebooks or GUI applications
|
| 24 |
+
|
| 25 |
+
## Core Concepts
|
| 26 |
+
|
| 27 |
+
### The Matplotlib Hierarchy
|
| 28 |
+
|
| 29 |
+
Matplotlib uses a hierarchical structure of objects:
|
| 30 |
+
|
| 31 |
+
1. **Figure** - The top-level container for all plot elements
|
| 32 |
+
2. **Axes** - The actual plotting area where data is displayed (one Figure can contain multiple Axes)
|
| 33 |
+
3. **Artist** - Everything visible on the figure (lines, text, ticks, etc.)
|
| 34 |
+
4. **Axis** - The number line objects (x-axis, y-axis) that handle ticks and labels
|
| 35 |
+
|
| 36 |
+
### Two Interfaces
|
| 37 |
+
|
| 38 |
+
**1. pyplot Interface (Implicit, MATLAB-style)**
|
| 39 |
+
```python
|
| 40 |
+
import matplotlib.pyplot as plt
|
| 41 |
+
|
| 42 |
+
plt.plot([1, 2, 3, 4])
|
| 43 |
+
plt.ylabel('some numbers')
|
| 44 |
+
plt.show()
|
| 45 |
+
```
|
| 46 |
+
- Convenient for quick, simple plots
|
| 47 |
+
- Maintains state automatically
|
| 48 |
+
- Good for interactive work and simple scripts
|
| 49 |
+
|
| 50 |
+
**2. Object-Oriented Interface (Explicit)**
|
| 51 |
+
```python
|
| 52 |
+
import matplotlib.pyplot as plt
|
| 53 |
+
|
| 54 |
+
fig, ax = plt.subplots()
|
| 55 |
+
ax.plot([1, 2, 3, 4])
|
| 56 |
+
ax.set_ylabel('some numbers')
|
| 57 |
+
plt.show()
|
| 58 |
+
```
|
| 59 |
+
- **Recommended for most use cases**
|
| 60 |
+
- More explicit control over figure and axes
|
| 61 |
+
- Better for complex figures with multiple subplots
|
| 62 |
+
- Easier to maintain and debug
|
| 63 |
+
|
| 64 |
+
## Common Workflows
|
| 65 |
+
|
| 66 |
+
### 1. Basic Plot Creation
|
| 67 |
+
|
| 68 |
+
**Single plot workflow:**
|
| 69 |
+
```python
|
| 70 |
+
import matplotlib.pyplot as plt
|
| 71 |
+
import numpy as np
|
| 72 |
+
|
| 73 |
+
# Create figure and axes (OO interface - RECOMMENDED)
|
| 74 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 75 |
+
|
| 76 |
+
# Generate and plot data
|
| 77 |
+
x = np.linspace(0, 2*np.pi, 100)
|
| 78 |
+
ax.plot(x, np.sin(x), label='sin(x)')
|
| 79 |
+
ax.plot(x, np.cos(x), label='cos(x)')
|
| 80 |
+
|
| 81 |
+
# Customize
|
| 82 |
+
ax.set_xlabel('x')
|
| 83 |
+
ax.set_ylabel('y')
|
| 84 |
+
ax.set_title('Trigonometric Functions')
|
| 85 |
+
ax.legend()
|
| 86 |
+
ax.grid(True, alpha=0.3)
|
| 87 |
+
|
| 88 |
+
# Save and/or display
|
| 89 |
+
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
|
| 90 |
+
plt.show()
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### 2. Multiple Subplots
|
| 94 |
+
|
| 95 |
+
**Creating subplot layouts:**
|
| 96 |
+
```python
|
| 97 |
+
# Method 1: Regular grid
|
| 98 |
+
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
|
| 99 |
+
axes[0, 0].plot(x, y1)
|
| 100 |
+
axes[0, 1].scatter(x, y2)
|
| 101 |
+
axes[1, 0].bar(categories, values)
|
| 102 |
+
axes[1, 1].hist(data, bins=30)
|
| 103 |
+
|
| 104 |
+
# Method 2: Mosaic layout (more flexible)
|
| 105 |
+
fig, axes = plt.subplot_mosaic([['left', 'right_top'],
|
| 106 |
+
['left', 'right_bottom']],
|
| 107 |
+
figsize=(10, 8))
|
| 108 |
+
axes['left'].plot(x, y)
|
| 109 |
+
axes['right_top'].scatter(x, y)
|
| 110 |
+
axes['right_bottom'].hist(data)
|
| 111 |
+
|
| 112 |
+
# Method 3: GridSpec (maximum control)
|
| 113 |
+
from matplotlib.gridspec import GridSpec
|
| 114 |
+
fig = plt.figure(figsize=(12, 8))
|
| 115 |
+
gs = GridSpec(3, 3, figure=fig)
|
| 116 |
+
ax1 = fig.add_subplot(gs[0, :]) # Top row, all columns
|
| 117 |
+
ax2 = fig.add_subplot(gs[1:, 0]) # Bottom two rows, first column
|
| 118 |
+
ax3 = fig.add_subplot(gs[1:, 1:]) # Bottom two rows, last two columns
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### 3. Plot Types and Use Cases
|
| 122 |
+
|
| 123 |
+
**Line plots** - Time series, continuous data, trends
|
| 124 |
+
```python
|
| 125 |
+
ax.plot(x, y, linewidth=2, linestyle='--', marker='o', color='blue')
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
**Scatter plots** - Relationships between variables, correlations
|
| 129 |
+
```python
|
| 130 |
+
ax.scatter(x, y, s=sizes, c=colors, alpha=0.6, cmap='viridis')
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
**Bar charts** - Categorical comparisons
|
| 134 |
+
```python
|
| 135 |
+
ax.bar(categories, values, color='steelblue', edgecolor='black')
|
| 136 |
+
# For horizontal bars:
|
| 137 |
+
ax.barh(categories, values)
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
**Histograms** - Distributions
|
| 141 |
+
```python
|
| 142 |
+
ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
**Heatmaps** - Matrix data, correlations
|
| 146 |
+
```python
|
| 147 |
+
im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
|
| 148 |
+
plt.colorbar(im, ax=ax)
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
**Contour plots** - 3D data on 2D plane
|
| 152 |
+
```python
|
| 153 |
+
contour = ax.contour(X, Y, Z, levels=10)
|
| 154 |
+
ax.clabel(contour, inline=True, fontsize=8)
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
**Box plots** - Statistical distributions
|
| 158 |
+
```python
|
| 159 |
+
ax.boxplot([data1, data2, data3], labels=['A', 'B', 'C'])
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
**Violin plots** - Distribution densities
|
| 163 |
+
```python
|
| 164 |
+
ax.violinplot([data1, data2, data3], positions=[1, 2, 3])
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
For comprehensive plot type examples and variations, refer to `references/plot_types.md`.
|
| 168 |
+
|
| 169 |
+
### 4. Styling and Customization
|
| 170 |
+
|
| 171 |
+
**Color specification methods:**
|
| 172 |
+
- Named colors: `'red'`, `'blue'`, `'steelblue'`
|
| 173 |
+
- Hex codes: `'#FF5733'`
|
| 174 |
+
- RGB tuples: `(0.1, 0.2, 0.3)`
|
| 175 |
+
- Colormaps: `cmap='viridis'`, `cmap='plasma'`, `cmap='coolwarm'`
|
| 176 |
+
|
| 177 |
+
**Using style sheets:**
|
| 178 |
+
```python
|
| 179 |
+
plt.style.use('seaborn-v0_8-darkgrid') # Apply predefined style
|
| 180 |
+
# Available styles: 'ggplot', 'bmh', 'fivethirtyeight', etc.
|
| 181 |
+
print(plt.style.available) # List all available styles
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
**Customizing with rcParams:**
|
| 185 |
+
```python
|
| 186 |
+
plt.rcParams['font.size'] = 12
|
| 187 |
+
plt.rcParams['axes.labelsize'] = 14
|
| 188 |
+
plt.rcParams['axes.titlesize'] = 16
|
| 189 |
+
plt.rcParams['xtick.labelsize'] = 10
|
| 190 |
+
plt.rcParams['ytick.labelsize'] = 10
|
| 191 |
+
plt.rcParams['legend.fontsize'] = 12
|
| 192 |
+
plt.rcParams['figure.titlesize'] = 18
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
**Text and annotations:**
|
| 196 |
+
```python
|
| 197 |
+
ax.text(x, y, 'annotation', fontsize=12, ha='center')
|
| 198 |
+
ax.annotate('important point', xy=(x, y), xytext=(x+1, y+1),
|
| 199 |
+
arrowprops=dict(arrowstyle='->', color='red'))
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
For detailed styling options and colormap guidelines, see `references/styling_guide.md`.
|
| 203 |
+
|
| 204 |
+
### 5. Saving Figures
|
| 205 |
+
|
| 206 |
+
**Export to various formats:**
|
| 207 |
+
```python
|
| 208 |
+
# High-resolution PNG for presentations/papers
|
| 209 |
+
plt.savefig('figure.png', dpi=300, bbox_inches='tight', facecolor='white')
|
| 210 |
+
|
| 211 |
+
# Vector format for publications (scalable)
|
| 212 |
+
plt.savefig('figure.pdf', bbox_inches='tight')
|
| 213 |
+
plt.savefig('figure.svg', bbox_inches='tight')
|
| 214 |
+
|
| 215 |
+
# Transparent background
|
| 216 |
+
plt.savefig('figure.png', dpi=300, bbox_inches='tight', transparent=True)
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
**Important parameters:**
|
| 220 |
+
- `dpi`: Resolution (300 for publications, 150 for web, 72 for screen)
|
| 221 |
+
- `bbox_inches='tight'`: Removes excess whitespace
|
| 222 |
+
- `facecolor='white'`: Forces a white figure background (useful when the active style or theme uses a dark or transparent background)
|
| 223 |
+
- `transparent=True`: Transparent background
|
| 224 |
+
|
| 225 |
+
### 6. Working with 3D Plots
|
| 226 |
+
|
| 227 |
+
```python
|
| 228 |
+
from mpl_toolkits.mplot3d import Axes3D
|
| 229 |
+
|
| 230 |
+
fig = plt.figure(figsize=(10, 8))
|
| 231 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 232 |
+
|
| 233 |
+
# Surface plot
|
| 234 |
+
ax.plot_surface(X, Y, Z, cmap='viridis')
|
| 235 |
+
|
| 236 |
+
# 3D scatter
|
| 237 |
+
ax.scatter(x, y, z, c=colors, marker='o')
|
| 238 |
+
|
| 239 |
+
# 3D line plot
|
| 240 |
+
ax.plot(x, y, z, linewidth=2)
|
| 241 |
+
|
| 242 |
+
# Labels
|
| 243 |
+
ax.set_xlabel('X Label')
|
| 244 |
+
ax.set_ylabel('Y Label')
|
| 245 |
+
ax.set_zlabel('Z Label')
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
## Best Practices
|
| 249 |
+
|
| 250 |
+
### 1. Interface Selection
|
| 251 |
+
- **Use the object-oriented interface** (fig, ax = plt.subplots()) for production code
|
| 252 |
+
- Reserve pyplot interface for quick interactive exploration only
|
| 253 |
+
- Always create figures explicitly rather than relying on implicit state
|
| 254 |
+
|
| 255 |
+
### 2. Figure Size and DPI
|
| 256 |
+
- Set figsize at creation: `fig, ax = plt.subplots(figsize=(10, 6))`
|
| 257 |
+
- Use appropriate DPI for output medium:
|
| 258 |
+
- Screen/notebook: 72-100 dpi
|
| 259 |
+
- Web: 150 dpi
|
| 260 |
+
- Print/publications: 300 dpi
|
| 261 |
+
|
| 262 |
+
### 3. Layout Management
|
| 263 |
+
- Use `constrained_layout=True` or `tight_layout()` to prevent overlapping elements
|
| 264 |
+
- `fig, ax = plt.subplots(constrained_layout=True)` is recommended for automatic spacing
|
| 265 |
+
|
| 266 |
+
### 4. Colormap Selection
|
| 267 |
+
- **Sequential** (viridis, plasma, inferno): Ordered data with consistent progression
|
| 268 |
+
- **Diverging** (coolwarm, RdBu): Data with meaningful center point (e.g., zero)
|
| 269 |
+
- **Qualitative** (tab10, Set3): Categorical/nominal data
|
| 270 |
+
- Avoid rainbow colormaps (jet) - they are not perceptually uniform
|
| 271 |
+
|
| 272 |
+
### 5. Accessibility
|
| 273 |
+
- Use colorblind-friendly colormaps (viridis, cividis)
|
| 274 |
+
- Add patterns/hatching for bar charts in addition to colors
|
| 275 |
+
- Ensure sufficient contrast between elements
|
| 276 |
+
- Include descriptive labels and legends
|
| 277 |
+
|
| 278 |
+
### 6. Performance
|
| 279 |
+
- For large datasets, use `rasterized=True` in plot calls to reduce file size
|
| 280 |
+
- Use appropriate data reduction before plotting (e.g., downsample dense time series)
|
| 281 |
+
- For animations, use blitting for better performance
|
| 282 |
+
|
| 283 |
+
### 7. Code Organization
|
| 284 |
+
```python
|
| 285 |
+
# Good practice: Clear structure
|
| 286 |
+
def create_analysis_plot(data, title):
|
| 287 |
+
"""Create standardized analysis plot."""
|
| 288 |
+
fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
|
| 289 |
+
|
| 290 |
+
# Plot data
|
| 291 |
+
ax.plot(data['x'], data['y'], linewidth=2)
|
| 292 |
+
|
| 293 |
+
# Customize
|
| 294 |
+
ax.set_xlabel('X Axis Label', fontsize=12)
|
| 295 |
+
ax.set_ylabel('Y Axis Label', fontsize=12)
|
| 296 |
+
ax.set_title(title, fontsize=14, fontweight='bold')
|
| 297 |
+
ax.grid(True, alpha=0.3)
|
| 298 |
+
|
| 299 |
+
return fig, ax
|
| 300 |
+
|
| 301 |
+
# Use the function
|
| 302 |
+
fig, ax = create_analysis_plot(my_data, 'My Analysis')
|
| 303 |
+
plt.savefig('analysis.png', dpi=300, bbox_inches='tight')
|
| 304 |
+
```
|
| 305 |
+
|
| 306 |
+
## Quick Reference Scripts
|
| 307 |
+
|
| 308 |
+
This skill includes helper scripts in the `scripts/` directory:
|
| 309 |
+
|
| 310 |
+
### `plot_template.py`
|
| 311 |
+
Template script demonstrating various plot types with best practices. Use this as a starting point for creating new visualizations.
|
| 312 |
+
|
| 313 |
+
**Usage:**
|
| 314 |
+
```bash
|
| 315 |
+
python scripts/plot_template.py
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
### `style_configurator.py`
|
| 319 |
+
Interactive utility to configure matplotlib style preferences and generate custom style sheets.
|
| 320 |
+
|
| 321 |
+
**Usage:**
|
| 322 |
+
```bash
|
| 323 |
+
python scripts/style_configurator.py
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
## Detailed References
|
| 327 |
+
|
| 328 |
+
For comprehensive information, consult the reference documents:
|
| 329 |
+
|
| 330 |
+
- **`references/plot_types.md`** - Complete catalog of plot types with code examples and use cases
|
| 331 |
+
- **`references/styling_guide.md`** - Detailed styling options, colormaps, and customization
|
| 332 |
+
- **`references/api_reference.md`** - Core classes and methods reference
|
| 333 |
+
- **`references/common_issues.md`** - Troubleshooting guide for common problems
|
| 334 |
+
|
| 335 |
+
## Integration with Other Tools
|
| 336 |
+
|
| 337 |
+
Matplotlib integrates well with:
|
| 338 |
+
- **NumPy/Pandas** - Direct plotting from arrays and DataFrames
|
| 339 |
+
- **Seaborn** - High-level statistical visualizations built on matplotlib
|
| 340 |
+
- **Jupyter** - Interactive plotting with `%matplotlib inline` or `%matplotlib widget`
|
| 341 |
+
- **GUI frameworks** - Embedding in Tkinter, Qt, wxPython applications
|
| 342 |
+
|
| 343 |
+
## Common Gotchas
|
| 344 |
+
|
| 345 |
+
1. **Overlapping elements**: Use `constrained_layout=True` or `tight_layout()`
|
| 346 |
+
2. **State confusion**: Use OO interface to avoid pyplot state machine issues
|
| 347 |
+
3. **Memory issues with many figures**: Close figures explicitly with `plt.close(fig)`
|
| 348 |
+
4. **Font warnings**: Install the missing font, or set `plt.rcParams['font.sans-serif']` to a list of fonts that are actually installed so matplotlib no longer falls back and warns
|
| 349 |
+
5. **DPI confusion**: Remember that figsize is in inches, not pixels: `pixels = dpi * inches`
|
| 350 |
+
|
| 351 |
+
## Additional Resources
|
| 352 |
+
|
| 353 |
+
- Official documentation: https://matplotlib.org/
|
| 354 |
+
- Gallery: https://matplotlib.org/stable/gallery/index.html
|
| 355 |
+
- Cheatsheets: https://matplotlib.org/cheatsheets/
|
| 356 |
+
- Tutorials: https://matplotlib.org/stable/tutorials/index.html
|
.scider/skills/matplotlib/references/api_reference.md
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Matplotlib API Reference
|
| 2 |
+
|
| 3 |
+
This document provides a quick reference for the most commonly used matplotlib classes and methods.
|
| 4 |
+
|
| 5 |
+
## Core Classes
|
| 6 |
+
|
| 7 |
+
### Figure
|
| 8 |
+
|
| 9 |
+
The top-level container for all plot elements.
|
| 10 |
+
|
| 11 |
+
**Creation:**
|
| 12 |
+
```python
|
| 13 |
+
fig = plt.figure(figsize=(10, 6), dpi=100, facecolor='white')
|
| 14 |
+
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
|
| 15 |
+
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
**Key Methods:**
|
| 19 |
+
- `fig.add_subplot(nrows, ncols, index)` - Add a subplot
|
| 20 |
+
- `fig.add_axes([left, bottom, width, height])` - Add axes at specific position
|
| 21 |
+
- `fig.savefig(filename, dpi=300, bbox_inches='tight')` - Save figure
|
| 22 |
+
- `fig.tight_layout()` - Adjust spacing to prevent overlaps
|
| 23 |
+
- `fig.suptitle(title)` - Set figure title
|
| 24 |
+
- `fig.legend()` - Create figure-level legend
|
| 25 |
+
- `fig.colorbar(mappable)` - Add colorbar to figure
|
| 26 |
+
- `plt.close(fig)` - Close figure to free memory
|
| 27 |
+
|
| 28 |
+
**Key Attributes:**
|
| 29 |
+
- `fig.axes` - List of all axes in the figure
|
| 30 |
+
- `fig.dpi` - Resolution in dots per inch
|
| 31 |
+
- `fig.get_size_inches()` - Figure dimensions in inches (width, height); `figsize` is only a constructor argument, not an attribute
|
| 32 |
+
|
| 33 |
+
### Axes
|
| 34 |
+
|
| 35 |
+
The actual plotting area where data is visualized.
|
| 36 |
+
|
| 37 |
+
**Creation:**
|
| 38 |
+
```python
|
| 39 |
+
fig, ax = plt.subplots() # Single axes
|
| 40 |
+
ax = fig.add_subplot(111) # Alternative method
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
**Plotting Methods:**
|
| 44 |
+
|
| 45 |
+
**Line plots:**
|
| 46 |
+
- `ax.plot(x, y, **kwargs)` - Line plot
|
| 47 |
+
- `ax.step(x, y, where='pre'/'mid'/'post')` - Step plot
|
| 48 |
+
- `ax.errorbar(x, y, yerr, xerr)` - Error bars
|
| 49 |
+
|
| 50 |
+
**Scatter plots:**
|
| 51 |
+
- `ax.scatter(x, y, s=size, c=color, marker='o', alpha=0.5)` - Scatter plot
|
| 52 |
+
|
| 53 |
+
**Bar charts:**
|
| 54 |
+
- `ax.bar(x, height, width=0.8, align='center')` - Vertical bar chart
|
| 55 |
+
- `ax.barh(y, width)` - Horizontal bar chart
|
| 56 |
+
|
| 57 |
+
**Statistical plots:**
|
| 58 |
+
- `ax.hist(data, bins=10, density=False)` - Histogram
|
| 59 |
+
- `ax.boxplot(data, labels=None)` - Box plot
|
| 60 |
+
- `ax.violinplot(data)` - Violin plot
|
| 61 |
+
|
| 62 |
+
**2D plots:**
|
| 63 |
+
- `ax.imshow(array, cmap='viridis', aspect='auto')` - Display image/matrix
|
| 64 |
+
- `ax.contour(X, Y, Z, levels=10)` - Contour lines
|
| 65 |
+
- `ax.contourf(X, Y, Z, levels=10)` - Filled contours
|
| 66 |
+
- `ax.pcolormesh(X, Y, Z)` - Pseudocolor plot
|
| 67 |
+
|
| 68 |
+
**Filling:**
|
| 69 |
+
- `ax.fill_between(x, y1, y2, alpha=0.3)` - Fill between curves
|
| 70 |
+
- `ax.fill_betweenx(y, x1, x2)` - Fill horizontally between two curves x1(y) and x2(y)
|
| 71 |
+
|
| 72 |
+
**Text and annotations:**
|
| 73 |
+
- `ax.text(x, y, text, fontsize=12)` - Add text
|
| 74 |
+
- `ax.annotate(text, xy=(x, y), xytext=(x2, y2), arrowprops={})` - Annotate with arrow
|
| 75 |
+
|
| 76 |
+
**Customization Methods:**
|
| 77 |
+
|
| 78 |
+
**Labels and titles:**
|
| 79 |
+
- `ax.set_xlabel(label, fontsize=12)` - Set x-axis label
|
| 80 |
+
- `ax.set_ylabel(label, fontsize=12)` - Set y-axis label
|
| 81 |
+
- `ax.set_title(title, fontsize=14)` - Set axes title
|
| 82 |
+
|
| 83 |
+
**Limits and scales:**
|
| 84 |
+
- `ax.set_xlim(left, right)` - Set x-axis limits
|
| 85 |
+
- `ax.set_ylim(bottom, top)` - Set y-axis limits
|
| 86 |
+
- `ax.set_xscale('linear'/'log'/'symlog')` - Set x-axis scale
|
| 87 |
+
- `ax.set_yscale('linear'/'log'/'symlog')` - Set y-axis scale
|
| 88 |
+
|
| 89 |
+
**Ticks:**
|
| 90 |
+
- `ax.set_xticks(positions)` - Set x-tick positions
|
| 91 |
+
- `ax.set_xticklabels(labels)` - Set x-tick labels
|
| 92 |
+
- `ax.tick_params(axis='both', labelsize=10)` - Customize tick appearance
|
| 93 |
+
|
| 94 |
+
**Grid and spines:**
|
| 95 |
+
- `ax.grid(True, alpha=0.3, linestyle='--')` - Add grid
|
| 96 |
+
- `ax.spines['top'].set_visible(False)` - Hide top spine
|
| 97 |
+
- `ax.spines['right'].set_visible(False)` - Hide right spine
|
| 98 |
+
|
| 99 |
+
**Legend:**
|
| 100 |
+
- `ax.legend(loc='best', fontsize=10, frameon=True)` - Add legend
|
| 101 |
+
- `ax.legend(handles, labels)` - Custom legend
|
| 102 |
+
|
| 103 |
+
**Aspect and layout:**
|
| 104 |
+
- `ax.set_aspect('equal'/'auto'/ratio)` - Set aspect ratio
|
| 105 |
+
- `ax.invert_xaxis()` - Invert x-axis
|
| 106 |
+
- `ax.invert_yaxis()` - Invert y-axis
|
| 107 |
+
|
| 108 |
+
### pyplot Module
|
| 109 |
+
|
| 110 |
+
High-level interface for quick plotting.
|
| 111 |
+
|
| 112 |
+
**Figure creation:**
|
| 113 |
+
- `plt.figure()` - Create new figure
|
| 114 |
+
- `plt.subplots()` - Create figure and axes
|
| 115 |
+
- `plt.subplot()` - Add subplot to current figure
|
| 116 |
+
|
| 117 |
+
**Plotting (uses current axes):**
|
| 118 |
+
- `plt.plot()` - Line plot
|
| 119 |
+
- `plt.scatter()` - Scatter plot
|
| 120 |
+
- `plt.bar()` - Bar chart
|
| 121 |
+
- `plt.hist()` - Histogram
|
| 122 |
+
- (All axes methods available)
|
| 123 |
+
|
| 124 |
+
**Display and save:**
|
| 125 |
+
- `plt.show()` - Display figure
|
| 126 |
+
- `plt.savefig()` - Save figure
|
| 127 |
+
- `plt.close()` - Close figure
|
| 128 |
+
|
| 129 |
+
**Style:**
|
| 130 |
+
- `plt.style.use(style_name)` - Apply style sheet
|
| 131 |
+
- `plt.style.available` - List available styles
|
| 132 |
+
|
| 133 |
+
**State management:**
|
| 134 |
+
- `plt.gca()` - Get current axes
|
| 135 |
+
- `plt.gcf()` - Get current figure
|
| 136 |
+
- `plt.sca(ax)` - Set current axes
|
| 137 |
+
- `plt.clf()` - Clear current figure
|
| 138 |
+
- `plt.cla()` - Clear current axes
|
| 139 |
+
|
| 140 |
+
## Line and Marker Styles
|
| 141 |
+
|
| 142 |
+
### Line Styles
|
| 143 |
+
- `'-'` or `'solid'` - Solid line
|
| 144 |
+
- `'--'` or `'dashed'` - Dashed line
|
| 145 |
+
- `'-.'` or `'dashdot'` - Dash-dot line
|
| 146 |
+
- `':'` or `'dotted'` - Dotted line
|
| 147 |
+
- `''` or `' '` or `'None'` - No line
|
| 148 |
+
|
| 149 |
+
### Marker Styles
|
| 150 |
+
- `'.'` - Point marker
|
| 151 |
+
- `'o'` - Circle marker
|
| 152 |
+
- `'v'`, `'^'`, `'<'`, `'>'` - Triangle markers
|
| 153 |
+
- `'s'` - Square marker
|
| 154 |
+
- `'p'` - Pentagon marker
|
| 155 |
+
- `'*'` - Star marker
|
| 156 |
+
- `'h'`, `'H'` - Hexagon markers
|
| 157 |
+
- `'+'` - Plus marker
|
| 158 |
+
- `'x'` - X marker
|
| 159 |
+
- `'D'`, `'d'` - Diamond markers
|
| 160 |
+
|
| 161 |
+
### Color Specifications
|
| 162 |
+
|
| 163 |
+
**Single character shortcuts:**
|
| 164 |
+
- `'b'` - Blue
|
| 165 |
+
- `'g'` - Green
|
| 166 |
+
- `'r'` - Red
|
| 167 |
+
- `'c'` - Cyan
|
| 168 |
+
- `'m'` - Magenta
|
| 169 |
+
- `'y'` - Yellow
|
| 170 |
+
- `'k'` - Black
|
| 171 |
+
- `'w'` - White
|
| 172 |
+
|
| 173 |
+
**Named colors:**
|
| 174 |
+
- `'steelblue'`, `'coral'`, `'teal'`, etc.
|
| 175 |
+
- See full list: https://matplotlib.org/stable/gallery/color/named_colors.html
|
| 176 |
+
|
| 177 |
+
**Other formats:**
|
| 178 |
+
- Hex: `'#FF5733'`
|
| 179 |
+
- RGB tuple: `(0.1, 0.2, 0.3)`
|
| 180 |
+
- RGBA tuple: `(0.1, 0.2, 0.3, 0.5)`
|
| 181 |
+
|
| 182 |
+
## Common Parameters
|
| 183 |
+
|
| 184 |
+
### Plot Function Parameters
|
| 185 |
+
|
| 186 |
+
```python
|
| 187 |
+
ax.plot(x, y,
|
| 188 |
+
color='blue', # Line color
|
| 189 |
+
linewidth=2, # Line width
|
| 190 |
+
linestyle='--', # Line style
|
| 191 |
+
marker='o', # Marker style
|
| 192 |
+
markersize=8, # Marker size
|
| 193 |
+
markerfacecolor='red', # Marker fill color
|
| 194 |
+
markeredgecolor='black',# Marker edge color
|
| 195 |
+
markeredgewidth=1, # Marker edge width
|
| 196 |
+
alpha=0.7, # Transparency (0-1)
|
| 197 |
+
label='data', # Legend label
|
| 198 |
+
zorder=2, # Drawing order
|
| 199 |
+
rasterized=True # Rasterize for smaller file size
|
| 200 |
+
)
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
### Scatter Function Parameters
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
ax.scatter(x, y,
|
| 207 |
+
s=50, # Size (scalar or array)
|
| 208 |
+
c='blue', # Color (scalar, array, or sequence)
|
| 209 |
+
marker='o', # Marker style
|
| 210 |
+
cmap='viridis', # Colormap (if c is numeric)
|
| 211 |
+
alpha=0.5, # Transparency
|
| 212 |
+
edgecolors='black', # Edge color
|
| 213 |
+
linewidths=1, # Edge width
|
| 214 |
+
vmin=0, vmax=1, # Color scale limits
|
| 215 |
+
label='data' # Legend label
|
| 216 |
+
)
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
### Text Parameters
|
| 220 |
+
|
| 221 |
+
```python
|
| 222 |
+
ax.text(x, y, text,
|
| 223 |
+
fontsize=12, # Font size
|
| 224 |
+
fontweight='normal', # 'normal', 'bold', 'heavy', 'light'
|
| 225 |
+
fontstyle='normal', # 'normal', 'italic', 'oblique'
|
| 226 |
+
fontfamily='sans-serif',# Font family
|
| 227 |
+
color='black', # Text color
|
| 228 |
+
alpha=1.0, # Transparency
|
| 229 |
+
ha='center', # Horizontal alignment: 'left', 'center', 'right'
|
| 230 |
+
va='center', # Vertical alignment: 'top', 'center', 'bottom', 'baseline'
|
| 231 |
+
rotation=0, # Rotation angle in degrees
|
| 232 |
+
bbox=dict( # Background box
|
| 233 |
+
facecolor='white',
|
| 234 |
+
edgecolor='black',
|
| 235 |
+
boxstyle='round'
|
| 236 |
+
)
|
| 237 |
+
)
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
## rcParams Configuration
|
| 241 |
+
|
| 242 |
+
Common rcParams settings for global customization:
|
| 243 |
+
|
| 244 |
+
```python
|
| 245 |
+
# Font settings
|
| 246 |
+
plt.rcParams['font.family'] = 'sans-serif'
|
| 247 |
+
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
|
| 248 |
+
plt.rcParams['font.size'] = 12
|
| 249 |
+
|
| 250 |
+
# Figure settings
|
| 251 |
+
plt.rcParams['figure.figsize'] = (10, 6)
|
| 252 |
+
plt.rcParams['figure.dpi'] = 100
|
| 253 |
+
plt.rcParams['figure.facecolor'] = 'white'
|
| 254 |
+
plt.rcParams['savefig.dpi'] = 300
|
| 255 |
+
plt.rcParams['savefig.bbox'] = 'tight'
|
| 256 |
+
|
| 257 |
+
# Axes settings
|
| 258 |
+
plt.rcParams['axes.labelsize'] = 14
|
| 259 |
+
plt.rcParams['axes.titlesize'] = 16
|
| 260 |
+
plt.rcParams['axes.grid'] = True
|
| 261 |
+
plt.rcParams['axes.grid.which'] = 'major'  # 'major', 'minor', 'both' (grid transparency is 'grid.alpha', under Grid settings)
|
| 262 |
+
|
| 263 |
+
# Line settings
|
| 264 |
+
plt.rcParams['lines.linewidth'] = 2
|
| 265 |
+
plt.rcParams['lines.markersize'] = 8
|
| 266 |
+
|
| 267 |
+
# Tick settings
|
| 268 |
+
plt.rcParams['xtick.labelsize'] = 10
|
| 269 |
+
plt.rcParams['ytick.labelsize'] = 10
|
| 270 |
+
plt.rcParams['xtick.direction'] = 'in' # 'in', 'out', 'inout'
|
| 271 |
+
plt.rcParams['ytick.direction'] = 'in'
|
| 272 |
+
|
| 273 |
+
# Legend settings
|
| 274 |
+
plt.rcParams['legend.fontsize'] = 12
|
| 275 |
+
plt.rcParams['legend.frameon'] = True
|
| 276 |
+
plt.rcParams['legend.framealpha'] = 0.8
|
| 277 |
+
|
| 278 |
+
# Grid settings
|
| 279 |
+
plt.rcParams['grid.alpha'] = 0.3
|
| 280 |
+
plt.rcParams['grid.linestyle'] = '--'
|
| 281 |
+
```
|
| 282 |
+
|
| 283 |
+
## GridSpec for Complex Layouts
|
| 284 |
+
|
| 285 |
+
```python
|
| 286 |
+
from matplotlib.gridspec import GridSpec
|
| 287 |
+
|
| 288 |
+
fig = plt.figure(figsize=(12, 8))
|
| 289 |
+
gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)
|
| 290 |
+
|
| 291 |
+
# Span multiple cells
|
| 292 |
+
ax1 = fig.add_subplot(gs[0, :]) # Top row, all columns
|
| 293 |
+
ax2 = fig.add_subplot(gs[1:, 0]) # Bottom two rows, first column
|
| 294 |
+
ax3 = fig.add_subplot(gs[1, 1:]) # Middle row, last two columns
|
| 295 |
+
ax4 = fig.add_subplot(gs[2, 1]) # Bottom row, middle column
|
| 296 |
+
ax5 = fig.add_subplot(gs[2, 2]) # Bottom row, right column
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
## 3D Plotting
|
| 300 |
+
|
| 301 |
+
```python
|
| 302 |
+
from mpl_toolkits.mplot3d import Axes3D
|
| 303 |
+
|
| 304 |
+
fig = plt.figure()
|
| 305 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 306 |
+
|
| 307 |
+
# Plot types
|
| 308 |
+
ax.plot(x, y, z) # 3D line
|
| 309 |
+
ax.scatter(x, y, z) # 3D scatter
|
| 310 |
+
ax.plot_surface(X, Y, Z) # 3D surface
|
| 311 |
+
ax.plot_wireframe(X, Y, Z) # 3D wireframe
|
| 312 |
+
ax.contour(X, Y, Z) # 3D contour
|
| 313 |
+
ax.bar3d(x, y, z, dx, dy, dz) # 3D bar
|
| 314 |
+
|
| 315 |
+
# Customization
|
| 316 |
+
ax.set_xlabel('X')
|
| 317 |
+
ax.set_ylabel('Y')
|
| 318 |
+
ax.set_zlabel('Z')
|
| 319 |
+
ax.view_init(elev=30, azim=45) # Set viewing angle
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
## Animation
|
| 323 |
+
|
| 324 |
+
```python
|
| 325 |
+
from matplotlib.animation import FuncAnimation
|
| 326 |
+
|
| 327 |
+
fig, ax = plt.subplots()
|
| 328 |
+
line, = ax.plot([], [])
|
| 329 |
+
|
| 330 |
+
def init():
|
| 331 |
+
ax.set_xlim(0, 2*np.pi)
|
| 332 |
+
ax.set_ylim(-1, 1)
|
| 333 |
+
return line,
|
| 334 |
+
|
| 335 |
+
def update(frame):
|
| 336 |
+
x = np.linspace(0, 2*np.pi, 100)
|
| 337 |
+
y = np.sin(x + frame/10)
|
| 338 |
+
line.set_data(x, y)
|
| 339 |
+
return line,
|
| 340 |
+
|
| 341 |
+
anim = FuncAnimation(fig, update, init_func=init,
|
| 342 |
+
frames=100, interval=50, blit=True)
|
| 343 |
+
|
| 344 |
+
# Save animation
|
| 345 |
+
anim.save('animation.gif', writer='pillow', fps=20)
|
| 346 |
+
anim.save('animation.mp4', writer='ffmpeg', fps=20)
|
| 347 |
+
```
|
| 348 |
+
|
| 349 |
+
## Image Operations
|
| 350 |
+
|
| 351 |
+
```python
|
| 352 |
+
# Read and display image
|
| 353 |
+
img = plt.imread('image.png')
|
| 354 |
+
ax.imshow(img)
|
| 355 |
+
|
| 356 |
+
# Display matrix as image
|
| 357 |
+
im = ax.imshow(matrix, cmap='viridis', aspect='auto',
|
| 358 |
+
interpolation='nearest', origin='lower')
|
| 359 |
+
|
| 360 |
+
# Colorbar
|
| 361 |
+
cbar = plt.colorbar(im, ax=ax)
|
| 362 |
+
cbar.set_label('Values')
|
| 363 |
+
|
| 364 |
+
# Image extent (set coordinates)
|
| 365 |
+
ax.imshow(img, extent=[x_min, x_max, y_min, y_max])
|
| 366 |
+
```
|
| 367 |
+
|
| 368 |
+
## Event Handling
|
| 369 |
+
|
| 370 |
+
```python
|
| 371 |
+
# Mouse click event
|
| 372 |
+
def on_click(event):
|
| 373 |
+
if event.inaxes:
|
| 374 |
+
print(f'Clicked at x={event.xdata:.2f}, y={event.ydata:.2f}')
|
| 375 |
+
|
| 376 |
+
fig.canvas.mpl_connect('button_press_event', on_click)
|
| 377 |
+
|
| 378 |
+
# Key press event
|
| 379 |
+
def on_key(event):
|
| 380 |
+
print(f'Key pressed: {event.key}')
|
| 381 |
+
|
| 382 |
+
fig.canvas.mpl_connect('key_press_event', on_key)
|
| 383 |
+
```
|
| 384 |
+
|
| 385 |
+
## Useful Utilities
|
| 386 |
+
|
| 387 |
+
```python
|
| 388 |
+
# Get current axis limits
|
| 389 |
+
xlims = ax.get_xlim()
|
| 390 |
+
ylims = ax.get_ylim()
|
| 391 |
+
|
| 392 |
+
# Set equal aspect ratio
|
| 393 |
+
ax.set_aspect('equal', adjustable='box')
|
| 394 |
+
|
| 395 |
+
# Share axes between subplots
|
| 396 |
+
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
|
| 397 |
+
|
| 398 |
+
# Twin axes (two y-axes)
|
| 399 |
+
ax2 = ax1.twinx()
|
| 400 |
+
|
| 401 |
+
# Remove tick labels
|
| 402 |
+
ax.set_xticklabels([])
|
| 403 |
+
ax.set_yticklabels([])
|
| 404 |
+
|
| 405 |
+
# Scientific notation
|
| 406 |
+
ax.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
|
| 407 |
+
|
| 408 |
+
# Date formatting
|
| 409 |
+
import matplotlib.dates as mdates
|
| 410 |
+
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
|
| 411 |
+
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
|
| 412 |
+
```
|
.scider/skills/matplotlib/references/common_issues.md
ADDED
|
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Matplotlib Common Issues and Solutions
|
| 2 |
+
|
| 3 |
+
Troubleshooting guide for frequently encountered matplotlib problems.
|
| 4 |
+
|
| 5 |
+
## Display and Backend Issues
|
| 6 |
+
|
| 7 |
+
### Issue: Plots Not Showing
|
| 8 |
+
|
| 9 |
+
**Problem:** `plt.show()` doesn't display anything
|
| 10 |
+
|
| 11 |
+
**Solutions:**
|
| 12 |
+
```python
|
| 13 |
+
# 1. Check if backend is properly set (for interactive use)
|
| 14 |
+
import matplotlib
|
| 15 |
+
print(matplotlib.get_backend())
|
| 16 |
+
|
| 17 |
+
# 2. Try different backends
|
| 18 |
+
matplotlib.use('TkAgg') # or 'Qt5Agg', 'MacOSX'
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
|
| 21 |
+
# 3. In Jupyter notebooks, use magic command
|
| 22 |
+
# Static images (keep the comment on its own line: magics parse trailing text as arguments)
%matplotlib inline
|
| 23 |
+
# or
|
| 24 |
+
# Interactive plots (requires ipympl)
%matplotlib widget
|
| 25 |
+
|
| 26 |
+
# 4. Ensure plt.show() is called
|
| 27 |
+
plt.plot([1, 2, 3])
|
| 28 |
+
plt.show()
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Issue: "RuntimeError: main thread is not in main loop"
|
| 32 |
+
|
| 33 |
+
**Problem:** Interactive mode issues with threading
|
| 34 |
+
|
| 35 |
+
**Solution:**
|
| 36 |
+
```python
|
| 37 |
+
# Switch to non-interactive backend
|
| 38 |
+
import matplotlib
|
| 39 |
+
matplotlib.use('Agg')
|
| 40 |
+
import matplotlib.pyplot as plt
|
| 41 |
+
|
| 42 |
+
# Or turn off interactive mode
|
| 43 |
+
plt.ioff()
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Issue: Figures Not Updating Interactively
|
| 47 |
+
|
| 48 |
+
**Problem:** Changes not reflected in interactive windows
|
| 49 |
+
|
| 50 |
+
**Solution:**
|
| 51 |
+
```python
|
| 52 |
+
# Enable interactive mode
|
| 53 |
+
plt.ion()
|
| 54 |
+
|
| 55 |
+
# Draw after each change
|
| 56 |
+
plt.plot(x, y)
|
| 57 |
+
plt.draw()
|
| 58 |
+
plt.pause(0.001) # Brief pause to update display
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Layout and Spacing Issues
|
| 62 |
+
|
| 63 |
+
### Issue: Overlapping Labels and Titles
|
| 64 |
+
|
| 65 |
+
**Problem:** Labels, titles, or tick labels overlap or get cut off
|
| 66 |
+
|
| 67 |
+
**Solutions:**
|
| 68 |
+
```python
|
| 69 |
+
# Solution 1: Constrained layout (RECOMMENDED)
|
| 70 |
+
fig, ax = plt.subplots(constrained_layout=True)
|
| 71 |
+
|
| 72 |
+
# Solution 2: Tight layout
|
| 73 |
+
fig, ax = plt.subplots()
|
| 74 |
+
plt.tight_layout()
|
| 75 |
+
|
| 76 |
+
# Solution 3: Adjust margins manually
|
| 77 |
+
plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.15)
|
| 78 |
+
|
| 79 |
+
# Solution 4: Save with bbox_inches='tight'
|
| 80 |
+
plt.savefig('figure.png', bbox_inches='tight')
|
| 81 |
+
|
| 82 |
+
# Solution 5: Rotate long tick labels
|
| 83 |
+
ax.set_xticklabels(labels, rotation=45, ha='right')
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### Issue: Colorbar Affects Subplot Size
|
| 87 |
+
|
| 88 |
+
**Problem:** Adding colorbar shrinks the plot
|
| 89 |
+
|
| 90 |
+
**Solution:**
|
| 91 |
+
```python
|
| 92 |
+
# Solution 1: Use constrained layout
|
| 93 |
+
fig, ax = plt.subplots(constrained_layout=True)
|
| 94 |
+
im = ax.imshow(data)
|
| 95 |
+
plt.colorbar(im, ax=ax)
|
| 96 |
+
|
| 97 |
+
# Solution 2: Manually specify colorbar dimensions
|
| 98 |
+
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
| 99 |
+
divider = make_axes_locatable(ax)
|
| 100 |
+
cax = divider.append_axes("right", size="5%", pad=0.05)
|
| 101 |
+
plt.colorbar(im, cax=cax)
|
| 102 |
+
|
| 103 |
+
# Solution 3: For multiple subplots, share colorbar
|
| 104 |
+
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
|
| 105 |
+
for ax in axes:
|
| 106 |
+
im = ax.imshow(data)
|
| 107 |
+
fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.95)
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
### Issue: Subplots Too Close Together
|
| 111 |
+
|
| 112 |
+
**Problem:** Multiple subplots overlapping
|
| 113 |
+
|
| 114 |
+
**Solution:**
|
| 115 |
+
```python
|
| 116 |
+
# Solution 1: Use constrained_layout
|
| 117 |
+
fig, axes = plt.subplots(2, 2, constrained_layout=True)
|
| 118 |
+
|
| 119 |
+
# Solution 2: Adjust spacing with subplots_adjust
|
| 120 |
+
fig, axes = plt.subplots(2, 2)
|
| 121 |
+
plt.subplots_adjust(hspace=0.4, wspace=0.4)
|
| 122 |
+
|
| 123 |
+
# Solution 3: Specify spacing in tight_layout
|
| 124 |
+
plt.tight_layout(h_pad=2.0, w_pad=2.0)
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
## Memory and Performance Issues
|
| 128 |
+
|
| 129 |
+
### Issue: Memory Leak with Multiple Figures
|
| 130 |
+
|
| 131 |
+
**Problem:** Memory usage grows when creating many figures
|
| 132 |
+
|
| 133 |
+
**Solution:**
|
| 134 |
+
```python
|
| 135 |
+
# Close figures explicitly
|
| 136 |
+
fig, ax = plt.subplots()
|
| 137 |
+
ax.plot(x, y)
|
| 138 |
+
plt.savefig('plot.png')
|
| 139 |
+
plt.close(fig) # or plt.close('all')
|
| 140 |
+
|
| 141 |
+
# Clear current figure without closing
|
| 142 |
+
plt.clf()
|
| 143 |
+
|
| 144 |
+
# Clear current axes
|
| 145 |
+
plt.cla()
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
### Issue: Large File Sizes
|
| 149 |
+
|
| 150 |
+
**Problem:** Saved figures are too large
|
| 151 |
+
|
| 152 |
+
**Solutions:**
|
| 153 |
+
```python
|
| 154 |
+
# Solution 1: Reduce DPI
|
| 155 |
+
plt.savefig('figure.png', dpi=150) # Instead of 300
|
| 156 |
+
|
| 157 |
+
# Solution 2: Use rasterization for complex plots
|
| 158 |
+
ax.plot(x, y, rasterized=True)
|
| 159 |
+
|
| 160 |
+
# Solution 3: Use vector format for simple plots
|
| 161 |
+
plt.savefig('figure.pdf') # or .svg
|
| 162 |
+
|
| 163 |
+
# Solution 4: Compress PNG
|
| 164 |
+
plt.savefig('figure.png', dpi=300, pil_kwargs={'optimize': True})
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
### Issue: Slow Plotting with Large Datasets
|
| 168 |
+
|
| 169 |
+
**Problem:** Plotting takes too long with many points
|
| 170 |
+
|
| 171 |
+
**Solutions:**
|
| 172 |
+
```python
|
| 173 |
+
# Solution 1: Downsample data
|
| 174 |
+
from scipy.signal import decimate
|
| 175 |
+
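y_downsampled = decimate(y, 10)  # Anti-aliasing low-pass filter, then keep every 10th sample
x_downsampled = x[::10]  # Downsample x the same way so the lengths match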
|
| 176 |
+
|
| 177 |
+
# Solution 2: Use rasterization
|
| 178 |
+
ax.plot(x, y, rasterized=True)
|
| 179 |
+
|
| 180 |
+
# Solution 3: Rasterize lines that are already on the axes
|
| 181 |
+
ax.plot(x, y)
|
| 182 |
+
for line in ax.get_lines():
|
| 183 |
+
line.set_rasterized(True)
|
| 184 |
+
|
| 185 |
+
# Solution 4: For scatter plots, consider hexbin or 2d histogram
|
| 186 |
+
ax.hexbin(x, y, gridsize=50, cmap='viridis')
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## Font and Text Issues
|
| 190 |
+
|
| 191 |
+
### Issue: Font Warnings
|
| 192 |
+
|
| 193 |
+
**Problem:** "findfont: Font family [...] not found"
|
| 194 |
+
|
| 195 |
+
**Solutions:**
|
| 196 |
+
```python
|
| 197 |
+
# Solution 1: Use available fonts
|
| 198 |
+
from matplotlib.font_manager import findfont, FontProperties
|
| 199 |
+
print(findfont(FontProperties(family='sans-serif')))
|
| 200 |
+
|
| 201 |
+
# Solution 2: Rebuild font cache
|
| 202 |
+
import matplotlib.font_manager
|
| 203 |
+
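matplotlib.font_manager._load_fontmanager(try_read_cache=False)  # private API; _rebuild() was removed in recent releases (alternatively delete the cache in matplotlib.get_cachedir())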
|
| 204 |
+
|
| 205 |
+
# Solution 3: Suppress warnings
|
| 206 |
+
import warnings
|
| 207 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
| 208 |
+
|
| 209 |
+
# Solution 4: Specify fallback fonts
|
| 210 |
+
plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans', 'sans-serif']
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
### Issue: LaTeX Rendering Errors
|
| 214 |
+
|
| 215 |
+
**Problem:** Math text not rendering correctly
|
| 216 |
+
|
| 217 |
+
**Solutions:**
|
| 218 |
+
```python
|
| 219 |
+
# Solution 1: Use raw strings with r prefix
|
| 220 |
+
ax.set_xlabel(r'$\alpha$') # Not '\alpha'
|
| 221 |
+
|
| 222 |
+
# Solution 2: Escape backslashes in regular strings
|
| 223 |
+
ax.set_xlabel('$\\alpha$')
|
| 224 |
+
|
| 225 |
+
# Solution 3: Disable LaTeX if not installed
|
| 226 |
+
plt.rcParams['text.usetex'] = False
|
| 227 |
+
|
| 228 |
+
# Solution 4: Use mathtext instead of full LaTeX
|
| 229 |
+
# Mathtext is always available, no LaTeX installation needed
|
| 230 |
+
ax.text(x, y, r'$\int_0^\infty e^{-x} dx$')
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Issue: Text Cut Off or Outside Figure
|
| 234 |
+
|
| 235 |
+
**Problem:** Labels or annotations appear outside figure bounds
|
| 236 |
+
|
| 237 |
+
**Solutions:**
|
| 238 |
+
```python
|
| 239 |
+
# Solution 1: Use bbox_inches='tight'
|
| 240 |
+
plt.savefig('figure.png', bbox_inches='tight')
|
| 241 |
+
|
| 242 |
+
# Solution 2: Adjust figure bounds
|
| 243 |
+
plt.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)
|
| 244 |
+
|
| 245 |
+
# Solution 3: Clip text to axes
|
| 246 |
+
ax.text(x, y, 'text', clip_on=True)
|
| 247 |
+
|
| 248 |
+
# Solution 4: Use constrained_layout
|
| 249 |
+
fig, ax = plt.subplots(constrained_layout=True)
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
## Color and Colormap Issues
|
| 253 |
+
|
| 254 |
+
### Issue: Colorbar Not Matching Plot
|
| 255 |
+
|
| 256 |
+
**Problem:** Colorbar shows different range than data
|
| 257 |
+
|
| 258 |
+
**Solution:**
|
| 259 |
+
```python
|
| 260 |
+
# Explicitly set vmin and vmax
|
| 261 |
+
im = ax.imshow(data, vmin=0, vmax=1, cmap='viridis')
|
| 262 |
+
plt.colorbar(im, ax=ax)
|
| 263 |
+
|
| 264 |
+
# Or use the same norm for multiple plots
|
| 265 |
+
import matplotlib.colors as mcolors
|
| 266 |
+
norm = mcolors.Normalize(vmin=data.min(), vmax=data.max())
|
| 267 |
+
im1 = ax1.imshow(data1, norm=norm, cmap='viridis')
|
| 268 |
+
im2 = ax2.imshow(data2, norm=norm, cmap='viridis')
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
### Issue: Colors Look Wrong
|
| 272 |
+
|
| 273 |
+
**Problem:** Unexpected colors in plots
|
| 274 |
+
|
| 275 |
+
**Solutions:**
|
| 276 |
+
```python
|
| 277 |
+
# Solution 1: Check color specification format
|
| 278 |
+
ax.plot(x, y, color='blue') # Correct
|
| 279 |
+
ax.plot(x, y, color=(0, 0, 1)) # Correct RGB
|
| 280 |
+
ax.plot(x, y, color='#0000FF') # Correct hex
|
| 281 |
+
|
| 282 |
+
# Solution 2: Verify colormap exists
|
| 283 |
+
print(plt.colormaps()) # List available colormaps
|
| 284 |
+
|
| 285 |
+
# Solution 3: For scatter plots, ensure c shape matches
|
| 286 |
+
ax.scatter(x, y, c=colors) # colors should have same length as x, y
|
| 287 |
+
|
| 288 |
+
# Solution 4: Check if alpha is set correctly
|
| 289 |
+
ax.plot(x, y, alpha=1.0) # 0=transparent, 1=opaque
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
### Issue: Reversed Colormap
|
| 293 |
+
|
| 294 |
+
**Problem:** Colormap direction is backwards
|
| 295 |
+
|
| 296 |
+
**Solution:**
|
| 297 |
+
```python
|
| 298 |
+
# Add _r suffix to reverse any colormap
|
| 299 |
+
ax.imshow(data, cmap='viridis_r')
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
## Axis and Scale Issues
|
| 303 |
+
|
| 304 |
+
### Issue: Axis Limits Not Working
|
| 305 |
+
|
| 306 |
+
**Problem:** `set_xlim` or `set_ylim` not taking effect
|
| 307 |
+
|
| 308 |
+
**Solutions:**
|
| 309 |
+
```python
|
| 310 |
+
# Solution 1: Set after plotting
|
| 311 |
+
ax.plot(x, y)
|
| 312 |
+
ax.set_xlim(0, 10)
|
| 313 |
+
ax.set_ylim(-1, 1)
|
| 314 |
+
|
| 315 |
+
# Solution 2: Disable autoscaling
|
| 316 |
+
ax.autoscale(False)
|
| 317 |
+
ax.set_xlim(0, 10)
|
| 318 |
+
|
| 319 |
+
# Solution 3: Use axis method
|
| 320 |
+
ax.axis([xmin, xmax, ymin, ymax])
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
### Issue: Log Scale with Zero or Negative Values
|
| 324 |
+
|
| 325 |
+
**Problem:** Data ≤ 0 cannot be shown on a log scale: such points are silently masked or clipped, and in some cases Matplotlib raises a ValueError
|
| 326 |
+
|
| 327 |
+
**Solutions:**
|
| 328 |
+
```python
|
| 329 |
+
# Solution 1: Filter out non-positive values
|
| 330 |
+
mask = (data > 0)
|
| 331 |
+
ax.plot(x[mask], data[mask])
|
| 332 |
+
ax.set_yscale('log')
|
| 333 |
+
|
| 334 |
+
# Solution 2: Use symlog for data with positive and negative values
|
| 335 |
+
ax.set_yscale('symlog')
|
| 336 |
+
|
| 337 |
+
# Solution 3: Add small offset (only helps with exact zeros; negative values remain non-positive)
|
| 338 |
+
ax.plot(x, data + 1e-10)
|
| 339 |
+
ax.set_yscale('log')
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
### Issue: Dates Not Displaying Correctly
|
| 343 |
+
|
| 344 |
+
**Problem:** Date axis shows numbers instead of dates
|
| 345 |
+
|
| 346 |
+
**Solution:**
|
| 347 |
+
```python
|
| 348 |
+
import matplotlib.dates as mdates
|
| 349 |
+
import pandas as pd
|
| 350 |
+
|
| 351 |
+
# Convert to datetime if needed
|
| 352 |
+
dates = pd.to_datetime(date_strings)
|
| 353 |
+
|
| 354 |
+
ax.plot(dates, values)
|
| 355 |
+
|
| 356 |
+
# Format date axis
|
| 357 |
+
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
|
| 358 |
+
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
|
| 359 |
+
plt.xticks(rotation=45)
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
## Legend Issues
|
| 363 |
+
|
| 364 |
+
### Issue: Legend Covers Data
|
| 365 |
+
|
| 366 |
+
**Problem:** Legend obscures important parts of plot
|
| 367 |
+
|
| 368 |
+
**Solutions:**
|
| 369 |
+
```python
|
| 370 |
+
# Solution 1: Use 'best' location
|
| 371 |
+
ax.legend(loc='best')
|
| 372 |
+
|
| 373 |
+
# Solution 2: Place outside plot area
|
| 374 |
+
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
|
| 375 |
+
|
| 376 |
+
# Solution 3: Make legend semi-transparent
|
| 377 |
+
ax.legend(framealpha=0.7)
|
| 378 |
+
|
| 379 |
+
# Solution 4: Put legend below plot
|
| 380 |
+
ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
### Issue: Too Many Items in Legend
|
| 384 |
+
|
| 385 |
+
**Problem:** Legend is cluttered with many entries
|
| 386 |
+
|
| 387 |
+
**Solutions:**
|
| 388 |
+
```python
|
| 389 |
+
# Solution 1: Only label selected items
|
| 390 |
+
for i, (x, y) in enumerate(data):
|
| 391 |
+
label = f'Data {i}' if i % 5 == 0 else None
|
| 392 |
+
ax.plot(x, y, label=label)
|
| 393 |
+
|
| 394 |
+
# Solution 2: Use multiple columns
|
| 395 |
+
ax.legend(ncol=3)
|
| 396 |
+
|
| 397 |
+
# Solution 3: Create custom legend with fewer entries
|
| 398 |
+
from matplotlib.lines import Line2D
|
| 399 |
+
custom_lines = [Line2D([0], [0], color='r'),
|
| 400 |
+
Line2D([0], [0], color='b')]
|
| 401 |
+
ax.legend(custom_lines, ['Category A', 'Category B'])
|
| 402 |
+
|
| 403 |
+
# Solution 4: Use separate legend figure
|
| 404 |
+
fig_leg = plt.figure(figsize=(3, 2))
|
| 405 |
+
ax_leg = fig_leg.add_subplot(111)
|
| 406 |
+
ax_leg.legend(*ax.get_legend_handles_labels(), loc='center')
|
| 407 |
+
ax_leg.axis('off')
|
| 408 |
+
```
|
| 409 |
+
|
| 410 |
+
## 3D Plot Issues
|
| 411 |
+
|
| 412 |
+
### Issue: 3D Plots Look Flat
|
| 413 |
+
|
| 414 |
+
**Problem:** Difficult to perceive depth in 3D plots
|
| 415 |
+
|
| 416 |
+
**Solutions:**
|
| 417 |
+
```python
|
| 418 |
+
# Solution 1: Adjust viewing angle
|
| 419 |
+
ax.view_init(elev=30, azim=45)
|
| 420 |
+
|
| 421 |
+
# Solution 2: Add gridlines
|
| 422 |
+
ax.grid(True)
|
| 423 |
+
|
| 424 |
+
# Solution 3: Use color for depth
|
| 425 |
+
scatter = ax.scatter(x, y, z, c=z, cmap='viridis')
|
| 426 |
+
|
| 427 |
+
# Solution 4: Rotate interactively (if using interactive backend)
|
| 428 |
+
# User can click and drag to rotate
|
| 429 |
+
```
|
| 430 |
+
|
| 431 |
+
### Issue: 3D Axis Labels Cut Off
|
| 432 |
+
|
| 433 |
+
**Problem:** 3D axis labels appear outside figure
|
| 434 |
+
|
| 435 |
+
**Solution:**
|
| 436 |
+
```python
|
| 437 |
+
from mpl_toolkits.mplot3d import Axes3D
|
| 438 |
+
|
| 439 |
+
fig = plt.figure(figsize=(10, 8))
|
| 440 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 441 |
+
ax.plot_surface(X, Y, Z)
|
| 442 |
+
|
| 443 |
+
# Add padding
|
| 444 |
+
fig.tight_layout(pad=3.0)
|
| 445 |
+
|
| 446 |
+
# Or save with tight bounding box
|
| 447 |
+
plt.savefig('3d_plot.png', bbox_inches='tight', pad_inches=0.5)
|
| 448 |
+
```
|
| 449 |
+
|
| 450 |
+
## Image and Colorbar Issues
|
| 451 |
+
|
| 452 |
+
### Issue: Images Appear Flipped
|
| 453 |
+
|
| 454 |
+
**Problem:** Image orientation is wrong
|
| 455 |
+
|
| 456 |
+
**Solution:**
|
| 457 |
+
```python
|
| 458 |
+
# Set origin parameter
|
| 459 |
+
ax.imshow(img, origin='lower') # or 'upper' (default)
|
| 460 |
+
|
| 461 |
+
# Or flip array
|
| 462 |
+
ax.imshow(np.flipud(img))
|
| 463 |
+
```
|
| 464 |
+
|
| 465 |
+
### Issue: Images Look Pixelated
|
| 466 |
+
|
| 467 |
+
**Problem:** Image appears blocky when zoomed
|
| 468 |
+
|
| 469 |
+
**Solutions:**
|
| 470 |
+
```python
|
| 471 |
+
# Solution 1: Use interpolation
|
| 472 |
+
ax.imshow(img, interpolation='bilinear')
|
| 473 |
+
# Options: 'nearest', 'bilinear', 'bicubic', 'spline16', 'spline36', etc.
|
| 474 |
+
|
| 475 |
+
# Solution 2: Increase DPI when saving
|
| 476 |
+
plt.savefig('figure.png', dpi=300)
|
| 477 |
+
|
| 478 |
+
# Solution 3: Use vector format if appropriate
|
| 479 |
+
plt.savefig('figure.pdf')
|
| 480 |
+
```
|
| 481 |
+
|
| 482 |
+
## Common Errors and Fixes
|
| 483 |
+
|
| 484 |
+
### "TypeError: 'AxesSubplot' object is not subscriptable" (newer Matplotlib versions report 'Axes' instead of 'AxesSubplot')
|
| 485 |
+
|
| 486 |
+
**Problem:** Indexing the result of `plt.subplots()` when it returned a single Axes object rather than an array of Axes
|
| 487 |
+
```python
|
| 488 |
+
# Wrong
|
| 489 |
+
fig, ax = plt.subplots()
|
| 490 |
+
ax[0].plot(x, y) # Error!
|
| 491 |
+
|
| 492 |
+
# Correct
|
| 493 |
+
fig, ax = plt.subplots()
|
| 494 |
+
ax.plot(x, y)
|
| 495 |
+
```
|
| 496 |
+
|
| 497 |
+
### "ValueError: x and y must have same first dimension"
|
| 498 |
+
|
| 499 |
+
**Problem:** Data arrays have mismatched lengths
|
| 500 |
+
```python
|
| 501 |
+
# Check shapes
|
| 502 |
+
print(f"x shape: {x.shape}, y shape: {y.shape}")
|
| 503 |
+
|
| 504 |
+
# Ensure they match
|
| 505 |
+
assert len(x) == len(y), "x and y must have same length"
|
| 506 |
+
```
|
| 507 |
+
|
| 508 |
+
### "AttributeError: 'numpy.ndarray' object has no attribute 'plot'"
|
| 509 |
+
|
| 510 |
+
**Problem:** Calling `.plot()` on a NumPy array instead of on an Axes object
|
| 511 |
+
```python
|
| 512 |
+
# Wrong
|
| 513 |
+
data.plot(x, y)
|
| 514 |
+
|
| 515 |
+
# Correct
|
| 516 |
+
ax.plot(x, y)
|
| 517 |
+
# or for pandas
|
| 518 |
+
data.plot(ax=ax)
|
| 519 |
+
```
|
| 520 |
+
|
| 521 |
+
## Best Practices to Avoid Issues
|
| 522 |
+
|
| 523 |
+
1. **Always use the object-oriented (OO) interface** - Call methods on explicit `fig`/`ax` objects instead of relying on pyplot's implicit current-figure state
|
| 524 |
+
```python
|
| 525 |
+
fig, ax = plt.subplots() # Good
|
| 526 |
+
ax.plot(x, y)
|
| 527 |
+
```
|
| 528 |
+
|
| 529 |
+
2. **Use constrained_layout** - Prevents overlap issues
|
| 530 |
+
```python
|
| 531 |
+
fig, ax = plt.subplots(constrained_layout=True)
|
| 532 |
+
```
|
| 533 |
+
|
| 534 |
+
3. **Close figures explicitly** - Prevents memory leaks
|
| 535 |
+
```python
|
| 536 |
+
plt.close(fig)
|
| 537 |
+
```
|
| 538 |
+
|
| 539 |
+
4. **Set figure size at creation** - Better than resizing later
|
| 540 |
+
```python
|
| 541 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 542 |
+
```
|
| 543 |
+
|
| 544 |
+
5. **Use raw strings for math text** - Avoids escape issues
|
| 545 |
+
```python
|
| 546 |
+
ax.set_xlabel(r'$\alpha$')
|
| 547 |
+
```
|
| 548 |
+
|
| 549 |
+
6. **Check data shapes before plotting** - Catch size mismatches early
|
| 550 |
+
```python
|
| 551 |
+
assert len(x) == len(y)
|
| 552 |
+
```
|
| 553 |
+
|
| 554 |
+
7. **Use appropriate DPI** - 300 for print, 150 for web
|
| 555 |
+
```python
|
| 556 |
+
plt.savefig('figure.png', dpi=300)
|
| 557 |
+
```
|
| 558 |
+
|
| 559 |
+
8. **Test with different backends** - If display issues occur
|
| 560 |
+
```python
|
| 561 |
+
import matplotlib
|
| 562 |
+
matplotlib.use('TkAgg')
|
| 563 |
+
```
|
.scider/skills/matplotlib/references/plot_types.md
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Matplotlib Plot Types Guide
|
| 2 |
+
|
| 3 |
+
Comprehensive guide to different plot types in matplotlib with examples and use cases.
|
| 4 |
+
|
| 5 |
+
## 1. Line Plots
|
| 6 |
+
|
| 7 |
+
**Use cases:** Time series, continuous data, trends, function visualization
|
| 8 |
+
|
| 9 |
+
### Basic Line Plot
|
| 10 |
+
```python
|
| 11 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 12 |
+
ax.plot(x, y, linewidth=2, label='Data')
|
| 13 |
+
ax.set_xlabel('X axis')
|
| 14 |
+
ax.set_ylabel('Y axis')
|
| 15 |
+
ax.legend()
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
### Multiple Lines
|
| 19 |
+
```python
|
| 20 |
+
ax.plot(x, y1, label='Dataset 1', linewidth=2)
|
| 21 |
+
ax.plot(x, y2, label='Dataset 2', linewidth=2, linestyle='--')
|
| 22 |
+
ax.plot(x, y3, label='Dataset 3', linewidth=2, linestyle=':')
|
| 23 |
+
ax.legend()
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Line with Markers
|
| 27 |
+
```python
|
| 28 |
+
ax.plot(x, y, marker='o', markersize=8, linestyle='-',
|
| 29 |
+
linewidth=2, markerfacecolor='red', markeredgecolor='black')
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Step Plot
|
| 33 |
+
```python
|
| 34 |
+
ax.step(x, y, where='mid', linewidth=2, label='Step function')
|
| 35 |
+
# where options: 'pre', 'post', 'mid'
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### Error Bars
|
| 39 |
+
```python
|
| 40 |
+
ax.errorbar(x, y, yerr=error, fmt='o-', linewidth=2,
|
| 41 |
+
capsize=5, capthick=2, label='With uncertainty')
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## 2. Scatter Plots
|
| 45 |
+
|
| 46 |
+
**Use cases:** Correlations, relationships between variables, clusters, outliers
|
| 47 |
+
|
| 48 |
+
### Basic Scatter
|
| 49 |
+
```python
|
| 50 |
+
ax.scatter(x, y, s=50, alpha=0.6)
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### Sized and Colored Scatter
|
| 54 |
+
```python
|
| 55 |
+
scatter = ax.scatter(x, y, s=sizes*100, c=colors,
|
| 56 |
+
cmap='viridis', alpha=0.6, edgecolors='black')
|
| 57 |
+
plt.colorbar(scatter, ax=ax, label='Color variable')
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Categorical Scatter
|
| 61 |
+
```python
|
| 62 |
+
for category in categories:
|
| 63 |
+
mask = data['category'] == category
|
| 64 |
+
ax.scatter(data[mask]['x'], data[mask]['y'],
|
| 65 |
+
label=category, s=50, alpha=0.7)
|
| 66 |
+
ax.legend()
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## 3. Bar Charts
|
| 70 |
+
|
| 71 |
+
**Use cases:** Categorical comparisons, discrete data, counts
|
| 72 |
+
|
| 73 |
+
### Vertical Bar Chart
|
| 74 |
+
```python
|
| 75 |
+
ax.bar(categories, values, color='steelblue',
|
| 76 |
+
edgecolor='black', linewidth=1.5)
|
| 77 |
+
ax.set_ylabel('Values')
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
### Horizontal Bar Chart
|
| 81 |
+
```python
|
| 82 |
+
ax.barh(categories, values, color='coral',
|
| 83 |
+
edgecolor='black', linewidth=1.5)
|
| 84 |
+
ax.set_xlabel('Values')
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
### Grouped Bar Chart
|
| 88 |
+
```python
|
| 89 |
+
x = np.arange(len(categories))
|
| 90 |
+
width = 0.35
|
| 91 |
+
|
| 92 |
+
ax.bar(x - width/2, values1, width, label='Group 1')
|
| 93 |
+
ax.bar(x + width/2, values2, width, label='Group 2')
|
| 94 |
+
ax.set_xticks(x)
|
| 95 |
+
ax.set_xticklabels(categories)
|
| 96 |
+
ax.legend()
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### Stacked Bar Chart
|
| 100 |
+
```python
|
| 101 |
+
ax.bar(categories, values1, label='Part 1')
|
| 102 |
+
ax.bar(categories, values2, bottom=values1, label='Part 2')
|
| 103 |
+
ax.bar(categories, values3, bottom=values1+values2, label='Part 3')
|
| 104 |
+
ax.legend()
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### Bar Chart with Error Bars
|
| 108 |
+
```python
|
| 109 |
+
ax.bar(categories, values, yerr=errors, capsize=5,
|
| 110 |
+
color='steelblue', edgecolor='black')
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### Bar Chart with Patterns
|
| 114 |
+
```python
|
| 115 |
+
bars1 = ax.bar(x - width/2, values1, width, label='Group 1',
|
| 116 |
+
color='white', edgecolor='black', hatch='//')
|
| 117 |
+
bars2 = ax.bar(x + width/2, values2, width, label='Group 2',
|
| 118 |
+
color='white', edgecolor='black', hatch='\\\\')
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
## 4. Histograms
|
| 122 |
+
|
| 123 |
+
**Use cases:** Distributions, frequency analysis
|
| 124 |
+
|
| 125 |
+
### Basic Histogram
|
| 126 |
+
```python
|
| 127 |
+
ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
|
| 128 |
+
ax.set_xlabel('Value')
|
| 129 |
+
ax.set_ylabel('Frequency')
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
### Multiple Overlapping Histograms
|
| 133 |
+
```python
|
| 134 |
+
ax.hist(data1, bins=30, alpha=0.5, label='Dataset 1')
|
| 135 |
+
ax.hist(data2, bins=30, alpha=0.5, label='Dataset 2')
|
| 136 |
+
ax.legend()
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
### Normalized Histogram (Density)
|
| 140 |
+
```python
|
| 141 |
+
ax.hist(data, bins=30, density=True, alpha=0.7,
|
| 142 |
+
edgecolor='black', label='Empirical')
|
| 143 |
+
|
| 144 |
+
# Overlay theoretical distribution
|
| 145 |
+
from scipy.stats import norm
|
| 146 |
+
x = np.linspace(data.min(), data.max(), 100)
|
| 147 |
+
ax.plot(x, norm.pdf(x, data.mean(), data.std()),
|
| 148 |
+
'r-', linewidth=2, label='Normal fit')
|
| 149 |
+
ax.legend()
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### 2D Histogram (Hexbin)
|
| 153 |
+
```python
|
| 154 |
+
hexbin = ax.hexbin(x, y, gridsize=30, cmap='Blues')
|
| 155 |
+
plt.colorbar(hexbin, ax=ax, label='Counts')
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### 2D Histogram (hist2d)
|
| 159 |
+
```python
|
| 160 |
+
h = ax.hist2d(x, y, bins=30, cmap='Blues')
|
| 161 |
+
plt.colorbar(h[3], ax=ax, label='Counts')
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
## 5. Box and Violin Plots
|
| 165 |
+
|
| 166 |
+
**Use cases:** Statistical distributions, outlier detection, comparing distributions
|
| 167 |
+
|
| 168 |
+
### Box Plot
|
| 169 |
+
```python
|
| 170 |
+
ax.boxplot([data1, data2, data3],
|
| 171 |
+
labels=['Group A', 'Group B', 'Group C'],
|
| 172 |
+
showmeans=True, meanline=True)
|
| 173 |
+
ax.set_ylabel('Values')
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
### Horizontal Box Plot
|
| 177 |
+
```python
|
| 178 |
+
ax.boxplot([data1, data2, data3], vert=False,
|
| 179 |
+
labels=['Group A', 'Group B', 'Group C'])
|
| 180 |
+
ax.set_xlabel('Values')
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
### Violin Plot
|
| 184 |
+
```python
|
| 185 |
+
parts = ax.violinplot([data1, data2, data3],
|
| 186 |
+
positions=[1, 2, 3],
|
| 187 |
+
showmeans=True, showmedians=True)
|
| 188 |
+
ax.set_xticks([1, 2, 3])
|
| 189 |
+
ax.set_xticklabels(['Group A', 'Group B', 'Group C'])
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
## 6. Heatmaps
|
| 193 |
+
|
| 194 |
+
**Use cases:** Matrix data, correlations, intensity maps
|
| 195 |
+
|
| 196 |
+
### Basic Heatmap
|
| 197 |
+
```python
|
| 198 |
+
im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
|
| 199 |
+
plt.colorbar(im, ax=ax, label='Values')
|
| 200 |
+
ax.set_xlabel('X')
|
| 201 |
+
ax.set_ylabel('Y')
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
### Heatmap with Annotations
|
| 205 |
+
```python
|
| 206 |
+
im = ax.imshow(matrix, cmap='coolwarm')
|
| 207 |
+
plt.colorbar(im, ax=ax)
|
| 208 |
+
|
| 209 |
+
# Add text annotations
|
| 210 |
+
for i in range(matrix.shape[0]):
|
| 211 |
+
for j in range(matrix.shape[1]):
|
| 212 |
+
text = ax.text(j, i, f'{matrix[i, j]:.2f}',
|
| 213 |
+
ha='center', va='center', color='black')
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### Correlation Matrix
|
| 217 |
+
```python
|
| 218 |
+
corr = data.corr()
|
| 219 |
+
im = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
|
| 220 |
+
plt.colorbar(im, ax=ax, label='Correlation')
|
| 221 |
+
|
| 222 |
+
# Set tick labels
|
| 223 |
+
ax.set_xticks(range(len(corr)))
|
| 224 |
+
ax.set_yticks(range(len(corr)))
|
| 225 |
+
ax.set_xticklabels(corr.columns, rotation=45, ha='right')
|
| 226 |
+
ax.set_yticklabels(corr.columns)
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
## 7. Contour Plots
|
| 230 |
+
|
| 231 |
+
**Use cases:** 3D data on 2D plane, topography, function visualization
|
| 232 |
+
|
| 233 |
+
### Contour Lines
|
| 234 |
+
```python
|
| 235 |
+
contour = ax.contour(X, Y, Z, levels=10, cmap='viridis')
|
| 236 |
+
ax.clabel(contour, inline=True, fontsize=8)
|
| 237 |
+
plt.colorbar(contour, ax=ax)
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
### Filled Contours
|
| 241 |
+
```python
|
| 242 |
+
contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
|
| 243 |
+
plt.colorbar(contourf, ax=ax)
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
### Combined Contours
|
| 247 |
+
```python
|
| 248 |
+
contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis', alpha=0.8)
|
| 249 |
+
contour = ax.contour(X, Y, Z, levels=10, colors='black',
|
| 250 |
+
linewidths=0.5, alpha=0.4)
|
| 251 |
+
ax.clabel(contour, inline=True, fontsize=8)
|
| 252 |
+
plt.colorbar(contourf, ax=ax)
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
## 8. Pie Charts
|
| 256 |
+
|
| 257 |
+
**Use cases:** Proportions, percentages (use sparingly)
|
| 258 |
+
|
| 259 |
+
### Basic Pie Chart
|
| 260 |
+
```python
|
| 261 |
+
ax.pie(sizes, labels=labels, autopct='%1.1f%%',
|
| 262 |
+
startangle=90, colors=colors)
|
| 263 |
+
ax.axis('equal') # Equal aspect ratio ensures circular pie
|
| 264 |
+
```
|
| 265 |
+
|
| 266 |
+
### Exploded Pie Chart
|
| 267 |
+
```python
|
| 268 |
+
explode = (0.1, 0, 0, 0) # Explode first slice
|
| 269 |
+
ax.pie(sizes, explode=explode, labels=labels,
|
| 270 |
+
autopct='%1.1f%%', shadow=True, startangle=90)
|
| 271 |
+
ax.axis('equal')
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
### Donut Chart
|
| 275 |
+
```python
|
| 276 |
+
ax.pie(sizes, labels=labels, autopct='%1.1f%%',
|
| 277 |
+
wedgeprops=dict(width=0.5), startangle=90)
|
| 278 |
+
ax.axis('equal')
|
| 279 |
+
```
|
| 280 |
+
|
| 281 |
+
## 9. Polar Plots
|
| 282 |
+
|
| 283 |
+
**Use cases:** Cyclic data, directional data, radar charts
|
| 284 |
+
|
| 285 |
+
### Basic Polar Plot
|
| 286 |
+
```python
|
| 287 |
+
theta = np.linspace(0, 2*np.pi, 100)
|
| 288 |
+
r = np.abs(np.sin(2*theta))
|
| 289 |
+
|
| 290 |
+
ax = plt.subplot(111, projection='polar')
|
| 291 |
+
ax.plot(theta, r, linewidth=2)
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
### Radar Chart
|
| 295 |
+
```python
|
| 296 |
+
categories = ['A', 'B', 'C', 'D', 'E']
|
| 297 |
+
values = [4, 3, 5, 2, 4]
|
| 298 |
+
|
| 299 |
+
# Add first value to the end to close the polygon
|
| 300 |
+
angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False)
|
| 301 |
+
values_closed = np.concatenate((values, [values[0]]))
|
| 302 |
+
angles_closed = np.concatenate((angles, [angles[0]]))
|
| 303 |
+
|
| 304 |
+
ax = plt.subplot(111, projection='polar')
|
| 305 |
+
ax.plot(angles_closed, values_closed, 'o-', linewidth=2)
|
| 306 |
+
ax.fill(angles_closed, values_closed, alpha=0.25)
|
| 307 |
+
ax.set_xticks(angles)
|
| 308 |
+
ax.set_xticklabels(categories)
|
| 309 |
+
```
|
| 310 |
+
|
| 311 |
+
## 10. Stream and Quiver Plots
|
| 312 |
+
|
| 313 |
+
**Use cases:** Vector fields, flow visualization
|
| 314 |
+
|
| 315 |
+
### Quiver Plot (Vector Field)
|
| 316 |
+
```python
|
| 317 |
+
ax.quiver(X, Y, U, V, alpha=0.8)
|
| 318 |
+
ax.set_xlabel('X')
|
| 319 |
+
ax.set_ylabel('Y')
|
| 320 |
+
ax.set_aspect('equal')
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
### Stream Plot
|
| 324 |
+
```python
|
| 325 |
+
ax.streamplot(X, Y, U, V, density=1.5, color='k', linewidth=1)
|
| 326 |
+
ax.set_xlabel('X')
|
| 327 |
+
ax.set_ylabel('Y')
|
| 328 |
+
ax.set_aspect('equal')
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
## 11. Fill Between
|
| 332 |
+
|
| 333 |
+
**Use cases:** Uncertainty bounds, confidence intervals, areas under curves
|
| 334 |
+
|
| 335 |
+
### Fill Between Two Curves
|
| 336 |
+
```python
|
| 337 |
+
ax.plot(x, y, 'k-', linewidth=2, label='Mean')
|
| 338 |
+
ax.fill_between(x, y - std, y + std, alpha=0.3,
|
| 339 |
+
label='±1 std dev')
|
| 340 |
+
ax.legend()
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
### Fill Between with Condition
|
| 344 |
+
```python
|
| 345 |
+
ax.plot(x, y1, label='Line 1')
|
| 346 |
+
ax.plot(x, y2, label='Line 2')
|
| 347 |
+
ax.fill_between(x, y1, y2, where=(y2 >= y1),
|
| 348 |
+
alpha=0.3, label='y2 > y1', interpolate=True)
|
| 349 |
+
ax.legend()
|
| 350 |
+
```
|
| 351 |
+
|
| 352 |
+
## 12. 3D Plots
|
| 353 |
+
|
| 354 |
+
**Use cases:** Three-dimensional data visualization
|
| 355 |
+
|
| 356 |
+
### 3D Scatter
|
| 357 |
+
```python
|
| 358 |
+
from mpl_toolkits.mplot3d import Axes3D
|
| 359 |
+
|
| 360 |
+
fig = plt.figure(figsize=(10, 8))
|
| 361 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 362 |
+
scatter = ax.scatter(x, y, z, c=colors, cmap='viridis',
|
| 363 |
+
marker='o', s=50)
|
| 364 |
+
plt.colorbar(scatter, ax=ax)
|
| 365 |
+
ax.set_xlabel('X')
|
| 366 |
+
ax.set_ylabel('Y')
|
| 367 |
+
ax.set_zlabel('Z')
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
### 3D Surface Plot
|
| 371 |
+
```python
|
| 372 |
+
fig = plt.figure(figsize=(10, 8))
|
| 373 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 374 |
+
surf = ax.plot_surface(X, Y, Z, cmap='viridis',
|
| 375 |
+
edgecolor='none', alpha=0.9)
|
| 376 |
+
plt.colorbar(surf, ax=ax)
|
| 377 |
+
ax.set_xlabel('X')
|
| 378 |
+
ax.set_ylabel('Y')
|
| 379 |
+
ax.set_zlabel('Z')
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
### 3D Wireframe
|
| 383 |
+
```python
|
| 384 |
+
fig = plt.figure(figsize=(10, 8))
|
| 385 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 386 |
+
ax.plot_wireframe(X, Y, Z, color='black', linewidth=0.5)
|
| 387 |
+
ax.set_xlabel('X')
|
| 388 |
+
ax.set_ylabel('Y')
|
| 389 |
+
ax.set_zlabel('Z')
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
### 3D Contour
|
| 393 |
+
```python
|
| 394 |
+
fig = plt.figure(figsize=(10, 8))
|
| 395 |
+
ax = fig.add_subplot(111, projection='3d')
|
| 396 |
+
ax.contour(X, Y, Z, levels=15, cmap='viridis')
|
| 397 |
+
ax.set_xlabel('X')
|
| 398 |
+
ax.set_ylabel('Y')
|
| 399 |
+
ax.set_zlabel('Z')
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
## 13. Specialized Plots
|
| 403 |
+
|
| 404 |
+
### Stem Plot
|
| 405 |
+
```python
|
| 406 |
+
ax.stem(x, y, linefmt='C0-', markerfmt='C0o', basefmt='k-')
|
| 407 |
+
ax.set_xlabel('X')
|
| 408 |
+
ax.set_ylabel('Y')
|
| 409 |
+
```
|
| 410 |
+
|
| 411 |
+
### Filled Polygon
|
| 412 |
+
```python
|
| 413 |
+
vertices = [(0, 0), (1, 0), (1, 1), (0, 1)]
|
| 414 |
+
from matplotlib.patches import Polygon
|
| 415 |
+
polygon = Polygon(vertices, closed=True, edgecolor='black',
|
| 416 |
+
facecolor='lightblue', alpha=0.5)
|
| 417 |
+
ax.add_patch(polygon)
|
| 418 |
+
ax.set_xlim(-0.5, 1.5)
|
| 419 |
+
ax.set_ylim(-0.5, 1.5)
|
| 420 |
+
```
|
| 421 |
+
|
| 422 |
+
### Staircase Plot
|
| 423 |
+
```python
|
| 424 |
+
ax.stairs(values, edges, fill=True, alpha=0.5)
|
| 425 |
+
```
|
| 426 |
+
|
| 427 |
+
### Broken Barh (Gantt-style)
|
| 428 |
+
```python
|
| 429 |
+
ax.broken_barh([(10, 50), (100, 20), (130, 10)], (10, 9),
|
| 430 |
+
facecolors='tab:blue')
|
| 431 |
+
ax.broken_barh([(10, 20), (50, 50), (120, 30)], (20, 9),
|
| 432 |
+
facecolors='tab:orange')
|
| 433 |
+
ax.set_ylim(5, 35)
|
| 434 |
+
ax.set_xlim(0, 200)
|
| 435 |
+
ax.set_xlabel('Time')
|
| 436 |
+
ax.set_yticks([15, 25])
|
| 437 |
+
ax.set_yticklabels(['Task 1', 'Task 2'])
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
+
## 14. Time Series Plots
|
| 441 |
+
|
| 442 |
+
### Basic Time Series
|
| 443 |
+
```python
|
| 444 |
+
import pandas as pd
|
| 445 |
+
import matplotlib.dates as mdates
|
| 446 |
+
|
| 447 |
+
ax.plot(dates, values, linewidth=2)
|
| 448 |
+
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
|
| 449 |
+
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
|
| 450 |
+
plt.xticks(rotation=45)
|
| 451 |
+
ax.set_xlabel('Date')
|
| 452 |
+
ax.set_ylabel('Value')
|
| 453 |
+
```
|
| 454 |
+
|
| 455 |
+
### Time Series with Shaded Regions
|
| 456 |
+
```python
|
| 457 |
+
ax.plot(dates, values, linewidth=2)
|
| 458 |
+
# Shade weekends or specific periods
|
| 459 |
+
ax.axvspan(start_date, end_date, alpha=0.2, color='gray')
|
| 460 |
+
```
|
| 461 |
+
|
| 462 |
+
## Plot Selection Guide
|
| 463 |
+
|
| 464 |
+
| Data Type | Recommended Plot | Alternative Options |
|
| 465 |
+
|-----------|-----------------|---------------------|
|
| 466 |
+
| Single continuous variable | Histogram, KDE | Box plot, Violin plot |
|
| 467 |
+
| Two continuous variables | Scatter plot | Hexbin, 2D histogram |
|
| 468 |
+
| Time series | Line plot | Area plot, Step plot |
|
| 469 |
+
| Categorical vs continuous | Bar chart, Box plot | Violin plot, Strip plot |
|
| 470 |
+
| Two categorical variables | Heatmap | Grouped bar chart |
|
| 471 |
+
| Three continuous variables | 3D scatter, Contour | Color-coded scatter |
|
| 472 |
+
| Proportions | Bar chart | Pie chart (use sparingly) |
|
| 473 |
+
| Distributions comparison | Box plot, Violin plot | Overlaid histograms |
|
| 474 |
+
| Correlation matrix | Heatmap | Clustered heatmap |
|
| 475 |
+
| Vector field | Quiver plot, Stream plot | - |
|
| 476 |
+
| Function visualization | Line plot, Contour | 3D surface |
|
.scider/skills/matplotlib/references/styling_guide.md
ADDED
|
@@ -0,0 +1,589 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Matplotlib Styling Guide
|
| 2 |
+
|
| 3 |
+
Comprehensive guide for styling and customizing matplotlib visualizations.
|
| 4 |
+
|
| 5 |
+
## Colormaps
|
| 6 |
+
|
| 7 |
+
### Colormap Categories
|
| 8 |
+
|
| 9 |
+
**1. Perceptually Uniform Sequential**
|
| 10 |
+
Best for ordered data that progresses from low to high values.
|
| 11 |
+
- `viridis` (default, colorblind-friendly)
|
| 12 |
+
- `plasma`
|
| 13 |
+
- `inferno`
|
| 14 |
+
- `magma`
|
| 15 |
+
- `cividis` (optimized for colorblind viewers)
|
| 16 |
+
|
| 17 |
+
**Usage:**
|
| 18 |
+
```python
|
| 19 |
+
im = ax.imshow(data, cmap='viridis')
|
| 20 |
+
scatter = ax.scatter(x, y, c=values, cmap='plasma')
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
**2. Sequential**
|
| 24 |
+
Traditional colormaps for ordered data.
|
| 25 |
+
- `Blues`, `Greens`, `Reds`, `Oranges`, `Purples`
|
| 26 |
+
- `YlOrBr`, `YlOrRd`, `OrRd`, `PuRd`
|
| 27 |
+
- `BuPu`, `GnBu`, `PuBu`, `YlGnBu`
|
| 28 |
+
|
| 29 |
+
**3. Diverging**
|
| 30 |
+
Best for data with a meaningful center point (e.g., zero, mean).
|
| 31 |
+
- `coolwarm` (blue to red)
|
| 32 |
+
- `RdBu` (red-blue)
|
| 33 |
+
- `RdYlBu` (red-yellow-blue)
|
| 34 |
+
- `RdYlGn` (red-yellow-green)
|
| 35 |
+
- `PiYG`, `PRGn`, `BrBG`, `PuOr`, `RdGy`
|
| 36 |
+
|
| 37 |
+
**Usage:**
|
| 38 |
+
```python
|
| 39 |
+
# Center colormap at zero
|
| 40 |
+
im = ax.imshow(data, cmap='coolwarm', vmin=-1, vmax=1)
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
**4. Qualitative**
|
| 44 |
+
Best for categorical/nominal data without inherent ordering.
|
| 45 |
+
- `tab10` (10 distinct colors)
|
| 46 |
+
- `tab20` (20 distinct colors)
|
| 47 |
+
- `Set1`, `Set2`, `Set3`
|
| 48 |
+
- `Pastel1`, `Pastel2`
|
| 49 |
+
- `Dark2`, `Accent`, `Paired`
|
| 50 |
+
|
| 51 |
+
**Usage:**
|
| 52 |
+
```python
|
| 53 |
+
colors = plt.cm.tab10(np.linspace(0, 1, n_categories))
|
| 54 |
+
for i, category in enumerate(categories):
|
| 55 |
+
    ax.plot(x, y[i], color=colors[i], label=category)
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
**5. Cyclic**
|
| 59 |
+
Best for cyclic data (e.g., phase, angle).
|
| 60 |
+
- `twilight`
|
| 61 |
+
- `twilight_shifted`
|
| 62 |
+
- `hsv`
|
| 63 |
+
|
| 64 |
+
### Colormap Best Practices
|
| 65 |
+
|
| 66 |
+
1. **Avoid `jet` colormap** - Not perceptually uniform, misleading
|
| 67 |
+
2. **Use perceptually uniform colormaps** - `viridis`, `plasma`, `cividis`
|
| 68 |
+
3. **Consider colorblind users** - Use `viridis`, `cividis`, or test with colorblind simulators
|
| 69 |
+
4. **Match colormap to data type**:
|
| 70 |
+
   - Sequential: increasing/decreasing data
|
| 71 |
+
   - Diverging: data with meaningful center
|
| 72 |
+
   - Qualitative: categories
|
| 73 |
+
5. **Reverse colormaps** - Add `_r` suffix: `viridis_r`, `coolwarm_r`
|
| 74 |
+
|
| 75 |
+
### Creating Custom Colormaps
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
from matplotlib.colors import LinearSegmentedColormap
|
| 79 |
+
|
| 80 |
+
# From color list
|
| 81 |
+
colors = ['blue', 'white', 'red']
|
| 82 |
+
n_bins = 100
|
| 83 |
+
cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_bins)
|
| 84 |
+
|
| 85 |
+
# From RGB values
|
| 86 |
+
colors = [(0, 0, 1), (1, 1, 1), (1, 0, 0)] # RGB tuples
|
| 87 |
+
cmap = LinearSegmentedColormap.from_list('custom', colors)
|
| 88 |
+
|
| 89 |
+
# Use the custom colormap
|
| 90 |
+
ax.imshow(data, cmap=cmap)
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### Discrete Colormaps
|
| 94 |
+
|
| 95 |
+
```python
|
| 96 |
+
import matplotlib.colors as mcolors
|
| 97 |
+
|
| 98 |
+
# Create discrete colormap from continuous
|
| 99 |
+
cmap = plt.cm.viridis
|
| 100 |
+
bounds = np.linspace(0, 10, 11)
|
| 101 |
+
norm = mcolors.BoundaryNorm(bounds, cmap.N)
|
| 102 |
+
im = ax.imshow(data, cmap=cmap, norm=norm)
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Style Sheets
|
| 106 |
+
|
| 107 |
+
### Using Built-in Styles
|
| 108 |
+
|
| 109 |
+
```python
|
| 110 |
+
# List available styles
|
| 111 |
+
print(plt.style.available)
|
| 112 |
+
|
| 113 |
+
# Apply a style
|
| 114 |
+
plt.style.use('seaborn-v0_8-darkgrid')
|
| 115 |
+
|
| 116 |
+
# Apply multiple styles (later styles override earlier ones)
|
| 117 |
+
plt.style.use(['seaborn-v0_8-whitegrid', 'seaborn-v0_8-poster'])
|
| 118 |
+
|
| 119 |
+
# Temporarily use a style
|
| 120 |
+
with plt.style.context('ggplot'):
|
| 121 |
+
    fig, ax = plt.subplots()
|
| 122 |
+
    ax.plot(x, y)
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
### Popular Built-in Styles
|
| 126 |
+
|
| 127 |
+
- `default` - Matplotlib's default style
|
| 128 |
+
- `classic` - Classic matplotlib look (pre-2.0)
|
| 129 |
+
- `seaborn-v0_8-*` - Seaborn-inspired styles
|
| 130 |
+
  - `seaborn-v0_8-darkgrid`, `seaborn-v0_8-whitegrid`
|
| 131 |
+
  - `seaborn-v0_8-dark`, `seaborn-v0_8-white`
|
| 132 |
+
  - `seaborn-v0_8-ticks`, `seaborn-v0_8-poster`, `seaborn-v0_8-talk`
|
| 133 |
+
- `ggplot` - ggplot2-inspired style
|
| 134 |
+
- `bmh` - Bayesian Methods for Hackers style
|
| 135 |
+
- `fivethirtyeight` - FiveThirtyEight style
|
| 136 |
+
- `grayscale` - Grayscale style
|
| 137 |
+
|
| 138 |
+
### Creating Custom Style Sheets
|
| 139 |
+
|
| 140 |
+
Create a file named `custom_style.mplstyle`:
|
| 141 |
+
|
| 142 |
+
```
|
| 143 |
+
# custom_style.mplstyle
|
| 144 |
+
|
| 145 |
+
# Figure
|
| 146 |
+
figure.figsize: 10, 6
|
| 147 |
+
figure.dpi: 100
|
| 148 |
+
figure.facecolor: white
|
| 149 |
+
|
| 150 |
+
# Font
|
| 151 |
+
font.family: sans-serif
|
| 152 |
+
font.sans-serif: Arial, Helvetica
|
| 153 |
+
font.size: 12
|
| 154 |
+
|
| 155 |
+
# Axes
|
| 156 |
+
axes.labelsize: 14
|
| 157 |
+
axes.titlesize: 16
|
| 158 |
+
axes.facecolor: white
|
| 159 |
+
axes.edgecolor: black
|
| 160 |
+
axes.linewidth: 1.5
|
| 161 |
+
axes.grid: True
|
| 162 |
+
axes.axisbelow: True
|
| 163 |
+
|
| 164 |
+
# Grid
|
| 165 |
+
grid.color: gray
|
| 166 |
+
grid.linestyle: --
|
| 167 |
+
grid.linewidth: 0.5
|
| 168 |
+
grid.alpha: 0.3
|
| 169 |
+
|
| 170 |
+
# Lines
|
| 171 |
+
lines.linewidth: 2
|
| 172 |
+
lines.markersize: 8
|
| 173 |
+
|
| 174 |
+
# Ticks
|
| 175 |
+
xtick.labelsize: 10
|
| 176 |
+
ytick.labelsize: 10
|
| 177 |
+
xtick.direction: in
|
| 178 |
+
ytick.direction: in
|
| 179 |
+
xtick.major.size: 6
|
| 180 |
+
ytick.major.size: 6
|
| 181 |
+
xtick.minor.size: 3
|
| 182 |
+
ytick.minor.size: 3
|
| 183 |
+
|
| 184 |
+
# Legend
|
| 185 |
+
legend.fontsize: 12
|
| 186 |
+
legend.frameon: True
|
| 187 |
+
legend.framealpha: 0.8
|
| 188 |
+
legend.fancybox: True
|
| 189 |
+
|
| 190 |
+
# Savefig
|
| 191 |
+
savefig.dpi: 300
|
| 192 |
+
savefig.bbox: tight
|
| 193 |
+
savefig.facecolor: white
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
Load and use:
|
| 197 |
+
```python
|
| 198 |
+
plt.style.use('path/to/custom_style.mplstyle')
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## rcParams Configuration
|
| 202 |
+
|
| 203 |
+
### Global Configuration
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
import matplotlib.pyplot as plt
|
| 207 |
+
|
| 208 |
+
# Configure globally
|
| 209 |
+
plt.rcParams['figure.figsize'] = (10, 6)
|
| 210 |
+
plt.rcParams['font.size'] = 12
|
| 211 |
+
plt.rcParams['axes.labelsize'] = 14
|
| 212 |
+
|
| 213 |
+
# Or update multiple at once
|
| 214 |
+
plt.rcParams.update({
|
| 215 |
+
'figure.figsize': (10, 6),
|
| 216 |
+
'font.size': 12,
|
| 217 |
+
'axes.labelsize': 14,
|
| 218 |
+
'axes.titlesize': 16,
|
| 219 |
+
'lines.linewidth': 2
|
| 220 |
+
})
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### Temporary Configuration
|
| 224 |
+
|
| 225 |
+
```python
|
| 226 |
+
# Context manager for temporary changes
|
| 227 |
+
with plt.rc_context({'font.size': 14, 'lines.linewidth': 2.5}):
|
| 228 |
+
    fig, ax = plt.subplots()
|
| 229 |
+
    ax.plot(x, y)
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
### Common rcParams
|
| 233 |
+
|
| 234 |
+
**Figure settings:**
|
| 235 |
+
```python
|
| 236 |
+
plt.rcParams['figure.figsize'] = (10, 6)
|
| 237 |
+
plt.rcParams['figure.dpi'] = 100
|
| 238 |
+
plt.rcParams['figure.facecolor'] = 'white'
|
| 239 |
+
plt.rcParams['figure.edgecolor'] = 'white'
|
| 240 |
+
plt.rcParams['figure.autolayout'] = False
|
| 241 |
+
plt.rcParams['figure.constrained_layout.use'] = True
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
**Font settings:**
|
| 245 |
+
```python
|
| 246 |
+
plt.rcParams['font.family'] = 'sans-serif'
|
| 247 |
+
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
|
| 248 |
+
plt.rcParams['font.size'] = 12
|
| 249 |
+
plt.rcParams['font.weight'] = 'normal'
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
**Axes settings:**
|
| 253 |
+
```python
|
| 254 |
+
plt.rcParams['axes.facecolor'] = 'white'
|
| 255 |
+
plt.rcParams['axes.edgecolor'] = 'black'
|
| 256 |
+
plt.rcParams['axes.linewidth'] = 1.5
|
| 257 |
+
plt.rcParams['axes.grid'] = True
|
| 258 |
+
plt.rcParams['axes.labelsize'] = 14
|
| 259 |
+
plt.rcParams['axes.titlesize'] = 16
|
| 260 |
+
plt.rcParams['axes.labelweight'] = 'normal'
|
| 261 |
+
plt.rcParams['axes.spines.top'] = True
|
| 262 |
+
plt.rcParams['axes.spines.right'] = True
|
| 263 |
+
```
|
| 264 |
+
|
| 265 |
+
**Line settings:**
|
| 266 |
+
```python
|
| 267 |
+
plt.rcParams['lines.linewidth'] = 2
|
| 268 |
+
plt.rcParams['lines.linestyle'] = '-'
|
| 269 |
+
plt.rcParams['lines.marker'] = 'None'
|
| 270 |
+
plt.rcParams['lines.markersize'] = 6
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
**Save settings:**
|
| 274 |
+
```python
|
| 275 |
+
plt.rcParams['savefig.dpi'] = 300
|
| 276 |
+
plt.rcParams['savefig.format'] = 'png'
|
| 277 |
+
plt.rcParams['savefig.bbox'] = 'tight'
|
| 278 |
+
plt.rcParams['savefig.pad_inches'] = 0.1
|
| 279 |
+
plt.rcParams['savefig.transparent'] = False
|
| 280 |
+
```
|
| 281 |
+
|
| 282 |
+
## Color Palettes
|
| 283 |
+
|
| 284 |
+
### Named Color Sets
|
| 285 |
+
|
| 286 |
+
```python
|
| 287 |
+
# Tableau colors
|
| 288 |
+
tableau_colors = plt.cm.tab10.colors
|
| 289 |
+
|
| 290 |
+
# CSS4 colors (subset)
|
| 291 |
+
css_colors = ['steelblue', 'coral', 'teal', 'goldenrod', 'crimson']
|
| 292 |
+
|
| 293 |
+
# Manual definition
|
| 294 |
+
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
### Color Cycles
|
| 298 |
+
|
| 299 |
+
```python
|
| 300 |
+
# Set default color cycle
|
| 301 |
+
from cycler import cycler
|
| 302 |
+
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
|
| 303 |
+
plt.rcParams['axes.prop_cycle'] = cycler(color=colors)
|
| 304 |
+
|
| 305 |
+
# Or combine color and line style
|
| 306 |
+
plt.rcParams['axes.prop_cycle'] = cycler(color=colors) + cycler(linestyle=['-', '--', ':', '-.'])
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
### Palette Generation
|
| 310 |
+
|
| 311 |
+
```python
|
| 312 |
+
# Evenly spaced colors from colormap
|
| 313 |
+
n_colors = 5
|
| 314 |
+
colors = plt.cm.viridis(np.linspace(0, 1, n_colors))
|
| 315 |
+
|
| 316 |
+
# Use in plot
|
| 317 |
+
for i, (x, y) in enumerate(data):
|
| 318 |
+
    ax.plot(x, y, color=colors[i])
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
## Typography
|
| 322 |
+
|
| 323 |
+
### Font Configuration
|
| 324 |
+
|
| 325 |
+
```python
|
| 326 |
+
# Set font family
|
| 327 |
+
plt.rcParams['font.family'] = 'serif'
|
| 328 |
+
plt.rcParams['font.serif'] = ['Times New Roman', 'DejaVu Serif']
|
| 329 |
+
|
| 330 |
+
# Or sans-serif
|
| 331 |
+
plt.rcParams['font.family'] = 'sans-serif'
|
| 332 |
+
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
|
| 333 |
+
|
| 334 |
+
# Or monospace
|
| 335 |
+
plt.rcParams['font.family'] = 'monospace'
|
| 336 |
+
plt.rcParams['font.monospace'] = ['Courier New', 'DejaVu Sans Mono']
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
### Font Properties in Text
|
| 340 |
+
|
| 341 |
+
```python
|
| 342 |
+
from matplotlib import font_manager
|
| 343 |
+
|
| 344 |
+
# Specify font properties
|
| 345 |
+
ax.text(x, y, 'Text',
|
| 346 |
+
fontsize=14,
|
| 347 |
+
fontweight='bold', # 'normal', 'bold', 'heavy', 'light'
|
| 348 |
+
fontstyle='italic', # 'normal', 'italic', 'oblique'
|
| 349 |
+
fontfamily='serif')
|
| 350 |
+
|
| 351 |
+
# Use specific font file
|
| 352 |
+
prop = font_manager.FontProperties(fname='path/to/font.ttf')
|
| 353 |
+
ax.text(x, y, 'Text', fontproperties=prop)
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
### Mathematical Text
|
| 357 |
+
|
| 358 |
+
```python
|
| 359 |
+
# LaTeX-style math
|
| 360 |
+
ax.set_title(r'$\alpha > \beta$')
|
| 361 |
+
ax.set_xlabel(r'$\mu \pm \sigma$')
|
| 362 |
+
ax.text(x, y, r'$\int_0^\infty e^{-x} dx = 1$')
|
| 363 |
+
|
| 364 |
+
# Subscripts and superscripts
|
| 365 |
+
ax.set_ylabel(r'$y = x^2 + 2x + 1$')
|
| 366 |
+
ax.text(x, y, r'$x_1, x_2, \ldots, x_n$')
|
| 367 |
+
|
| 368 |
+
# Greek letters
|
| 369 |
+
ax.text(x, y, r'$\alpha, \beta, \gamma, \delta, \epsilon$')
|
| 370 |
+
```
|
| 371 |
+
|
| 372 |
+
### Using Full LaTeX
|
| 373 |
+
|
| 374 |
+
```python
|
| 375 |
+
# Enable full LaTeX rendering (requires LaTeX installation)
|
| 376 |
+
plt.rcParams['text.usetex'] = True
|
| 377 |
+
plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
|
| 378 |
+
|
| 379 |
+
ax.set_title(r'\textbf{Bold Title}')
|
| 380 |
+
ax.set_xlabel(r'Time $t$ (s)')
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
## Spines and Grids
|
| 384 |
+
|
| 385 |
+
### Spine Customization
|
| 386 |
+
|
| 387 |
+
```python
|
| 388 |
+
# Hide specific spines
|
| 389 |
+
ax.spines['top'].set_visible(False)
|
| 390 |
+
ax.spines['right'].set_visible(False)
|
| 391 |
+
|
| 392 |
+
# Move spine position
|
| 393 |
+
ax.spines['left'].set_position(('outward', 10))
|
| 394 |
+
ax.spines['bottom'].set_position(('data', 0))
|
| 395 |
+
|
| 396 |
+
# Change spine color and width
|
| 397 |
+
ax.spines['left'].set_color('red')
|
| 398 |
+
ax.spines['bottom'].set_linewidth(2)
|
| 399 |
+
```
|
| 400 |
+
|
| 401 |
+
### Grid Customization
|
| 402 |
+
|
| 403 |
+
```python
|
| 404 |
+
# Basic grid
|
| 405 |
+
ax.grid(True)
|
| 406 |
+
|
| 407 |
+
# Customized grid
|
| 408 |
+
ax.grid(True, which='major', linestyle='--', linewidth=0.8, alpha=0.3)
|
| 409 |
+
ax.grid(True, which='minor', linestyle=':', linewidth=0.5, alpha=0.2)
|
| 410 |
+
|
| 411 |
+
# Grid for specific axis
|
| 412 |
+
ax.grid(True, axis='x') # Only vertical lines
|
| 413 |
+
ax.grid(True, axis='y') # Only horizontal lines
|
| 414 |
+
|
| 415 |
+
# Grid behind or in front of data
|
| 416 |
+
ax.set_axisbelow(True) # Grid behind data
|
| 417 |
+
```
|
| 418 |
+
|
| 419 |
+
## Legend Customization
|
| 420 |
+
|
| 421 |
+
### Legend Positioning
|
| 422 |
+
|
| 423 |
+
```python
|
| 424 |
+
# Location strings
|
| 425 |
+
ax.legend(loc='best') # Automatic best position
|
| 426 |
+
ax.legend(loc='upper right')
|
| 427 |
+
ax.legend(loc='upper left')
|
| 428 |
+
ax.legend(loc='lower right')
|
| 429 |
+
ax.legend(loc='lower left')
|
| 430 |
+
ax.legend(loc='center')
|
| 431 |
+
ax.legend(loc='upper center')
|
| 432 |
+
ax.legend(loc='lower center')
|
| 433 |
+
ax.legend(loc='center left')
|
| 434 |
+
ax.legend(loc='center right')
|
| 435 |
+
|
| 436 |
+
# Precise positioning (bbox_to_anchor)
|
| 437 |
+
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') # Outside plot area
|
| 438 |
+
ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3) # Below plot
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
### Legend Styling
|
| 442 |
+
|
| 443 |
+
```python
|
| 444 |
+
ax.legend(
|
| 445 |
+
fontsize=12,
|
| 446 |
+
frameon=True, # Show frame
|
| 447 |
+
framealpha=0.9, # Frame transparency
|
| 448 |
+
fancybox=True, # Rounded corners
|
| 449 |
+
shadow=True, # Shadow effect
|
| 450 |
+
ncol=2, # Number of columns
|
| 451 |
+
title='Legend Title', # Legend title
|
| 452 |
+
title_fontsize=14, # Title font size
|
| 453 |
+
edgecolor='black', # Frame edge color
|
| 454 |
+
facecolor='white' # Frame background color
|
| 455 |
+
)
|
| 456 |
+
```
|
| 457 |
+
|
| 458 |
+
### Custom Legend Entries
|
| 459 |
+
|
| 460 |
+
```python
|
| 461 |
+
from matplotlib.lines import Line2D
|
| 462 |
+
|
| 463 |
+
# Create custom legend handles
|
| 464 |
+
custom_lines = [Line2D([0], [0], color='red', lw=2),
|
| 465 |
+
Line2D([0], [0], color='blue', lw=2, linestyle='--'),
|
| 466 |
+
Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10)]
|
| 467 |
+
|
| 468 |
+
ax.legend(custom_lines, ['Label 1', 'Label 2', 'Label 3'])
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
## Layout and Spacing
|
| 472 |
+
|
| 473 |
+
### Constrained Layout
|
| 474 |
+
|
| 475 |
+
```python
|
| 476 |
+
# Preferred method (automatic adjustment)
|
| 477 |
+
fig, axes = plt.subplots(2, 2, constrained_layout=True)
|
| 478 |
+
```
|
| 479 |
+
|
| 480 |
+
### Tight Layout
|
| 481 |
+
|
| 482 |
+
```python
|
| 483 |
+
# Alternative method
|
| 484 |
+
fig, axes = plt.subplots(2, 2)
|
| 485 |
+
plt.tight_layout(pad=1.5, h_pad=2.0, w_pad=2.0)
|
| 486 |
+
```
|
| 487 |
+
|
| 488 |
+
### Manual Adjustment
|
| 489 |
+
|
| 490 |
+
```python
|
| 491 |
+
# Fine-grained control
|
| 492 |
+
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1,
|
| 493 |
+
hspace=0.3, wspace=0.4)
|
| 494 |
+
```
|
| 495 |
+
|
| 496 |
+
## Professional Publication Style
|
| 497 |
+
|
| 498 |
+
Example configuration for publication-quality figures:
|
| 499 |
+
|
| 500 |
+
```python
|
| 501 |
+
# Publication style configuration
|
| 502 |
+
plt.rcParams.update({
|
| 503 |
+
# Figure
|
| 504 |
+
'figure.figsize': (8, 6),
|
| 505 |
+
'figure.dpi': 100,
|
| 506 |
+
'savefig.dpi': 300,
|
| 507 |
+
'savefig.bbox': 'tight',
|
| 508 |
+
'savefig.pad_inches': 0.1,
|
| 509 |
+
|
| 510 |
+
# Font
|
| 511 |
+
'font.family': 'sans-serif',
|
| 512 |
+
'font.sans-serif': ['Arial', 'Helvetica'],
|
| 513 |
+
'font.size': 11,
|
| 514 |
+
|
| 515 |
+
# Axes
|
| 516 |
+
'axes.labelsize': 12,
|
| 517 |
+
'axes.titlesize': 14,
|
| 518 |
+
'axes.linewidth': 1.5,
|
| 519 |
+
'axes.grid': False,
|
| 520 |
+
'axes.spines.top': False,
|
| 521 |
+
'axes.spines.right': False,
|
| 522 |
+
|
| 523 |
+
# Lines
|
| 524 |
+
'lines.linewidth': 2,
|
| 525 |
+
'lines.markersize': 8,
|
| 526 |
+
|
| 527 |
+
# Ticks
|
| 528 |
+
'xtick.labelsize': 10,
|
| 529 |
+
'ytick.labelsize': 10,
|
| 530 |
+
'xtick.major.size': 6,
|
| 531 |
+
'ytick.major.size': 6,
|
| 532 |
+
'xtick.major.width': 1.5,
|
| 533 |
+
'ytick.major.width': 1.5,
|
| 534 |
+
'xtick.direction': 'in',
|
| 535 |
+
'ytick.direction': 'in',
|
| 536 |
+
|
| 537 |
+
# Legend
|
| 538 |
+
'legend.fontsize': 10,
|
| 539 |
+
'legend.frameon': True,
|
| 540 |
+
'legend.framealpha': 1.0,
|
| 541 |
+
'legend.edgecolor': 'black'
|
| 542 |
+
})
|
| 543 |
+
```
|
| 544 |
+
|
| 545 |
+
## Dark Theme
|
| 546 |
+
|
| 547 |
+
```python
|
| 548 |
+
# Dark background style
|
| 549 |
+
plt.style.use('dark_background')
|
| 550 |
+
|
| 551 |
+
# Or manual configuration
|
| 552 |
+
plt.rcParams.update({
|
| 553 |
+
'figure.facecolor': '#1e1e1e',
|
| 554 |
+
'axes.facecolor': '#1e1e1e',
|
| 555 |
+
'axes.edgecolor': 'white',
|
| 556 |
+
'axes.labelcolor': 'white',
|
| 557 |
+
'text.color': 'white',
|
| 558 |
+
'xtick.color': 'white',
|
| 559 |
+
'ytick.color': 'white',
|
| 560 |
+
'grid.color': 'gray',
|
| 561 |
+
'legend.facecolor': '#1e1e1e',
|
| 562 |
+
'legend.edgecolor': 'white'
|
| 563 |
+
})
|
| 564 |
+
```
|
| 565 |
+
|
| 566 |
+
## Color Accessibility
|
| 567 |
+
|
| 568 |
+
### Colorblind-Friendly Palettes
|
| 569 |
+
|
| 570 |
+
```python
|
| 571 |
+
# Use colorblind-friendly colormaps
|
| 572 |
+
colorblind_friendly = ['viridis', 'plasma', 'cividis']
|
| 573 |
+
|
| 574 |
+
# Colorblind-friendly discrete colors
|
| 575 |
+
cb_colors = ['#0173B2', '#DE8F05', '#029E73', '#CC78BC',
|
| 576 |
+
'#CA9161', '#949494', '#ECE133', '#56B4E9']
|
| 577 |
+
|
| 578 |
+
# Test with simulation tools or use these validated palettes
|
| 579 |
+
```
|
| 580 |
+
|
| 581 |
+
### High Contrast
|
| 582 |
+
|
| 583 |
+
```python
|
| 584 |
+
# Ensure sufficient contrast
|
| 585 |
+
plt.rcParams['axes.edgecolor'] = 'black'
|
| 586 |
+
plt.rcParams['axes.linewidth'] = 2
|
| 587 |
+
plt.rcParams['xtick.major.width'] = 2
|
| 588 |
+
plt.rcParams['ytick.major.width'] = 2
|
| 589 |
+
```
|
.scider/skills/matplotlib/scripts/plot_template.py
ADDED
|
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Matplotlib Plot Template
|
| 4 |
+
|
| 5 |
+
Comprehensive template demonstrating various plot types and best practices.
|
| 6 |
+
Use this as a starting point for creating publication-quality visualizations.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python plot_template.py [--plot-type TYPE] [--style STYLE] [--output FILE]
|
| 10 |
+
|
| 11 |
+
Plot types:
|
| 12 |
+
line, scatter, bar, histogram, heatmap, contour, box, violin, 3d, all
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
|
| 17 |
+
import matplotlib.pyplot as plt
|
| 18 |
+
import numpy as np
|
| 19 |
+
from matplotlib.gridspec import GridSpec
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def set_publication_style():
    """Apply publication-oriented defaults to matplotlib's global rcParams.

    Updates ``plt.rcParams`` in place with a figure size and DPI, the
    resolution and bounding box used when saving, font sizes for general
    text, axis labels, titles, tick labels and legends, and the line widths
    of plotted lines and axes edges. Returns ``None``.
    """
    # Figure geometry and export settings.
    figure_settings = {
        "figure.figsize": (10, 6),
        "figure.dpi": 100,
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
    }
    # Font sizes, from base text down to tick and legend labels.
    text_settings = {
        "font.size": 11,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
    }
    # Stroke widths for data lines and the axes frame.
    stroke_settings = {
        "lines.linewidth": 2,
        "axes.linewidth": 1.5,
    }

    plt.rcParams.update({**figure_settings, **text_settings, **stroke_settings})
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def generate_sample_data(seed=42):
    """Generate the sample datasets used by the demonstration plots.

    Args:
        seed: Value passed to ``np.random.seed`` before any random draws.
            The default of 42 reproduces the data this function has always
            returned. Note that this reseeds NumPy's *global* random state,
            so later ``np.random`` calls elsewhere are affected too.

    Returns:
        dict with the following entries:
            "x": 100 evenly spaced points on [0, 10].
            "y1", "y2": sin(x) and cos(x).
            "scatter_x", "scatter_y": 200 standard-normal samples each.
            "categories": the labels "A" through "E".
            "bar_values": one random integer in [10, 100) per category.
            "hist_data": 1000 samples from a normal(0, 1) distribution.
            "matrix": 10x10 array of uniform values in [0, 1).
            "X", "Y": 100x100 meshgrid over [-3, 3] on both axes.
            "Z": sin of the radial distance sqrt(X**2 + Y**2).
    """
    np.random.seed(seed)

    # Deterministic curves for line plots.
    x = np.linspace(0, 10, 100)
    y1 = np.sin(x)
    y2 = np.cos(x)

    # Random draws; the order of these calls determines the values produced
    # for a given seed, so it must not be rearranged.
    scatter_x = np.random.randn(200)
    scatter_y = np.random.randn(200)
    categories = ["A", "B", "C", "D", "E"]
    bar_values = np.random.randint(10, 100, len(categories))
    hist_data = np.random.normal(0, 1, 1000)
    matrix = np.random.rand(10, 10)

    # Radially symmetric surface for contour / 3D style plots.
    X, Y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
    Z = np.sin(np.sqrt(X**2 + Y**2))

    return {
        "x": x,
        "y1": y1,
        "y2": y2,
        "scatter_x": scatter_x,
        "scatter_y": scatter_y,
        "categories": categories,
        "bar_values": bar_values,
        "hist_data": hist_data,
        "matrix": matrix,
        "X": X,
        "Y": Y,
        "Z": Z,
    }
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def create_line_plot(data, ax=None):
    """Create a two-series line plot with labels, legend and a light grid.

    Args:
        data: Mapping providing "x", "y1" and "y2"; "y1" is plotted as a
            solid line labelled "sin(x)" with a marker every 10th point,
            "y2" as a dashed line labelled "cos(x)".
        ax: Axes to draw on. When ``None``, a new 10x6 figure using
            constrained layout is created.

    Returns:
        The newly created figure when ``ax`` was ``None``; otherwise the
        axes that was passed in.
    """
    # Remember whether we created the figure: `ax` is rebound below, so it
    # cannot be re-tested at the end to decide what to return.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    ax.plot(
        data["x"], data["y1"], label="sin(x)", linewidth=2, marker="o", markevery=10, markersize=6
    )
    ax.plot(data["x"], data["y2"], label="cos(x)", linewidth=2, linestyle="--")

    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("Line Plot Example")
    ax.legend(loc="best", framealpha=0.9)
    ax.grid(True, alpha=0.3, linestyle="--")

    # Remove top and right spines for cleaner look
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    return fig if fig is not None else ax
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def create_scatter_plot(data, ax=None):
    """Create a scatter plot whose colour and marker size vary per point.

    Points are coloured by their Euclidean distance from the origin (with a
    labelled colorbar) and sized by ``50 * (1 + |x|)``.

    Args:
        data: Mapping providing "scatter_x" and "scatter_y" arrays.
        ax: Axes to draw on. When ``None``, a new 10x6 figure using
            constrained layout is created.

    Returns:
        The newly created figure when ``ax`` was ``None``; otherwise the
        axes that was passed in.
    """
    # Remember whether we created the figure: `ax` is rebound below, so it
    # cannot be re-tested at the end to decide what to return.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # Color based on distance from origin
    colors = np.sqrt(data["scatter_x"] ** 2 + data["scatter_y"] ** 2)
    sizes = 50 * (1 + np.abs(data["scatter_x"]))

    scatter = ax.scatter(
        data["scatter_x"],
        data["scatter_y"],
        c=colors,
        s=sizes,
        alpha=0.6,
        cmap="viridis",
        edgecolors="black",
        linewidth=0.5,
    )

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_title("Scatter Plot Example")
    ax.grid(True, alpha=0.3, linestyle="--")

    # Add colorbar
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label("Distance from origin")

    return fig if fig is not None else ax
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def create_bar_chart(data, ax=None):
    """Create a bar chart with error bars and value-dependent bar colours.

    Error bar heights are random integers in [5, 15) drawn from NumPy's
    global random state at call time, so they depend on that state. Each
    bar's face colour is taken from the viridis colormap according to its
    value relative to the largest value.

    Args:
        data: Mapping providing "categories" (tick labels) and "bar_values"
            (a NumPy array of bar heights; ``.max()`` and array division
            are used on it).
        ax: Axes to draw on. When ``None``, a new 10x6 figure using
            constrained layout is created.

    Returns:
        The newly created figure when ``ax`` was ``None``; otherwise the
        axes that was passed in.
    """
    # Remember whether we created the figure: `ax` is rebound below, so it
    # cannot be re-tested at the end to decide what to return.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    x_pos = np.arange(len(data["categories"]))
    errors = np.random.randint(5, 15, len(data["categories"]))

    bars = ax.bar(
        x_pos,
        data["bar_values"],
        yerr=errors,
        color="steelblue",
        edgecolor="black",
        linewidth=1.5,
        capsize=5,
        alpha=0.8,
    )

    # Color bars by value (overrides the uniform "steelblue" set above)
    colors = plt.cm.viridis(data["bar_values"] / data["bar_values"].max())
    for bar, color in zip(bars, colors):
        bar.set_facecolor(color)

    ax.set_xlabel("Category")
    ax.set_ylabel("Values")
    ax.set_title("Bar Chart Example")
    ax.set_xticks(x_pos)
    ax.set_xticklabels(data["categories"])
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    # Remove top and right spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    return fig if fig is not None else ax
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def create_histogram(data, ax=None):
    """Create histogram with density overlay.

    Args:
        data: Mapping providing "hist_data", an array of samples (it must
            support ``.min()`` and ``.max()``).
        ax: Axes to draw on. When omitted, a new 10x6 inch figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise ``ax``.
    """
    # Record ownership *before* ``ax`` is rebound: once plt.subplots() has
    # assigned it, an ``ax is None`` check at the end can never be true.
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # density=True normalizes the bars so they share a scale with the PDF
    # drawn below.
    ax.hist(
        data["hist_data"], bins=30, density=True, alpha=0.7, edgecolor="black", color="steelblue"
    )

    # Overlay theoretical normal distribution
    from scipy.stats import norm

    mu, std = norm.fit(data["hist_data"])
    x_theory = np.linspace(data["hist_data"].min(), data["hist_data"].max(), 100)
    ax.plot(
        x_theory,
        norm.pdf(x_theory, mu, std),
        "r-",
        linewidth=2,
        label=f"Normal fit (μ={mu:.2f}, σ={std:.2f})",
    )

    ax.set_xlabel("Value")
    ax.set_ylabel("Density")
    ax.set_title("Histogram with Normal Fit")
    ax.legend()
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    if created_figure:
        return fig
    return ax
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def create_heatmap(data, ax=None):
    """Create heatmap with colorbar and annotations.

    Args:
        data: Mapping providing "matrix", the 2D array to display. The color
            scale is fixed to the range 0..1.
        ax: Axes to draw on. When omitted, a new 10x8 inch figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise ``ax``.
    """
    # Record ownership *before* ``ax`` is rebound: once plt.subplots() has
    # assigned it, an ``ax is None`` check at the end can never be true.
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)

    im = ax.imshow(data["matrix"], cmap="coolwarm", aspect="auto", vmin=0, vmax=1)

    # Add colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label("Value")

    # Optional: Add text annotations
    # for i in range(data['matrix'].shape[0]):
    #     for j in range(data['matrix'].shape[1]):
    #         text = ax.text(j, i, f'{data["matrix"][i, j]:.2f}',
    #                        ha='center', va='center', color='black', fontsize=8)

    ax.set_xlabel("X Index")
    ax.set_ylabel("Y Index")
    ax.set_title("Heatmap Example")

    if created_figure:
        return fig
    return ax
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def create_contour_plot(data, ax=None):
    """Create contour plot with filled contours and labels.

    Args:
        data: Mapping providing "X", "Y" and "Z", passed straight through to
            ``contourf`` and ``contour``.
        ax: Axes to draw on. When omitted, a new 10x8 inch figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise ``ax``.
    """
    # Record ownership *before* ``ax`` is rebound: once plt.subplots() has
    # assigned it, an ``ax is None`` check at the end can never be true.
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)

    # Filled contours
    contourf = ax.contourf(data["X"], data["Y"], data["Z"], levels=20, cmap="viridis", alpha=0.8)

    # Contour lines (fewer levels than the fill, so the labels stay readable)
    contour = ax.contour(
        data["X"], data["Y"], data["Z"], levels=10, colors="black", linewidths=0.5, alpha=0.4
    )

    # Add labels to contour lines
    ax.clabel(contour, inline=True, fontsize=8)

    # Add colorbar (describes the filled contours)
    cbar = plt.colorbar(contourf, ax=ax)
    cbar.set_label("Z value")

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_title("Contour Plot Example")
    ax.set_aspect("equal")

    if created_figure:
        return fig
    return ax
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def create_box_plot(data, ax=None):
    """Create box plot comparing distributions.

    Args:
        data: Unused; accepted so every plot function shares the same
            ``(data, ax)`` signature. The four groups are generated here.
        ax: Axes to draw on. When omitted, a new 10x6 inch figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise ``ax``.
    """
    # Record ownership *before* ``ax`` is rebound: once plt.subplots() has
    # assigned it, an ``ax is None`` check at the end can never be true.
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # Generate multiple distributions: 100 normal samples each, with
    # standard deviations 1 through 4.
    box_data = [np.random.normal(0, std, 100) for std in range(1, 5)]
    group_names = ["Group 1", "Group 2", "Group 3", "Group 4"]

    ax.boxplot(
        box_data,
        patch_artist=True,
        showmeans=True,
        boxprops=dict(facecolor="lightblue", edgecolor="black"),
        medianprops=dict(color="red", linewidth=2),
        meanprops=dict(marker="D", markerfacecolor="green", markersize=8),
    )

    # Label the boxes through the axes rather than boxplot(labels=...):
    # that keyword is deprecated since Matplotlib 3.9 (renamed tick_labels),
    # while this form works on every version. Boxes sit at positions 1..N.
    ax.set_xticks(range(1, len(group_names) + 1))
    ax.set_xticklabels(group_names)

    ax.set_xlabel("Groups")
    ax.set_ylabel("Values")
    ax.set_title("Box Plot Example")
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    if created_figure:
        return fig
    return ax
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def create_violin_plot(data, ax=None):
    """Create violin plot showing distribution shapes.

    Args:
        data: Unused; accepted so every plot function shares the same
            ``(data, ax)`` signature. The four groups are generated here.
        ax: Axes to draw on. When omitted, a new 10x6 inch figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise ``ax``.
    """
    # Record ownership *before* ``ax`` is rebound: once plt.subplots() has
    # assigned it, an ``ax is None`` check at the end can never be true.
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # Generate multiple distributions: 100 normal samples each, with
    # standard deviations 1 through 4.
    violin_data = [np.random.normal(0, std, 100) for std in range(1, 5)]

    parts = ax.violinplot(violin_data, positions=range(1, 5), showmeans=True, showmedians=True)

    # Customize colors
    for pc in parts["bodies"]:
        pc.set_facecolor("lightblue")
        pc.set_alpha(0.7)
        pc.set_edgecolor("black")

    ax.set_xlabel("Groups")
    ax.set_ylabel("Values")
    ax.set_title("Violin Plot Example")
    ax.set_xticks(range(1, 5))
    ax.set_xticklabels(["Group 1", "Group 2", "Group 3", "Group 4"])
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    if created_figure:
        return fig
    return ax
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def create_3d_plot():
    """Build a standalone figure with a 3D surface of sin(sqrt(x^2 + y^2)).

    The surface is sampled on a 50x50 grid covering [-5, 5] on both axes.

    Returns:
        The Figure that holds the single 3D axes.
    """
    # Imported only for its side effect; presumably needed on older
    # Matplotlib to register the "3d" projection -- TODO confirm.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    figure = plt.figure(figsize=(12, 9))
    axes3d = figure.add_subplot(111, projection="3d")

    # Both axes share the same sample points.
    samples = np.linspace(-5, 5, 50)
    grid_x, grid_y = np.meshgrid(samples, samples)
    height = np.sin(np.sqrt(grid_x**2 + grid_y**2))

    surface = axes3d.plot_surface(
        grid_x, grid_y, height, cmap="viridis", edgecolor="none", alpha=0.9
    )
    figure.colorbar(surface, ax=axes3d, shrink=0.5)

    axes3d.set_xlabel("X")
    axes3d.set_ylabel("Y")
    axes3d.set_zlabel("Z")
    axes3d.set_title("3D Surface Plot Example")

    # Camera position: 30 degrees elevation, 45 degrees azimuth.
    axes3d.view_init(elev=30, azim=45)

    plt.tight_layout()
    return figure
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def create_comprehensive_figure():
    """Create a comprehensive figure with multiple subplots.

    Seven panels are laid out on a 3x3 grid; the line plot and the contour
    plot each span two columns.

    Returns:
        The assembled Figure.
    """
    data = generate_sample_data()

    figure = plt.figure(figsize=(16, 12), constrained_layout=True)
    grid = GridSpec(3, 3, figure=figure)

    # (grid cell, drawing function) pairs, listed in drawing order.
    panels = (
        (grid[0, :2], create_line_plot),  # top left, spans 2 columns
        (grid[0, 2], create_bar_chart),  # top right
        (grid[1, 0], create_scatter_plot),  # middle left
        (grid[1, 1], create_histogram),  # middle center
        (grid[1, 2], create_box_plot),  # middle right
        (grid[2, :2], create_contour_plot),  # bottom left, spans 2 columns
        (grid[2, 2], create_heatmap),  # bottom right
    )
    for cell, draw in panels:
        draw(data, figure.add_subplot(cell))

    figure.suptitle("Comprehensive Matplotlib Template", fontsize=18, fontweight="bold")

    return figure
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
def main():
    """Parse the command line, draw the requested plot, save it and show it."""
    # Plot types that share the (data) call signature; "3d" and "all" are
    # handled separately below because their builders take no arguments.
    plot_functions = {
        "line": create_line_plot,
        "scatter": create_scatter_plot,
        "bar": create_bar_chart,
        "histogram": create_histogram,
        "heatmap": create_heatmap,
        "contour": create_contour_plot,
        "box": create_box_plot,
        "violin": create_violin_plot,
    }

    parser = argparse.ArgumentParser(description="Matplotlib plot template")
    parser.add_argument(
        "--plot-type",
        type=str,
        default="all",
        choices=[*plot_functions, "3d", "all"],
        help="Type of plot to create",
    )
    parser.add_argument("--style", type=str, default="default", help="Matplotlib style to use")
    parser.add_argument("--output", type=str, default="plot.png", help="Output filename")
    args = parser.parse_args()

    # "default" selects the template's own publication style rather than
    # Matplotlib's built-in default style sheet.
    if args.style == "default":
        set_publication_style()
    else:
        plt.style.use(args.style)

    data = generate_sample_data()

    requested = args.plot_type
    if requested in plot_functions:
        fig = plot_functions[requested](data)
    elif requested == "all":
        fig = create_comprehensive_figure()
    else:
        fig = create_3d_plot()

    # plt.savefig writes the current figure, i.e. the one just created.
    plt.savefig(args.output, dpi=300, bbox_inches="tight")
    print(f"Plot saved to {args.output}")

    plt.show()
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
.scider/skills/matplotlib/scripts/style_configurator.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Matplotlib Style Configurator
|
| 4 |
+
|
| 5 |
+
Interactive utility to configure matplotlib style preferences and generate
|
| 6 |
+
custom style sheets. Creates a preview of the style and optionally saves
|
| 7 |
+
it as a .mplstyle file.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python style_configurator.py [--preset PRESET] [--output FILE] [--preview]
|
| 11 |
+
|
| 12 |
+
Presets:
|
| 13 |
+
publication, presentation, web, dark, minimal
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
import numpy as np
|
| 21 |
+
from matplotlib.gridspec import GridSpec
|
| 22 |
+
|
| 23 |
+
# Predefined style presets.
# Maps a preset name to a dict of Matplotlib rcParams overrides. A preset only
# lists the keys it changes; every other rcParam keeps its current value.
STYLE_PRESETS = {
    # Sans-serif fonts, 300 dpi saved output, no grid, no top/right spines,
    # inward ticks and an opaque black-edged legend.
    "publication": {
        "figure.figsize": (8, 6),
        "figure.dpi": 100,
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "Helvetica"],
        "font.size": 11,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "axes.linewidth": 1.5,
        "axes.grid": False,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "lines.linewidth": 2,
        "lines.markersize": 8,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "xtick.direction": "in",
        "ytick.direction": "in",
        "xtick.major.size": 6,
        "ytick.major.size": 6,
        "xtick.major.width": 1.5,
        "ytick.major.width": 1.5,
        "legend.fontsize": 10,
        "legend.frameon": True,
        "legend.framealpha": 1.0,
        "legend.edgecolor": "black",
    },
    # Larger figure, fonts, lines and markers, with a faint grid.
    "presentation": {
        "figure.figsize": (12, 8),
        "figure.dpi": 100,
        "savefig.dpi": 150,
        "font.size": 16,
        "axes.labelsize": 20,
        "axes.titlesize": 24,
        "axes.linewidth": 2,
        "lines.linewidth": 3,
        "lines.markersize": 12,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "legend.fontsize": 16,
        "axes.grid": True,
        "grid.alpha": 0.3,
    },
    # 96 dpi on screen, 150 dpi when saved, light dashed grid.
    "web": {
        "figure.figsize": (10, 6),
        "figure.dpi": 96,
        "savefig.dpi": 150,
        "font.size": 11,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "lines.linewidth": 2,
        "axes.grid": True,
        "grid.alpha": 0.2,
        "grid.linestyle": "--",
    },
    # Colors only: dark (#1e1e1e) backgrounds with white text, ticks and
    # axes edges. Sizes and fonts are left untouched.
    "dark": {
        "figure.facecolor": "#1e1e1e",
        "figure.edgecolor": "#1e1e1e",
        "axes.facecolor": "#1e1e1e",
        "axes.edgecolor": "white",
        "axes.labelcolor": "white",
        "text.color": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "grid.color": "gray",
        "grid.alpha": 0.3,
        "axes.grid": True,
        "legend.facecolor": "#1e1e1e",
        "legend.edgecolor": "white",
        "savefig.facecolor": "#1e1e1e",
    },
    # All four spines hidden and no grid; bottom/left ticks stay enabled.
    "minimal": {
        "figure.figsize": (10, 6),
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.spines.left": False,
        "axes.spines.bottom": False,
        "axes.grid": False,
        "xtick.bottom": True,
        "ytick.left": True,
        "axes.axisbelow": True,
        "lines.linewidth": 2.5,
        "font.size": 12,
    },
}
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def generate_preview_data():
    """Build the fixed sample data set used by the style preview.

    The global NumPy random generator is re-seeded with 42 on every call, so
    the returned arrays are identical from run to run.

    Returns:
        dict with keys "x", "y1", "y2" (noisy sine/cosine over 0..10),
        "scatter_x", "scatter_y" (a noisy y = 2x cloud), "categories" and
        "bar_values" (five labelled bar heights).
    """
    np.random.seed(42)
    count = 100

    # The order of the four randn() draws determines the values produced.
    grid = np.linspace(0, 10, count)
    noisy_sin = np.sin(grid) + 0.1 * np.random.randn(count)
    noisy_cos = np.cos(grid) + 0.1 * np.random.randn(count)
    cloud_x = np.random.randn(count)
    cloud_y = 2 * cloud_x + np.random.randn(count)

    return dict(
        x=grid,
        y1=noisy_sin,
        y2=noisy_cos,
        scatter_x=cloud_x,
        scatter_y=cloud_y,
        categories=["A", "B", "C", "D", "E"],
        bar_values=[25, 40, 30, 55, 45],
    )
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def create_style_preview(style_dict=None):
    """Render a 2x2 sample figure so a style can be judged at a glance.

    Args:
        style_dict: Optional rcParams overrides. When non-empty they are
            written into the global ``plt.rcParams`` before drawing and are
            not restored afterwards.

    Returns:
        The preview Figure (line plot, scatter plot, bar chart and a
        two-signal plot with shaded bands).
    """
    if style_dict:
        plt.rcParams.update(style_dict)

    sample = generate_preview_data()
    xs = sample["x"]
    sig1 = sample["y1"]
    sig2 = sample["y2"]

    figure = plt.figure(figsize=(14, 10))
    layout = GridSpec(2, 2, figure=figure, hspace=0.3, wspace=0.3)

    # Top left: line plot
    line_ax = figure.add_subplot(layout[0, 0])
    line_ax.plot(xs, sig1, label="sin(x)", marker="o", markevery=10)
    line_ax.plot(xs, sig2, label="cos(x)", linestyle="--")
    line_ax.set_xlabel("X axis")
    line_ax.set_ylabel("Y axis")
    line_ax.set_title("Line Plot")
    line_ax.legend()
    line_ax.grid(True, alpha=0.3)

    # Top right: scatter plot, colored by distance from the origin
    scatter_ax = figure.add_subplot(layout[0, 1])
    pts_x = sample["scatter_x"]
    pts_y = sample["scatter_y"]
    distance = np.sqrt(pts_x ** 2 + pts_y ** 2)
    points = scatter_ax.scatter(pts_x, pts_y, c=distance, cmap="viridis", alpha=0.6, s=50)
    scatter_ax.set_xlabel("X axis")
    scatter_ax.set_ylabel("Y axis")
    scatter_ax.set_title("Scatter Plot")
    colorbar = plt.colorbar(points, ax=scatter_ax)
    colorbar.set_label("Distance")
    scatter_ax.grid(True, alpha=0.3)

    # Bottom left: bar chart with a viridis gradient across the bars
    bar_ax = figure.add_subplot(layout[1, 0])
    rects = bar_ax.bar(
        sample["categories"], sample["bar_values"], edgecolor="black", linewidth=1
    )
    gradient = plt.cm.viridis(np.linspace(0.2, 0.8, len(rects)))
    for rect, shade in zip(rects, gradient):
        rect.set_facecolor(shade)
    bar_ax.set_xlabel("Categories")
    bar_ax.set_ylabel("Values")
    bar_ax.set_title("Bar Chart")
    bar_ax.grid(True, axis="y", alpha=0.3)

    # Bottom right: two signals, each with a fixed +/-0.2 shaded band
    band_ax = figure.add_subplot(layout[1, 1])
    band_ax.plot(xs, sig1, label="Signal 1", linewidth=2)
    band_ax.fill_between(xs, sig1 - 0.2, sig1 + 0.2, alpha=0.3, label="±1 std")
    band_ax.plot(xs, sig2, label="Signal 2", linewidth=2)
    band_ax.fill_between(xs, sig2 - 0.2, sig2 + 0.2, alpha=0.3)
    band_ax.set_xlabel("X axis")
    band_ax.set_ylabel("Y axis")
    band_ax.set_title("Time Series with Uncertainty")
    band_ax.legend()
    band_ax.grid(True, alpha=0.3)

    figure.suptitle("Style Preview", fontsize=16, fontweight="bold")

    return figure
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _format_rc_value(value):
    """Render one rcParams value as text suitable for a .mplstyle file."""
    if isinstance(value, (list, tuple)):
        return ", ".join(_format_rc_value(item) for item in value)

    text = str(value)
    if isinstance(value, str) and text.startswith("#"):
        # "#" starts a comment in style files, so "#1e1e1e" would be read
        # back as an empty value. Matplotlib accepts hex colors without the
        # hash, but only in 6- or 8-digit form, so short forms are expanded.
        digits = text[1:]
        is_hex = all(ch in "0123456789abcdefABCDEF" for ch in digits)
        if is_hex and len(digits) in (3, 4):
            digits = "".join(ch * 2 for ch in digits)
        return digits
    return text


def save_style_file(style_dict, filename):
    """Save style dictionary as .mplstyle file.

    Settings are grouped under comment headings by key prefix and sorted by
    key within each group. Keys that match no known prefix are written under
    a final "Other" heading rather than being dropped.

    Args:
        style_dict: Mapping of rcParams keys to values. Lists and tuples are
            written comma-separated; hex colors are written without "#".
        filename: Path of the file to create or overwrite.
    """
    # Group settings by category
    categories = {
        "Figure": ("figure.",),
        "Font": ("font.",),
        "Axes": ("axes.",),
        "Lines": ("lines.",),
        "Markers": ("markers.",),
        "Ticks": ("tick.", "xtick.", "ytick."),
        "Grid": ("grid.",),
        "Legend": ("legend.",),
        "Savefig": ("savefig.",),
        "Text": ("text.",),
    }

    sections = [
        (title, {k: v for k, v in style_dict.items() if k.startswith(prefixes)})
        for title, prefixes in categories.items()
    ]
    known_prefixes = tuple(p for prefixes in categories.values() for p in prefixes)
    leftovers = {k: v for k, v in style_dict.items() if not k.startswith(known_prefixes)}
    sections.append(("Other", leftovers))

    with open(filename, "w", encoding="utf-8") as f:
        f.write("# Custom matplotlib style\n")
        f.write("# Generated by style_configurator.py\n\n")

        for title, items in sections:
            if not items:
                continue
            f.write(f"# {title}\n")
            for key, value in sorted(items.items()):
                f.write(f"{key}: {_format_rc_value(value)}\n")
            f.write("\n")

    print(f"Style saved to {filename}")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def print_style_info(style_dict):
    """Print information about the style.

    Settings are grouped by key prefix and sorted by key within each group.
    Groups with no settings are skipped; keys matching no known prefix are
    listed under "Other Settings" so that nothing in the style is hidden.

    Args:
        style_dict: Mapping of rcParams keys to values.
    """
    print("\n" + "=" * 60)
    print("STYLE CONFIGURATION")
    print("=" * 60)

    # Covers the same prefixes as the groups written by save_style_file, so
    # tick, savefig and text settings are reported too.
    categories = {
        "Figure Settings": ("figure.",),
        "Font Settings": ("font.",),
        "Axes Settings": ("axes.",),
        "Line Settings": ("lines.",),
        "Marker Settings": ("markers.",),
        "Tick Settings": ("tick.", "xtick.", "ytick."),
        "Grid Settings": ("grid.",),
        "Legend Settings": ("legend.",),
        "Savefig Settings": ("savefig.",),
        "Text Settings": ("text.",),
    }

    sections = [
        (title, {k: v for k, v in style_dict.items() if k.startswith(prefixes)})
        for title, prefixes in categories.items()
    ]
    known_prefixes = tuple(p for prefixes in categories.values() for p in prefixes)
    leftovers = {k: v for k, v in style_dict.items() if not k.startswith(known_prefixes)}
    sections.append(("Other Settings", leftovers))

    for title, items in sections:
        if not items:
            continue
        print(f"\n{title}:")
        for key, value in sorted(items.items()):
            print(f" {key}: {value}")

    print("\n" + "=" * 60 + "\n")
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def list_available_presets():
    """Print the name and a one-line description of each style preset."""
    rule = "-" * 40
    summaries = (
        ("publication", "Optimized for academic publications"),
        ("presentation", "Large fonts for presentations"),
        ("web", "Optimized for web display"),
        ("dark", "Dark background theme"),
        ("minimal", "Minimal, clean style"),
    )

    print("\nAvailable style presets:")
    print(rule)
    for name, blurb in summaries:
        # Names are padded to 15 characters so the descriptions line up.
        print(f" {name:15s} - {blurb}")
    print(rule + "\n")
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _prompt_float(prompt, default):
    """Ask until the reply parses as a float; an empty reply means *default*."""
    while True:
        reply = input(prompt).strip() or default
        try:
            return float(reply)
        except ValueError:
            print(f" '{reply}' is not a number, please try again.")


def interactive_mode():
    """Run interactive mode to customize style settings.

    Prompts on stdin for a starting preset and then loops over a menu of
    common settings until the user picks option 6.

    Returns:
        dict of rcParams overrides: a copy of the chosen preset (or an empty
        dict) updated with whatever the user changed.
    """
    print("\n" + "=" * 60)
    print("MATPLOTLIB STYLE CONFIGURATOR - Interactive Mode")
    print("=" * 60)

    list_available_presets()

    preset = input("Choose a preset to start from (or 'custom' for default): ").strip().lower()

    if preset in STYLE_PRESETS:
        # Copy so that edits made below never leak into the shared preset.
        style_dict = STYLE_PRESETS[preset].copy()
        print(f"\nStarting from '{preset}' preset")
    else:
        style_dict = {}
        print("\nStarting from default matplotlib style")

    print("\nCommon settings you might want to customize:")
    print(" 1. Figure size")
    print(" 2. Font sizes")
    print(" 3. Line widths")
    print(" 4. Grid settings")
    print(" 5. Color scheme")
    print(" 6. Done, show preview")

    while True:
        choice = input("\nSelect option (1-6): ").strip()

        if choice == "1":
            # Numeric prompts re-ask on invalid input instead of raising
            # ValueError, which would discard everything configured so far.
            width = _prompt_float(" Figure width (inches, default 10): ", "10")
            height = _prompt_float(" Figure height (inches, default 6): ", "6")
            style_dict["figure.figsize"] = (width, height)

        elif choice == "2":
            base = _prompt_float(" Base font size (default 12): ", "12")
            # Axis labels and titles scale with the base size.
            style_dict["font.size"] = base
            style_dict["axes.labelsize"] = base + 2
            style_dict["axes.titlesize"] = base + 4

        elif choice == "3":
            style_dict["lines.linewidth"] = _prompt_float(" Line width (default 2): ", "2")

        elif choice == "4":
            grid = input(" Enable grid? (y/n): ").strip().lower()
            style_dict["axes.grid"] = grid == "y"
            if style_dict["axes.grid"]:
                style_dict["grid.alpha"] = _prompt_float(
                    " Grid transparency (0-1, default 0.3): ", "0.3"
                )

        elif choice == "5":
            print(" Theme options: 1=Light, 2=Dark")
            theme = input(" Select theme (1-2): ").strip()
            # Only the dark theme changes anything; any other answer keeps
            # the colors that are already configured.
            if theme == "2":
                style_dict.update(STYLE_PRESETS["dark"])

        elif choice == "6":
            break

        else:
            # Tell the user why nothing happened instead of silently looping.
            print(" Please enter a number from 1 to 6.")

    return style_dict
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def main():
    """Command-line entry point for the style configurator.

    Depending on the flags this lists the presets, builds a style dictionary
    (from a preset or interactively), prints it, optionally saves it as a
    style file, and optionally renders a preview figure.
    """
    parser = argparse.ArgumentParser(
        description="Matplotlib style configurator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Show available presets
python style_configurator.py --list

# Preview a preset
python style_configurator.py --preset publication --preview

# Save a preset as .mplstyle file
python style_configurator.py --preset publication --output my_style.mplstyle

# Interactive mode
python style_configurator.py --interactive
""",
    )
    parser.add_argument(
        "--preset",
        type=str,
        choices=list(STYLE_PRESETS.keys()),
        help="Use a predefined style preset",
    )
    parser.add_argument("--output", type=str, help="Save style to .mplstyle file")
    parser.add_argument("--preview", action="store_true", help="Show style preview")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--interactive", action="store_true", help="Run in interactive mode")

    args = parser.parse_args()

    if args.list:
        list_available_presets()
        # Also show currently available matplotlib styles
        print("\nBuilt-in matplotlib styles:")
        print("-" * 40)
        for style in sorted(plt.style.available):
            print(f" {style}")
        return

    if args.interactive:
        style_dict = interactive_mode()
    elif args.preset:
        # Copy so the shared preset dictionary is never modified.
        style_dict = STYLE_PRESETS[args.preset].copy()
        print(f"Using '{args.preset}' preset")
    else:
        # NOTE(review): despite this message, a preview is only drawn when
        # --preview is also given (see the condition below) -- confirm intent.
        print("No preset or interactive mode specified. Showing default preview.")
        style_dict = {}

    if style_dict:
        print_style_info(style_dict)

    if args.output:
        save_style_file(style_dict, args.output)

    if args.preview or args.interactive:
        print("Creating style preview...")
        fig = create_style_preview(style_dict if style_dict else None)

        if args.output:
            # Build the preview name from the output path's stem. A plain
            # str.replace(".mplstyle", ...) returns the path unchanged when
            # the suffix is absent, so the image would overwrite the style
            # file that was just written.
            stem, _ext = os.path.splitext(args.output)
            preview_filename = f"{stem}_preview.png"
            fig.savefig(preview_filename, dpi=150, bbox_inches="tight")
            print(f"Preview saved to {preview_filename}")

        plt.show()
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|