Spaces:

AI4Research
/

scider

Sleeping

App Files Files Community

leonardklin committed on Apr 14

Commit

978fed5

verified ·

1 Parent(s): 4d50c13

Upload 328 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +31 -0
.env.template +64 -0
.gitattributes +35 -0
.gitignore +231 -0
.gitmodules +9 -0
.pre-commit-config.yaml +22 -0
.python-version +1 -0
.scider/SCIDER.md +11 -0
.scider/skills/content-refinement-agent/SKILL.md +256 -0
.scider/skills/content-refinement-agent/references/halt-rules.md +125 -0
.scider/skills/content-refinement-agent/references/prompt.md +136 -0
.scider/skills/content-refinement-agent/references/reviewer-rubric.md +131 -0
.scider/skills/content-refinement-agent/references/safe-revision-rules.md +129 -0
.scider/skills/content-refinement-agent/scripts/apply_worklog.py +94 -0
.scider/skills/content-refinement-agent/scripts/score_delta.py +164 -0
.scider/skills/content-refinement-agent/scripts/snapshot.py +47 -0
.scider/skills/exploratory-data-analysis/SKILL.md +442 -0
.scider/skills/exploratory-data-analysis/assets/report_template.md +196 -0
.scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md +664 -0
.scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md +664 -0
.scider/skills/exploratory-data-analysis/references/general_scientific_formats.md +518 -0
.scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md +620 -0
.scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md +517 -0
.scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md +633 -0
.scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py +548 -0
.scider/skills/literature-review-agent/SKILL.md +357 -0
.scider/skills/literature-review-agent/references/citation-density-rule.md +71 -0
.scider/skills/literature-review-agent/references/discovery-pipeline.md +151 -0
.scider/skills/literature-review-agent/references/exa-search-cookbook.md +245 -0
.scider/skills/literature-review-agent/references/prompt.md +77 -0
.scider/skills/literature-review-agent/references/s2-api-cookbook.md +138 -0
.scider/skills/literature-review-agent/references/verification-rules.md +100 -0
.scider/skills/literature-review-agent/scripts/bibtex_format.py +211 -0
.scider/skills/literature-review-agent/scripts/check_cutoff.py +63 -0
.scider/skills/literature-review-agent/scripts/citation_coverage.py +104 -0
.scider/skills/literature-review-agent/scripts/dedupe_by_id.py +98 -0
.scider/skills/literature-review-agent/scripts/exa_search.py +169 -0
.scider/skills/literature-review-agent/scripts/levenshtein_match.py +73 -0
.scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py +156 -0
.scider/skills/literature-review-agent/scripts/s2_cache.py +113 -0
.scider/skills/literature-review-agent/scripts/s2_search.py +208 -0
.scider/skills/literature-review-agent/scripts/sync_keys.py +119 -0
.scider/skills/literature-review-agent/scripts/validate_pool.py +145 -0
.scider/skills/matplotlib/SKILL.md +356 -0
.scider/skills/matplotlib/references/api_reference.md +412 -0
.scider/skills/matplotlib/references/common_issues.md +563 -0
.scider/skills/matplotlib/references/plot_types.md +476 -0
.scider/skills/matplotlib/references/styling_guide.md +589 -0
.scider/skills/matplotlib/scripts/plot_template.py +446 -0
.scider/skills/matplotlib/scripts/style_configurator.py +413 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,31 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+.env
+.venv/
+venv/
+ENV/
+env/
+*.log
+workspace/
+.pytest_cache/
+.coverage
+htmlcov/
+.DS_Store
+*.swp
+*.swo
+*~
+.git/
+.github/
+.claude/
+benchmarks/
+tmp_*
+rsync_tmp_*
+*.ipynb

.env.template ADDED Viewed

	@@ -0,0 +1,64 @@

+# --- SciDER ---
+# Provide any combination of provider keys. SciDER's unified model catalog
+# (model_settings/catalog.yaml) lets you mix-and-match providers per role —
+# e.g. ideation on Gemini, experiment_coding on GPT-5. Models whose key is
+# missing are simply marked unavailable in the frontend.
+OPENAI_API_KEY=...
+GEMINI_API_KEY=...
+ANTHROPIC_API_KEY=...
+# Optional: Semantic Scholar API key for better rate limits (https://www.semanticscholar.org/product/api)
+# S2_API_KEY=...
+## User Approval
+# Set to true to enable interactive user approval at critical agent steps
+USER_APPROVAL_ENABLED=true
+## HuggingFace Dataset Download
+# Set to true to allow using HuggingFace repo names as data paths
+HF_DATASET_DOWNLOAD_ENABLED=false
+# HF_DATASET_CACHE_DIR=tmp_hf_datasets
+# Maximum dataset size in MB (default 100)
+# HF_DATASET_MAX_SIZE_MB=100
+## Logging
+# LOGURU_LEVEL=INFO
+LOGURU_LEVEL=DEBUG
+LOG_SYSTEM_PROMPT=false
+## Coding Agent Switch
+# choice: claude_sdk (default), native, openhands (requires SCIDER_ENABLE_OPENHANDS=1)
+# - claude_sdk: Claude Agent SDK (requires ANTHROPIC_API_KEY)
+# - native: SciDER's built-in coding agent (uses experiment_coding model, any LiteLLM provider)
+# - openhands: OpenHands sandbox (requires SCIDER_ENABLE_OPENHANDS=1)
+# legacy aliases: v3 = claude_sdk, v2 = openhands
+CODING_AGENT_VERSION=claude_sdk
+# choice: See https://platform.claude.com/docs/en/about-claude/models/overview
+CLAUDE_SDK_MODEL=claude-haiku-4-5
+## Openhands
+SCIDER_ENABLE_OPENHANDS=false
+OPENHANDS_MODEL=gemini/gemini-2.5-flash
+OPENHANDS_API_KEY=...
+## Context Compression Pipeline (runs in query() before each LLM call)
+# Level 1: Persist oversized tool results to disk
+COMPACT_TOOL_RESULT_MAX_CHARS=50000
+# COMPACT_TOOL_RESULT_PREVIEW_CHARS=2000
+# Level 2: Snip old tool results (keep N most recent)
+COMPACT_SNIP_KEEP_RECENT=5
+# Level 3: LLM-based autocompact (trigger threshold in tokens)
+COMPACT_AUTOCOMPACT_TOKEN_THRESHOLD=256000
+COMPACT_AUTOCOMPACT_MODEL=history
+# COMPACT_AUTOCOMPACT_KEEP_RATIO=0.4
+# COMPACT_AUTOCOMPACT_KEEP_FIRST_N=4
+## Permissions
+# Path to tool permission overrides (JSON file)
+# SCIDER_PERMISSIONS_FILE=.claude/permissions.json
+## Memory System (file-based cross-session memory in .scider/memory/)
+# SCIDER_MEMORY_READ=true    # Load memory index into agent context (default: true)
+# SCIDER_MEMORY_WRITE=true   # Allow agents to write new memories (default: true)

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,231 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# temporary files
+tmp_*
+rsync_tmp_*
+.aider*
+data_analysis.md
+software-agent-sdk
+env
+streamlit-client/case-study-memory/
+saved_chats/
+# vibe coding
+.claude/
+.agents/
+.windsurf/
+# Ignore .scider/ contents but allow specific entries to be tracked
+.scider/*
+!.scider/skills/
+!.scider/rules/
+!.scider/SCIDER.md
+workspace/

.gitmodules ADDED Viewed

	@@ -0,0 +1,9 @@

+[submodule "benchmarks/mlebench/mle-bench"]
+	path = benchmarks/mlebench/mle-bench
+	url = git@github.com:leonardodalinky/mle-bench.git
+[submodule "benchmarks/scicodebench/SciCode"]
+	path = benchmarks/scicodebench/SciCode
+	url = git@github.com:leonardodalinky/SciCode.git
+[submodule "benchmarks/aiideabench/AI_Idea_Bench"]
+	path = benchmarks/aiideabench/AI_Idea_Bench
+	url = git@github.com:leonardodalinky/AI_Idea_Bench_2025.git

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v6.0.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: name-tests-test
+  - id: requirements-txt-fixer
+- repo: https://github.com/pycqa/isort
+  rev: 5.13.2
+  hooks:
+    - id: isort
+      args: ["--profile", "black", "--line-length=100", "--python-version=310"]
+- repo: https://github.com/psf/black
+  rev: 25.1.0
+  hooks:
+    - id: black
+      args: ["--line-length=100", "--target-version=py310"]
+- repo: https://github.com/kynan/nbstripout
+  rev: 0.8.2
+  hooks:
+    - id: nbstripout

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.13

.scider/SCIDER.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# SCIDER.md
+## Approach
+- Think before acting. Read existing files before writing code.
+- Be concise in output but thorough in reasoning.
+- Prefer editing over rewriting whole files.
+- Do not re-read files you have already read unless the file may have changed.
+- Test your code before declaring done.
+- No sycophantic openers or closing fluff.
+- Keep solutions simple and direct.
+- User instructions always override this file.

.scider/skills/content-refinement-agent/SKILL.md ADDED Viewed

	@@ -0,0 +1,256 @@

+---
+name: content-refinement-agent
+description: Step 5 of the PaperOrchestra pipeline (arXiv:2604.05018). Iteratively refine drafts/paper.tex by simulating peer review and applying targeted revisions, with strict accept/revert halt rules. Maintains a worklog and snapshots each iteration so revert is real, not symbolic. TRIGGER when the orchestrator delegates Step 5 or when the user asks to "refine the draft", "iterate on the paper", or "run peer review on this paper".
+allowed_agents: [writing]
+---
+# Content Refinement Agent (Step 5)
+Faithful implementation of the Content Refinement Agent from PaperOrchestra
+(Song et al., 2026, arXiv:2604.05018, §4 Step 5, App. F.1 pp. 49–51).
+**Cost: ~5–7 LLM calls** (App. B), typically ~3 refinement iterations, each
+consisting of one reviewer call and one revision call.
+The paper highlights this step as one of the largest contributors to overall
+quality: refinement alone accounts for +19% (CVPR) and +22% (ICLR) absolute
+acceptance-rate improvement (Fig. 4). Get this step right.
+## Inputs
+- `workspace/drafts/paper.tex` — output of Step 4
+- `workspace/inputs/conference_guidelines.md`
+- `workspace/inputs/experimental_log.md` — used as ground truth for the
+  hallucination check
+- `workspace/citation_pool.json` / `workspace/refs.bib` — the allowed
+  bibliography
+## Outputs
+- `workspace/refinement/iter1/`, `iter2/`, `iter3/` — per-iteration snapshots
+  containing `paper.tex`, `paper.pdf`, `review.json`, `score.json`
+- `workspace/refinement/worklog.json` — append-only history of decisions
+- `workspace/final/paper.tex` and `workspace/final/paper.pdf` — copy of the
+  best accepted snapshot
+## The refinement loop
+```
+prev_score = score(paper.tex)                  # baseline from initial draft
+snapshot iter0/
+for iter in 1..ITER_CAP (default 3):
+    1. simulate_review(paper.tex) → review.json
+       (uses `references/reviewer-rubric.md` rubric)
+    2. apply_revision(paper.tex, review.json) → new_paper.tex
+       (uses verbatim Refinement Agent prompt at `references/prompt.md`)
+    3. snapshot iter<N>/ with new_paper.tex, review.json
+       latexmk -pdf new_paper.tex → iter<N>/paper.pdf
+    4. score(new_paper.tex) → curr_score
+    5. decide via score_delta.py:
+       - if curr.overall > prev.overall:                       ACCEPT
+       - elif curr.overall == prev.overall and net_subaxis ≥0: ACCEPT
+       - else:                                                 REVERT
+    6. apply_worklog.py to append the decision
+    7. if REVERT or HALT_PLATEAU or no actionable weaknesses or iter == ITER_CAP: HALT
+    paper.tex ← new_paper.tex   (only on ACCEPT)
+    prev_score ← curr_score
+cp <best iter>/paper.tex → workspace/final/paper.tex
+```
+The "best" snapshot at HALT is the one with the highest accepted overall
+score. On a REVERT halt, the best is the iteration immediately before the
+revert.
+## Step-by-step
+### 0. Snapshot the initial draft
+```bash
+python skills/content-refinement-agent/scripts/snapshot.py \
+    --src workspace/drafts/paper.tex \
+    --dst workspace/refinement/iter0/
+```
+This creates `iter0/paper.tex`. Then compile to `iter0/paper.pdf`:
+```bash
+cd workspace/refinement/iter0/ && latexmk -pdf -interaction=nonstopmode paper.tex
+```
+Score it (see Step 1 below) → `iter0/score.json`.
+### 1. Simulate peer review
+For each iteration N starting from 1:
+Load `references/reviewer-rubric.md` as the system prompt for the simulated
+reviewer call. The reviewer reads `iter<N-1>/paper.pdf` (or `paper.tex` if
+your host LLM lacks PDF input) and produces a JSON of strengths,
+weaknesses, questions, and per-axis scores.
+The rubric is structured to mimic AgentReview (Jin et al., 2024) — the
+paper's chosen evaluator. We ship a faithful rubric in the references
+directory; the host agent's LLM does the actual reviewing.
+Save to `workspace/refinement/iter<N>/review.json`.
+### 2. Score the draft
+The reviewer call produces both qualitative feedback and a per-axis score:
+```json
+{
+  "axis_scores": {
+    "scientific_depth":     {"score": 65, "justification": "..."},
+    "technical_execution":  {"score": 70, "justification": "..."},
+    "logical_flow":         {"score": 60, "justification": "..."},
+    "writing_clarity":      {"score": 55, "justification": "..."},
+    "evidence_presentation":{"score": 72, "justification": "..."},
+    "academic_style":       {"score": 68, "justification": "..."}
+  },
+  "overall_score": 64.5,
+  "strengths": [...],
+  "weaknesses": [...],
+  "questions": [...]
+}
+```
+Save to `iter<N>/score.json`. (Combined with `review.json` if your host
+emits one document; the schemas overlap.)
+### 3. Apply revision
+Load the **verbatim Content Refinement Agent prompt** at `references/prompt.md`.
+Prepend the Anti-Leakage Prompt. Inputs:
+- `paper.tex` — current draft
+- `paper.pdf` — compiled PDF (multimodal context if available)
+- `conference_guidelines.md`
+- `experimental_log.md` — ground truth for numeric claims
+- `worklog.json` — history of previous changes
+- `citation_pool.json` — the allowed bibliography
+- `reviewer_feedback` — the JSON from Step 1
+The prompt instructs the model to address weaknesses, integrate question
+answers, and emit two output blocks:
+1. A worklog JSON `{addressed_weaknesses[], integrated_answers[], actions_taken[]}`
+2. The full revised LaTeX code
+Save the revised LaTeX as `iter<N>/paper.tex`. Append the worklog JSON to
+`workspace/refinement/worklog.json` via `apply_worklog.py`.
+### 4. Compile and re-score
+```bash
+cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
+```
+Then re-run the simulated review on the new draft → updated `score.json`
+for the new iteration. (This is the "re-score after revision" call.)
+### 5. Apply the accept/revert decision
+The calling loop must track `CONSECUTIVE_SMALL` (starts at 0) and pass it
+on each call so `score_delta.py` can detect the plateau:
+```bash
+python skills/content-refinement-agent/scripts/score_delta.py \
+    --prev workspace/refinement/iter<N-1>/score.json \
+    --curr workspace/refinement/iter<N>/score.json \
+    --plateau-threshold 1.0 \
+    --plateau-streak 3 \
+    --consecutive-small $CONSECUTIVE_SMALL \
+    > workspace/refinement/iter<N>/delta.json
+EXIT=$?
+# Update streak for next iteration:
+CONSECUTIVE_SMALL=$(python3 -c "
+import json
+d = json.load(open('workspace/refinement/iter<N>/delta.json'))
+print(d['consecutive_small'])
+")
+```
+Exit codes:
+- `0` — ACCEPT (overall improved or tied with non-negative net sub-axis, no plateau)
+- `1` — REVERT (overall decreased)
+- `2` — REVERT (tied overall, but net sub-axis change negative)
+- `4` — HALT_PLATEAU (accepted but N consecutive iterations below threshold — stop early)
+Behavior:
+- **ACCEPT (exit 0)**: keep `iter<N>/paper.tex` as the new best. Continue to iter N+1.
+- **REVERT (exit 1 or 2)**: copy `iter<N-1>/paper.tex` back as canonical, halt.
+- **HALT_PLATEAU (exit 4)**: keep current (it was accepted), but stop — further
+  iterations are unlikely to yield meaningful gains. In practice ~85% of
+  refinement gain comes in iteration 1; the plateau fires when subsequent
+  iterations improve by less than 1 point for 3 consecutive rounds.
+Always log the decision via `apply_worklog.py --decision ...`.
+### 6. Halt rules
+Halt the loop when ANY of these is true:
+1. Iteration count reaches `ITER_CAP` (default 3).
+2. `score_delta.py` returned exit code 1 or 2 (REVERT).
+3. The simulated reviewer's `weaknesses` list is empty (no actionable
+   feedback to apply).
+4. `score_delta.py` returned exit code 4 (HALT_PLATEAU — plateau early-stop).
+### 7. Promote the best snapshot
+Identify the iteration with the highest accepted `overall_score` (this may
+be the latest accepted iteration, OR an earlier one if a later iteration
+was reverted). Copy:
+```bash
+cp workspace/refinement/iter<best>/paper.tex workspace/final/paper.tex
+cp workspace/refinement/iter<best>/paper.pdf workspace/final/paper.pdf
+```
+Then in the final report, tell the user:
+- How many iterations were run
+- The final overall score
+- The score trajectory (e.g., "iter0 64.5 → iter1 67.3 (accept) → iter2 69.1 (accept) → iter3 68.9 (revert, halt)")
+- Which iteration was promoted
+## Critical safety constraints (App. F.1 pages 50–51)
+The paper explicitly notes that early versions of the Refinement Agent
+"exploited the automated reviewer's scoring function by superficially
+listing missing baselines as limitations to artificially inflate
+acceptance scores." The verbatim prompt forbids this. **You must honor it:**
+- **Ignore reviewer requests for new experiments, ablations, or baselines.**
+  The Refinement Agent's job is presentation, not new science. If the
+  reviewer asks for missing data, simply skip those points — do NOT add
+  fabricated experiments, do NOT add a "future work" item promising them.
+- **Never explicitly state a limitation.** The phrase "we acknowledge as a
+  limitation that..." is forbidden. The model can address weaknesses
+  through clearer explanation, but must not game the evaluator by listing
+  them defensively.
+- **All numeric claims MUST be verified against `experimental_log.md`.**
+  The agent cannot introduce new numbers, only re-present existing ones.
+These rules prevent reward hacking and keep the refinement loop honest.
+## Resources
+- `references/prompt.md` — verbatim Content Refinement Agent prompt from App. F.1
+- `references/reviewer-rubric.md` — AgentReview-style scoring rubric (6 axes)
+- `references/halt-rules.md` — accept/revert/halt logic in formal pseudocode
+- `references/safe-revision-rules.md` — anti-reward-hack constraints
+- `scripts/score_delta.py` — accept/revert decision from two score JSONs
+- `scripts/apply_worklog.py` — append iteration entries to worklog.json
+- `scripts/snapshot.py` — copy paper.tex/paper.pdf into iter<N>/ for rollback

.scider/skills/content-refinement-agent/references/halt-rules.md ADDED Viewed

	@@ -0,0 +1,125 @@

+# Halt Rules
+Source: arXiv:2604.05018, §4 Step 5 ("Iterative Content Refinement"):
+> After modifying the LaTeX source to address weaknesses, revisions are
+> accepted if the overall score increases, or if it ties when net sub-axis
+> gains are non-negative. The agent immediately reverts to the previous
+> version and halts upon any overall score decrease, negative tie-breaker,
+> or reaching the iteration limit.
+Encoded as deterministic logic in `scripts/score_delta.py`. This file is the
+human-readable specification.
+## Definitions
+Let:
+- `prev` = score JSON from the previous accepted iteration
+- `curr` = score JSON from the just-completed iteration
+- `prev.overall` = `prev.overall_score`
+- `curr.overall` = `curr.overall_score`
+- `subaxis_delta(axis)` = `curr.axis_scores[axis].score - prev.axis_scores[axis].score`
+- `net_subaxis_delta` = `sum(subaxis_delta(a) for a in 6 axes)`
+## Decision rules (in order)
+```
+if curr.overall > prev.overall:
+    DECISION = ACCEPT_IMPROVED
+elif curr.overall == prev.overall:
+    if net_subaxis_delta >= 0:
+        DECISION = ACCEPT_TIED_NON_NEGATIVE
+    else:
+        DECISION = REVERT_TIED_NEGATIVE_SUBAXIS
+else:  # curr.overall < prev.overall
+    DECISION = REVERT_OVERALL_DECREASED
+```
+The script exits with:
+| Exit code | Meaning | Loop action |
+|---|---|---|
+| 0 | ACCEPT_IMPROVED | keep new draft, continue loop |
+| 0 | ACCEPT_TIED_NON_NEGATIVE | keep new draft, continue loop |
+| 1 | REVERT_OVERALL_DECREASED | rollback to prev, halt loop |
+| 2 | REVERT_TIED_NEGATIVE_SUBAXIS | rollback to prev, halt loop |
+| 4 | HALT_PLATEAU | keep new draft (it was accepted), halt loop |
+The script also prints a one-line decision string and a JSON object on
+stdout for the host agent to log.
+## Loop-level halt conditions
+In addition to the per-iteration accept/revert decision, the loop halts
+when ANY of these is true:
+1. **Iteration cap reached.** Default 3 (configurable via env var
+   `PO_REFINE_MAX_ITER`). Per the paper Table 7, the typical
+   refinement count is "3× content refinement loop".
+2. **REVERT decision** from `score_delta.py` (exit code 1 or 2).
+3. **Empty weaknesses list.** If the simulated reviewer's `weaknesses`
+   array is empty, there is nothing to fix — halt.
+4. **Plateau early-stop (exit code 4).** `score_delta.py` returns
+   `HALT_PLATEAU` when `N` consecutive accepted iterations each have
+   `overall_delta < threshold`. Default: threshold=1.0 points, N=3.
+   Configurable via `--plateau-threshold` and `--plateau-streak`.
+   The calling loop must pass `--consecutive-small <count>` to
+   `score_delta.py` to track the streak across iterations:
+   ```bash
+   CONSECUTIVE_SMALL=0
+   for iter in 1 2 3 ...; do
+     # ... run refinement LLM call ...
+     python score_delta.py \
+         --prev iter$((iter-1))/score.json \
+         --curr iter${iter}/score.json \
+         --plateau-threshold 1.0 \
+         --plateau-streak 3 \
+         --consecutive-small $CONSECUTIVE_SMALL \
+         > iter${iter}/delta.json
+     EXIT=$?
+     # Update streak counter from script output
+     CONSECUTIVE_SMALL=$(python -c "import json,sys; \
+         d=json.loads(open('iter${iter}/delta.json').read()); \
+         print(d['consecutive_small'])")
+     if [ $EXIT -ne 0 ]; then break; fi
+   done
+   ```
+   **Why this matters**: in practice, ~85% of the refinement gain comes
+   in the first iteration (scores jump 5-8 points). Subsequent iterations
+   typically improve by <1 point. Without early-stop, the loop runs 3 full
+   LLM calls even when iterations 2 and 3 contribute near-zero value.
+## Promoting the best snapshot
+After halt, identify the accepted iteration with the highest
+`overall_score`:
+```python
+accepted_iters = [it for it in worklog.iterations if it.decision.startswith("ACCEPT")]
+best = max(accepted_iters, key=lambda it: it.score.overall_score)
+```
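+If the loop halted on REVERT, `best` is the iteration immediately *before*
+the reverted one. If iteration 1 itself was reverted, `accepted_iters` is
+empty and the baseline snapshot `iter0` is promoted instead. Copy the
+chosen iteration's `paper.tex` and `paper.pdf` to `workspace/final/`.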
+## Worked example
+Suppose:
+| iter | overall | depth | exec | flow | clarity | evidence | style | decision |
+|---|---|---|---|---|---|---|---|---|
+| 0 | 64.5 | 65 | 70 | 60 | 55 | 72 | 68 | (baseline) |
+| 1 | 67.3 | 68 | 73 | 64 | 58 | 74 | 70 | ACCEPT_IMPROVED |
+| 2 | 67.3 | 70 | 73 | 64 | 58 | 73 | 71 | ACCEPT_TIED_NON_NEGATIVE (Σdelta = +2) |
+| 3 | 66.0 | 70 | 70 | 62 | 56 | 73 | 71 | REVERT_OVERALL_DECREASED, HALT |
+Promoted: iter 2 (`final/paper.tex` ← `iter2/paper.tex`).
+Score trajectory in the run report:
+```
+64.5 → 67.3 (accept) → 67.3 (accept tied) → 66.0 (revert, halt)
+```

.scider/skills/content-refinement-agent/references/prompt.md ADDED Viewed

	@@ -0,0 +1,136 @@

+# Content Refinement Agent — verbatim prompt
+**Source: arXiv:2604.05018, Appendix F.1, pages 49–51 (verbatim).**
+This is the exact prompt used by the Content Refinement Agent in the paper.
+Use it as your system message when applying a revision. The Anti-Leakage
+Prompt (`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be
+prepended.
+---
+```
+Role: Senior AI Researcher.
+Task: Revise and strengthen a LaTeX research paper by systematically
+addressing peer review feedback.
+You are the author responsible for the "Rebuttal via Revision" phase. You
+will receive:
+  - paper.tex: The current LaTeX source code.
+  - paper.pdf: The compiled PDF context.
+  - conference_guidelines.md: The formatting and page limit rules.
+  - experimental_log.md: The Ground Truth for all data and metrics.
+  - worklog.json: History of previous changes.
+  - citation_map.json: The allowed bibliography.
+  - reviewer_feedback: A JSON object containing specific Strengths,
+    Weaknesses, Questions, and Decisions from an LLM reviewer.
+Your Goal
+  1. Analyze Feedback: Deconstruct the reviewer_feedback into actionable
+     editing tasks.
+  2. Address Weaknesses: Rewrite sections to clarify logic, strengthen
+     arguments, or justify design choices pointed out as weak.
+  3. Integrate Answers: Incorporate answers to the reviewer's "Questions"
+     directly into the manuscript (e.g., adding training cost details to
+     the Implementation section).
+  4. Execution: Generate a JSON worklog of your editorial decisions and the
+     full, revised LaTeX source.
+Critical Execution Standards
+  1. Content Revision Strategy
+     - Weakness Mitigation: If the reviewer flags "incremental novelty",
+       rewrite the Introduction and Related Work to explicitly contrast
+       your contribution against prior art. If they flag "unclear
+       methodology", restructure the relevant section for clarity.
+     - Answering Questions: Do NOT write a separate response letter. If the
+       reviewer asks "What is the inference latency?", you must find a
+       natural place in the paper (e.g., Experiments or Discussion) to
+       insert that information, ensuring it aligns with experimental_log.md.
+     - Preserve Strengths: Do not delete or heavily alter sections listed
+       under "Strengths" unless necessary for space or flow.
+  2. Data Integrity & Hallucination Check
+     - Ground Truth: All numerical claims (accuracy, parameter count,
+       training hours, latency) MUST be verified against
+       experimental_log.md.
+     - Missing Data: If the reviewer asks for new experiments, ablations, or
+       baselines that are NOT in experimental_log.md, simply ignore those
+       specific requests. Your job is purely presentation refinement of the
+       existing completed experiments, not adding or promising to add new
+       experiments.
+  3. Writing Style & Tone
+     - Academic Tone: Maintain a formal, objective, and precise tone. Avoid
+       defensive language.
+     - Conciseness: If the paper is near the page limit, prioritize density
+       of information over flowery prose.
+     - Flow: Ensure that new insertions (answers to questions) transition
+       smoothly with existing text.
+  4. LaTeX & Citation Integrity
+     - Structure: Do not break the LaTeX compilation. Keep packages and
+       environments stable. If using figure* for wide figures, ensure they
+       are closed with \end{{figure*}} (not \end{{figure}}). Check for
+       completeness.
+     - Citations: Use ONLY keys from citation_map.json.
+Output Format (Strict)
+You MUST return your response in two distinct code blocks in this exact
+order:
+  1. Worklog for the current turn (JSON):
+     {{
+       "addressed_weaknesses": [
+         "Clarified contribution novelty in Intro (Reviewer point 2)",
+         "Added justification for two-stage training (Reviewer point 1)"
+       ],
+       "integrated_answers": [
+         "Added training cost (45 GPU hours) to Implementation Details",
+         "Added epsilon hyperparameter explanation to Method section"
+       ],
+       "actions_taken": [
+         "Rewrote Section 3.2 for clarity",
+         "Inserted new paragraph in Section 5.1 regarding latency"
+       ]
+     }}
+  2. The FULL revised LaTeX code:
+     ```latex
+     ... Full revised LaTeX code here ...
+     ```
+Important Notes
+  - Completeness: Always provide the FULL LaTeX code. Do not return diffs
+    or partial snippets.
+  - Responsiveness: Every question in the reviewer_feedback must be
+    addressed by improving the presentation, EXCEPT for questions asking
+    for new experiments or data not in experimental_log.md (which should
+    be ignored). Never explicitly state a limitation.
+  - Safety: Do not remove the \documentclass or essential preamble.
+```
+---
+## Why "never explicitly state a limitation" is a hard rule
+From App. F.1 p.51, the paper explains:
+> We explicitly instruct the Content Refinement Agent to ignore reviewer
+> requests for additional experiments. This constraint is crucial to
+> prevent the agent from generating fabricated results or making false
+> promises within the paper... Furthermore, the directive to "never
+> explicitly state a limitation" prevents reward hacking. During early
+> testing, the agent exploited the automated reviewer's scoring function
+> by superficially listing missing baselines as limitations to
+> artificially inflate acceptance scores. Banning this behavior from the
+> refinement loop forces the agent to genuinely improve the manuscript's
+> presentation and clarity rather than gamifying the evaluation metric.
+`safe-revision-rules.md` formalizes this as a deterministic gate the host
+agent should run after each revision: grep the new draft for the substring
+`limitation` (case-insensitive) and reject if found.

.scider/skills/content-refinement-agent/references/reviewer-rubric.md ADDED Viewed

	@@ -0,0 +1,131 @@

+# Reviewer Rubric (AgentReview-style)
+The Content Refinement Agent loop needs a simulated reviewer that produces
+**structured, scoreable** feedback the host agent can compare iteration to
+iteration. The paper uses AgentReview (Jin et al., 2024) as its evaluator
+in §5 (App. F.1 references "AgentReview" by name and uses its output schema:
+"strengths, weaknesses, questions, decisions").
+This document defines a faithful AgentReview-style reviewer prompt to use
+under any host LLM. Use it as the system message for the simulated review
+call before each refinement iteration.
+---
+## System prompt for the simulated reviewer
+```
+You are an expert academic peer reviewer for a top-tier machine learning
+conference (CVPR, ICLR, NeurIPS, ICML). Read the provided LaTeX paper or
+PDF and produce a rigorous, structured review.
+Your review must be CONSERVATIVE. High scores are rare and must be
+explicitly justified with concrete evidence from the paper. Assume most
+drafts are not publication-ready.
+You MUST score the paper on six axes (0-100 each):
+  1. Scientific Depth & Soundness
+     - Are the theoretical foundations and experimental setups rigorous?
+     - Are claims justified and free of unsupported leaps?
+  2. Technical Execution
+     - Within the bounds of the described idea, is the methodology
+       implemented innovatively and effectively?
+     - Are the design choices justified by the experimental results?
+  3. Logical Flow
+     - Do sections transition smoothly from Abstract through Conclusion?
+     - Are subsections structured logically with clear signposting?
+  4. Writing Clarity
+     - Is the prose precise, concise, and free of repetitive phrasing?
+     - Are technical terms defined before use?
+  5. Evidence Presentation
+     - Are figures, tables, and results integrated and referenced cleanly?
+     - Do visuals support the text claims directly?
+  6. Academic Style
+     - Polished, professional academic tone?
+     - Consistent terminology throughout?
+For each axis, provide a score AND a 2-5 sentence evidence-based
+justification quoting concrete passages or pointing to specific failings.
+Then identify:
+  - Strengths: 3-5 bullet points naming things the paper does well.
+  - Weaknesses: 3-5 bullet points naming concrete, fixable issues.
+  - Questions: 2-4 specific questions the paper should answer for a
+    reader to be convinced.
+  - Decision: one of "Strong Accept", "Accept", "Borderline", "Reject",
+    "Strong Reject".
+  - Overall Score: weighted average 0-100. Use:
+        overall = 0.20*depth + 0.20*execution + 0.15*flow
+                 + 0.15*clarity + 0.20*evidence + 0.10*style
+Output STRICT JSON only. No prose outside the JSON.
+```
+## Output JSON schema
+```json
+{
+  "axis_scores": {
+    "scientific_depth": {
+      "score": 65,
+      "justification": "Loss formulation is grounded in the cited prior work but the ablation on the audio-visual fusion layer is small (n=3 seeds) and the variance bands overlap, making the claim of necessity weak. Section 3.2 introduces the cached memory without proving its necessity vs. simple pooling."
+    },
+    "technical_execution":   { "score": 70, "justification": "..." },
+    "logical_flow":          { "score": 60, "justification": "..." },
+    "writing_clarity":       { "score": 55, "justification": "..." },
+    "evidence_presentation": { "score": 72, "justification": "..." },
+    "academic_style":        { "score": 68, "justification": "..." }
+  },
+  "strengths": [
+    "Clear problem statement in the Introduction with three concrete failure cases of prior SAM-based methods.",
+    "Well-organized Related Work that contrasts the three competing paradigms.",
+    "..."
+  ],
+  "weaknesses": [
+    "The ablation in Table 2 lacks confidence intervals; 0.4 J-index gaps may not be significant.",
+    "Section 3.4 introduces the IoU loss term λ without justifying λ=1.0 vs other values.",
+    "Figure 3 is referenced once and never discussed in the prose.",
+    "..."
+  ],
+  "questions": [
+    "What is the inference latency on a single A100?",
+    "How does the temporal branch behave on videos longer than the training distribution?"
+  ],
+  "decision": "Borderline",
+  "overall_score": 65.45
+}
+```
+## How the loop uses this output
+The `score_delta.py` script reads two consecutive score JSONs and applies
+the halt rules. The `apply_worklog.py` script appends a timestamped entry
+to `workspace/refinement/worklog.json`. The Content Refinement Agent's
+revision call takes the full `review.json` as `reviewer_feedback` input.
+## Anti-inflation guardrails
+To prevent the simulated reviewer from being gameable, the rubric has hard
+caps drawn from the paper's Literature Review Quality autorater
+(App. F.3 — see also `paper-autoraters/references/litreview-quality-prompt.md`):
+| Axis | Hard cap |
+|---|---|
+| Scientific Depth | ≤60 if claims are unsupported by experiments |
+| Technical Execution | ≤55 if methodology section omits key implementation details |
+| Logical Flow | ≤60 if sections don't reference the figures/tables they need |
+| Writing Clarity | ≤60 if repetitive phrasing or undefined acronyms |
+| Evidence Presentation | ≤55 if any figure is unreferenced from the text |
+| Academic Style | ≤55 if defensive language is present |
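+These caps are intended to be part of the rubric prompt to keep the reviewer
+honest; the system prompt shown above does not list them, so append this
+table to it when issuing the review call.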
+The Content Refinement Agent's "never explicitly state a limitation" rule
+combined with these caps closes the reward-hacking loop the paper observed
+in early testing (App. F.1 p.51).

.scider/skills/content-refinement-agent/references/safe-revision-rules.md ADDED Viewed

	@@ -0,0 +1,129 @@

+# Safe Revision Rules
+The Content Refinement Agent prompt (App. F.1 p.50–51) imposes two
+anti-reward-hacking constraints. Both must be enforced not just by the
+prompt but by deterministic post-revision gates, because LLMs occasionally
+forget instructions buried in long prompts.
+## Rule 1 — Ignore reviewer requests for new experiments
+The simulated reviewer will sometimes ask:
+- "What if you ablated the temperature parameter?"
+- "How does this compare to baseline X?"
+- "Have you tried this on dataset Y?"
+The Refinement Agent must **not** fabricate answers to these. The paper:
+> If the reviewer asks for new experiments, ablations, or baselines that
+> are NOT in experimental_log.md, simply ignore those specific requests.
+> Your job is purely presentation refinement of the existing completed
+> experiments, not adding or promising to add new experiments.
+### Enforcement
+There is no fully deterministic way to grep for "fabricated experiments" —
+it requires reading the new content and cross-checking against
+`experimental_log.md`. The pragmatic check:
+1. Run the orphan-citation gate from `section-writing-agent/scripts/orphan_cite_gate.py`.
+   New numeric claims often come bundled with new (orphan) citations.
+2. Run a numeric-claim grep: extract every `\d+\.\d+%?` from the new draft,
+   intersect with `\d+\.\d+%?` in `experimental_log.md`. New numbers in the
+   draft that aren't in the log are suspicious. (False positives possible
+   for parameter counts and dates; review manually.)
+The orchestrator should re-prompt the refinement step if either gate fires
+with new fabricated claims.
+## Rule 2 — Never explicitly state a limitation
+The paper:
+> The directive to "never explicitly state a limitation" prevents reward
+> hacking. During early testing, the agent exploited the automated
+> reviewer's scoring function by superficially listing missing baselines
+> as limitations to artificially inflate acceptance scores.
+### Enforcement (deterministic)
+Grep the revised draft for the substring `limitation` (case-insensitive),
+excluding LaTeX comments. If found anywhere in the body, reject the
+revision and re-prompt:
+```bash
+# pseudocode — implement inline in the host agent
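+# note: -n prefixes each match with "<lineno>:", so the comment filter must skip it
+grep -in -E '\blimitation' workspace/refinement/iter<N>/paper.tex \
+    | grep -v -E '^[0-9]+:[[:space:]]*%'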
+```
+Allowed contexts (these are NOT violations):
+- LaTeX comments: `% address the limitation of ...`
+- Citation context: a paper title containing "limitation" cited in
+  `\cite{...}`. The grep should ignore the inside of `\cite{...}` braces.
+- Quoted prior-work descriptions: "Smith et al. acknowledge the
+  limitation..." — context-dependent. The simplest rule is "no instances
+  of the word 'limitation' in the running prose at all", and let the host
+  agent handle edge cases by re-prompting if a legitimate use is needed.
+This is a strict rule. The Refinement Agent should rewrite "we acknowledge
+the limitation that our method..." as "our method assumes..." or "the
+proposed approach is most effective when...". Reframing, not listing.
+## Rule 3 — Numeric ground truth
+> All numerical claims (accuracy, parameter count, training hours,
+> latency) MUST be verified against experimental_log.md.
+The grep heuristic above catches this partially. The host agent should
+also instruct the refinement step explicitly: "any numeric value you cite
+in your revision must already exist in experimental_log.md or
+metrics.json."
+## Rule 4 — Citation integrity
+The orphan-citation gate from
+`section-writing-agent/scripts/orphan_cite_gate.py` must pass after every
+refinement iteration. Re-run it as part of the post-revision checks:
+```bash
+python skills/section-writing-agent/scripts/orphan_cite_gate.py \
+    workspace/refinement/iter<N>/paper.tex \
+    workspace/refs.bib
+```
+If the refinement step introduced a new `\cite{KEY}` not in `refs.bib`,
+revert the iteration and re-prompt with an explicit instruction to use
+only existing keys.
+## Rule 5 — LaTeX integrity
+Re-run `latex_sanity.py` and `latexmk -pdf` after every revision. If the
+revision broke the build, revert.
+## Summary checklist for each refinement iteration
+```bash
+# 1. apply revision → iter<N>/paper.tex
+# 2. compile
+cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
+# 3. structural sanity
+python skills/section-writing-agent/scripts/latex_sanity.py paper.tex || REVERT
+python skills/section-writing-agent/scripts/orphan_cite_gate.py paper.tex ../../refs.bib || REVERT
+# 4. anti-leakage
+python skills/paper-orchestra/scripts/anti_leakage_check.py paper.tex || REVERT
+# 5. limitation grep (Rule 2)
+grep -in -E '\blimitation' paper.tex | grep -v -E '^[0-9]+:[[:space:]]*%' && REVERT
+# 6. score and decide
+python skills/content-refinement-agent/scripts/score_delta.py \
+    --prev ../iter<N-1>/score.json --curr score.json
+# exit 0 → keep, exit 1/2 → revert, exit 3 → input error (fix and re-run),
+# exit 4 → keep, then stop the loop (HALT_PLATEAU)
+```
+If all gates pass and `score_delta.py` returns 0, the iteration is
+accepted. Exit code 4 (`HALT_PLATEAU`) also accepts the iteration but tells
+the loop to stop.

.scider/skills/content-refinement-agent/scripts/apply_worklog.py ADDED Viewed

	@@ -0,0 +1,94 @@

+#!/usr/bin/env python3
+"""
+apply_worklog.py — Append a timestamped iteration entry to worklog.json.
+The worklog is the canonical history of the refinement loop: every
+iteration's review, score, decision, and actions taken. The orchestrator
+reads it at the end to identify the best snapshot to promote.
+Usage:
+    python apply_worklog.py \\
+        --worklog workspace/refinement/worklog.json \\
+        --iter 2 \\
+        --review iter2/review.json \\
+        --score iter2/score.json \\
+        --decision ACCEPT_IMPROVED \\
+        --actions iter2/worklog_entry.json   # the agent's emitted worklog block
+The script creates worklog.json if it doesn't exist.
+"""
+import argparse
+import datetime as dt
+import json
+import os
+import sys
+def load_json(path: str | None) -> dict | list | None:
+    if not path or not os.path.exists(path):
+        return None
+    with open(path) as f:
+        return json.load(f)
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--worklog", required=True, help="path to worklog.json")
+    p.add_argument("--iter", type=int, required=True, help="iteration number (0-indexed)")
+    p.add_argument("--review", help="path to review.json for this iteration")
+    p.add_argument("--score", help="path to score.json for this iteration")
+    p.add_argument(
+        "--decision",
+        required=True,
+        help="ACCEPT_IMPROVED / ACCEPT_TIED_NON_NEGATIVE / "
+        "REVERT_OVERALL_DECREASED / REVERT_TIED_NEGATIVE_SUBAXIS",
+    )
+    p.add_argument(
+        "--actions",
+        help="path to the agent's worklog block JSON "
+        "(addressed_weaknesses, integrated_answers, actions_taken)",
+    )
+    p.add_argument("--halted-because", help="reason if this iteration triggers a halt")
+    args = p.parse_args()
+    if os.path.exists(args.worklog):
+        with open(args.worklog) as f:
+            wl = json.load(f)
+    else:
+        wl = {"iterations": [], "halted_because": None, "best_iter": None}
+    entry = {
+        "iter": args.iter,
+        "timestamp": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "decision": args.decision,
+        "review": load_json(args.review),
+        "score": load_json(args.score),
+        "actions": load_json(args.actions),
+    }
+    wl["iterations"].append(entry)
+    if args.halted_because:
+        wl["halted_because"] = args.halted_because
+    # Re-compute best_iter: highest accepted overall_score
+    accepted = [
+        it
+        for it in wl["iterations"]
+        if it.get("decision", "").startswith("ACCEPT") and it.get("score")
+    ]
+    if accepted:
+        best = max(accepted, key=lambda it: it["score"].get("overall_score", 0))
+        wl["best_iter"] = best["iter"]
+    os.makedirs(os.path.dirname(os.path.abspath(args.worklog)) or ".", exist_ok=True)
+    with open(args.worklog, "w") as f:
+        json.dump(wl, f, indent=2, ensure_ascii=False)
+    print(f"OK: appended iter {args.iter} ({args.decision}) to {args.worklog}")
+    if wl["best_iter"] is not None:
+        print(f"    current best_iter: {wl['best_iter']}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/content-refinement-agent/scripts/score_delta.py ADDED Viewed

	@@ -0,0 +1,164 @@

+#!/usr/bin/env python3
+"""
+score_delta.py — Apply the PaperOrchestra refinement halt rules from two
+score JSONs.
+Encodes the halt rules from arXiv:2604.05018 §4 Step 5:
+  - ACCEPT if curr.overall > prev.overall
+  - ACCEPT if curr.overall == prev.overall AND net sub-axis delta >= 0
+  - REVERT (overall_decreased) if curr.overall < prev.overall
+  - REVERT (tied_negative_subaxis) if curr.overall == prev.overall AND
+            net sub-axis delta < 0
+Additionally encodes the plateau early-stop rule (not in the original paper
+but added to match its cost budget of ~5-7 LLM calls):
+  - HALT_PLATEAU if the improvement is accepted but overall_delta is below
+    --plateau-threshold for --plateau-streak or more consecutive iterations.
+    Exit code 4.  The loop should stop — further iterations are unlikely to
+    yield meaningful gains.
+Exit codes:
+    0  ACCEPT (improved or tied non-negative, and no plateau)
+    1  REVERT (overall decreased)
+    2  REVERT (tied with negative sub-axis delta)
+    3  argument or input error
+    4  HALT_PLATEAU (accepted but diminishing returns detected)
+Score JSON shape (see references/reviewer-rubric.md):
+    {
+      "axis_scores": {
+        "scientific_depth":     {"score": 65, ...},
+        "technical_execution":  {"score": 70, ...},
+        "logical_flow":         {"score": 60, ...},
+        "writing_clarity":      {"score": 55, ...},
+        "evidence_presentation":{"score": 72, ...},
+        "academic_style":       {"score": 68, ...}
+      },
+      "overall_score": 64.5,
+      ...
+    }
+Usage:
+    python score_delta.py --prev iter0/score.json --curr iter1/score.json
+    python score_delta.py --prev iter2/score.json --curr iter3/score.json \\
+        --plateau-threshold 1.0 --plateau-streak 2 --consecutive-small 2
+"""
+import argparse
+import json
+import sys
+AXES = [
+    "scientific_depth",
+    "technical_execution",
+    "logical_flow",
+    "writing_clarity",
+    "evidence_presentation",
+    "academic_style",
+]
+DEFAULT_PLATEAU_THRESHOLD = 1.0  # points
+DEFAULT_PLATEAU_STREAK = 3  # consecutive iterations below threshold → halt
+def load(path: str) -> dict:
+    with open(path) as f:
+        return json.load(f)
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--prev", required=True, help="Score JSON from previous accepted iteration")
+    p.add_argument("--curr", required=True, help="Score JSON from just-completed iteration")
+    p.add_argument(
+        "--plateau-threshold",
+        type=float,
+        default=DEFAULT_PLATEAU_THRESHOLD,
+        metavar="POINTS",
+        help=f"Minimum overall_delta to not count as a 'small' improvement "
+        f"(default: {DEFAULT_PLATEAU_THRESHOLD})",
+    )
+    p.add_argument(
+        "--plateau-streak",
+        type=int,
+        default=DEFAULT_PLATEAU_STREAK,
+        metavar="N",
+        help=f"Number of consecutive small improvements before HALT_PLATEAU "
+        f"(default: {DEFAULT_PLATEAU_STREAK})",
+    )
+    p.add_argument(
+        "--consecutive-small",
+        type=int,
+        default=0,
+        metavar="N",
+        help="Number of consecutive small-delta accepted iterations so far "
+        "(maintained by the calling loop; default: 0)",
+    )
+    args = p.parse_args()
+    try:
+        prev = load(args.prev)
+        curr = load(args.curr)
+    except (OSError, json.JSONDecodeError) as e:
+        print(f"ERROR: failed to load score JSONs: {e}", file=sys.stderr)
+        return 3
+    p_overall = float(prev.get("overall_score", 0))
+    c_overall = float(curr.get("overall_score", 0))
+    overall_delta = c_overall - p_overall
+    p_axes = prev.get("axis_scores") or {}
+    c_axes = curr.get("axis_scores") or {}
+    deltas: dict[str, float] = {}
+    for ax in AXES:
+        ps = float((p_axes.get(ax) or {}).get("score", 0))
+        cs = float((c_axes.get(ax) or {}).get("score", 0))
+        deltas[ax] = cs - ps
+    net_subaxis = sum(deltas.values())
+    # --- Primary accept/revert decision ---
+    if c_overall > p_overall:
+        decision = "ACCEPT_IMPROVED"
+        exit_code = 0
+    elif c_overall == p_overall:
+        if net_subaxis >= 0:
+            decision = "ACCEPT_TIED_NON_NEGATIVE"
+            exit_code = 0
+        else:
+            decision = "REVERT_TIED_NEGATIVE_SUBAXIS"
+            exit_code = 2
+    else:
+        decision = "REVERT_OVERALL_DECREASED"
+        exit_code = 1
+    # --- Plateau early-stop (only applies to accepted iterations) ---
+    is_small_delta = overall_delta < args.plateau_threshold
+    new_consecutive_small = (args.consecutive_small + 1) if is_small_delta else 0
+    plateau_triggered = False
+    if exit_code == 0 and new_consecutive_small >= args.plateau_streak:
+        decision = "HALT_PLATEAU"
+        exit_code = 4
+        plateau_triggered = True
+    out = {
+        "decision": decision,
+        "exit_code": exit_code,
+        "overall_prev": p_overall,
+        "overall_curr": c_overall,
+        "overall_delta": overall_delta,
+        "subaxis_deltas": deltas,
+        "net_subaxis": net_subaxis,
+        "is_small_delta": is_small_delta,
+        "consecutive_small": new_consecutive_small,
+        "plateau_threshold": args.plateau_threshold,
+        "plateau_streak": args.plateau_streak,
+        "plateau_triggered": plateau_triggered,
+    }
+    print(json.dumps(out, indent=2))
+    return exit_code
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/content-refinement-agent/scripts/snapshot.py ADDED Viewed

	@@ -0,0 +1,47 @@

+#!/usr/bin/env python3
+"""
+snapshot.py — Copy a paper.tex (and optionally paper.pdf) into a refinement
+iteration directory, so reverts are real, not symbolic.
+The PaperOrchestra refinement halt rules require the loop to roll back to
+the previous iteration on overall-score decrease or tied negative sub-axis
+delta. To do that physically, every iteration's draft must be preserved.
+Usage:
+    python snapshot.py --src paper.tex --dst iter2/
+    python snapshot.py --src paper.tex --src-pdf paper.pdf --dst iter2/
+"""
+import argparse
+import os
+import shutil
+import sys
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--src", required=True, help="source paper.tex path")
+    p.add_argument("--src-pdf", help="optional source paper.pdf path")
+    p.add_argument("--dst", required=True, help="destination iteration directory")
+    args = p.parse_args()
+    if not os.path.isfile(args.src):
+        print(f"ERROR: {args.src} not found", file=sys.stderr)
+        return 1
+    os.makedirs(args.dst, exist_ok=True)
+    dst_tex = os.path.join(args.dst, "paper.tex")
+    shutil.copy2(args.src, dst_tex)
+    print(f"OK: snapshot {args.src} → {dst_tex}")
+    if args.src_pdf:
+        if not os.path.isfile(args.src_pdf):
+            print(f"WARN: {args.src_pdf} not found, skipping PDF snapshot", file=sys.stderr)
+        else:
+            dst_pdf = os.path.join(args.dst, "paper.pdf")
+            shutil.copy2(args.src_pdf, dst_pdf)
+            print(f"OK: snapshot {args.src_pdf} → {dst_pdf}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/exploratory-data-analysis/SKILL.md ADDED Viewed

	@@ -0,0 +1,442 @@

+---
+name: exploratory-data-analysis
+description: Comprehensive EDA on scientific data files — structure, content, quality, and characteristics analysis across 200+ formats. Use when analyzing any data file to understand its structure, quality, and downstream analysis recommendations.
+allowed_agents: [data]
+preload_for: [data]
+---
+# Exploratory Data Analysis
+## Overview
+Perform comprehensive exploratory data analysis (EDA) on scientific data files across multiple domains. This skill provides automated file type detection, format-specific analysis, data quality assessment, and generates detailed markdown reports suitable for documentation and downstream analysis planning.
+**Key Capabilities:**
+- Automatic detection and analysis of 200+ scientific file formats
+- Comprehensive format-specific metadata extraction
+- Data quality and integrity assessment
+- Statistical summaries and distributions
+- Visualization recommendations
+- Downstream analysis suggestions
+- Markdown report generation
+## When to Use This Skill
+Use this skill when:
+- User provides a path to a scientific data file for analysis
+- User asks to "explore", "analyze", or "summarize" a data file
+- User wants to understand the structure and content of scientific data
+- User needs a comprehensive report of a dataset before analysis
+- User wants to assess data quality or completeness
+- User asks what type of analysis is appropriate for a file
+## Supported File Categories
+The skill has comprehensive coverage of scientific file formats organized into six major categories:
+### 1. Chemistry and Molecular Formats (60+ extensions)
+Structure files, computational chemistry outputs, molecular dynamics trajectories, and chemical databases.
+**File types include:** `.pdb`, `.cif`, `.mol`, `.mol2`, `.sdf`, `.xyz`, `.smi`, `.gro`, `.log`, `.fchk`, `.cube`, `.dcd`, `.xtc`, `.trr`, `.prmtop`, `.psf`, and more.
+**Reference file:** `references/chemistry_molecular_formats.md`
+### 2. Bioinformatics and Genomics Formats (50+ extensions)
+Sequence data, alignments, annotations, variants, and expression data.
+**File types include:** `.fasta`, `.fastq`, `.sam`, `.bam`, `.vcf`, `.bed`, `.gff`, `.gtf`, `.bigwig`, `.h5ad`, `.loom`, `.counts`, `.mtx`, and more.
+**Reference file:** `references/bioinformatics_genomics_formats.md`
+### 3. Microscopy and Imaging Formats (45+ extensions)
+Microscopy images, medical imaging, whole slide imaging, and electron microscopy.
+**File types include:** `.tif`, `.nd2`, `.lif`, `.czi`, `.ims`, `.dcm`, `.nii`, `.mrc`, `.dm3`, `.vsi`, `.svs`, `.ome.tiff`, and more.
+**Reference file:** `references/microscopy_imaging_formats.md`
+### 4. Spectroscopy and Analytical Chemistry Formats (35+ extensions)
+NMR, mass spectrometry, IR/Raman, UV-Vis, X-ray, chromatography, and other analytical techniques.
+**File types include:** `.fid`, `.mzML`, `.mzXML`, `.raw`, `.mgf`, `.spc`, `.jdx`, `.xy`, `.cif` (crystallography), `.wdf`, and more.
+**Reference file:** `references/spectroscopy_analytical_formats.md`
+### 5. Proteomics and Metabolomics Formats (30+ extensions)
+Mass spec proteomics, metabolomics, lipidomics, and multi-omics data.
+**File types include:** `.mzML`, `.pepXML`, `.protXML`, `.mzid`, `.mzTab`, `.sky`, `.mgf`, `.msp`, `.h5ad`, and more.
+**Reference file:** `references/proteomics_metabolomics_formats.md`
+### 6. General Scientific Data Formats (30+ extensions)
+Arrays, tables, hierarchical data, compressed archives, and common scientific formats.
+**File types include:** `.npy`, `.npz`, `.csv`, `.xlsx`, `.json`, `.hdf5`, `.zarr`, `.parquet`, `.mat`, `.fits`, `.nc`, `.xml`, and more.
+**Reference file:** `references/general_scientific_formats.md`
+## Workflow
+### Step 1: File Type Detection
+When a user provides a file path, first identify the file type:
+1. Extract the file extension
+2. Look up the extension in the appropriate reference file
+3. Identify the file category and format description
+4. Load format-specific information
+**Example:**
+```
+User: "Analyze data.fastq"
+→ Extension: .fastq
+→ Category: bioinformatics_genomics
+→ Format: FASTQ Format (sequence data with quality scores)
+→ Reference: references/bioinformatics_genomics_formats.md
+```
+### Step 2: Load Format-Specific Information
+Based on the file type, read the corresponding reference file to understand:
+- **Typical Data:** What kind of data this format contains
+- **Use Cases:** Common applications for this format
+- **Python Libraries:** How to read the file in Python
+- **EDA Approach:** What analyses are appropriate for this data type
+Search the reference file for the specific extension (e.g., search for "### .fastq" in `bioinformatics_genomics_formats.md`).
+### Step 3: Perform Data Analysis
+Use the `scripts/eda_analyzer.py` script OR implement custom analysis:
+**Option A: Use the analyzer script**
+```python
+# The script automatically:
+# 1. Detects file type
+# 2. Loads reference information
+# 3. Performs format-specific analysis
+# 4. Generates markdown report
+python scripts/eda_analyzer.py <filepath> [output.md]
+```
+**Option B: Custom analysis in the conversation**
+Based on the format information from the reference file, perform appropriate analysis:
+For tabular data (CSV, TSV, Excel):
+- Load with pandas
+- Check dimensions, data types
+- Analyze missing values
+- Calculate summary statistics
+- Identify outliers
+- Check for duplicates
+For sequence data (FASTA, FASTQ):
+- Count sequences
+- Analyze length distributions
+- Calculate GC content
+- Assess quality scores (FASTQ)
+For images (TIFF, ND2, CZI):
+- Check dimensions (X, Y, Z, C, T)
+- Analyze bit depth and value range
+- Extract metadata (channels, timestamps, spatial calibration)
+- Calculate intensity statistics
+For arrays (NPY, HDF5):
+- Check shape and dimensions
+- Analyze data type
+- Calculate statistical summaries
+- Check for missing/invalid values
+### Step 4: Generate Comprehensive Report
+Create a markdown report with the following sections:
+#### Required Sections:
+1. **Title and Metadata**
+   - Filename and timestamp
+   - File size and location
+2. **Basic Information**
+   - File properties
+   - Format identification
+3. **File Type Details**
+   - Format description from reference
+   - Typical data content
+   - Common use cases
+   - Python libraries for reading
+4. **Data Analysis**
+   - Structure and dimensions
+   - Statistical summaries
+   - Quality assessment
+   - Data characteristics
+5. **Key Findings**
+   - Notable patterns
+   - Potential issues
+   - Quality metrics
+6. **Recommendations**
+   - Preprocessing steps
+   - Appropriate analyses
+   - Tools and methods
+   - Visualization approaches
+#### Template Location
+Use `assets/report_template.md` as a guide for report structure.
+### Step 5: Save Report
+Save the markdown report with a descriptive filename:
+- Pattern: `{original_filename}_eda_report.md`
+- Example: `experiment_data.fastq` → `experiment_data_eda_report.md`
+## Detailed Format References
+Each reference file contains comprehensive information for dozens of file types. To find information about a specific format:
+1. Identify the category from the extension
+2. Read the appropriate reference file
+3. Search for the section heading matching the extension (e.g., "### .pdb")
+4. Extract the format information
+### Reference File Structure
+Each format entry includes:
+- **Description:** What the format is
+- **Typical Data:** What it contains
+- **Use Cases:** Common applications
+- **Python Libraries:** How to read it (with code examples)
+- **EDA Approach:** Specific analyses to perform
+**Example lookup:**
+```markdown
+### .pdb - Protein Data Bank
+**Description:** Standard format for 3D structures of biological macromolecules
+**Typical Data:** Atomic coordinates, residue information, secondary structure
+**Use Cases:** Protein structure analysis, molecular visualization, docking
+**Python Libraries:**
+- `Biopython`: `Bio.PDB`
+- `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
+**EDA Approach:**
+- Structure validation (bond lengths, angles)
+- B-factor distribution
+- Missing residues detection
+- Ramachandran plots
+```
+## Best Practices
+### Reading Reference Files
+Reference files are large (10,000+ words each). To efficiently use them:
+1. **Search by extension:** Use a regular-expression search (or `grep`) to extract the section for the specific format
+   ```python
+   import re
+   with open('references/chemistry_molecular_formats.md', 'r') as f:
+       content = f.read()
+       pattern = r'### \.pdb[^#]*?(?=###|\Z)'
+       match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
+   ```
+2. **Extract relevant sections:** Don't load entire reference files into context unnecessarily
+3. **Cache format info:** If analyzing multiple files of the same type, reuse the format information
+### Data Analysis
+1. **Sample large files:** For files with millions of records, analyze a representative sample
+2. **Handle errors gracefully:** Many scientific formats require specific libraries; provide clear installation instructions
+3. **Validate metadata:** Cross-check metadata consistency (e.g., stated dimensions vs actual data)
+4. **Consider data provenance:** Note instrument, software versions, processing steps
+### Report Generation
+1. **Be comprehensive:** Include all relevant information for downstream analysis
+2. **Be specific:** Provide concrete recommendations based on the file type
+3. **Be actionable:** Suggest specific next steps and tools
+4. **Include code examples:** Show how to load and work with the data
+## Examples
+### Example 1: Analyzing a FASTQ file
+```python
+# User provides: "Analyze reads.fastq"
+# 1. Detect file type
+extension = '.fastq'
+category = 'bioinformatics_genomics'
+# 2. Read reference info
+# Search references/bioinformatics_genomics_formats.md for "### .fastq"
+# 3. Perform analysis
+from Bio import SeqIO
+sequences = list(SeqIO.parse('reads.fastq', 'fastq'))
+# Calculate: read count, length distribution, quality scores, GC content
+# 4. Generate report
+# Include: format description, analysis results, QC recommendations
+# 5. Save as: reads_eda_report.md
+```
+### Example 2: Analyzing a CSV dataset
+```python
+# User provides: "Explore experiment_results.csv"
+# 1. Detect: .csv → general_scientific
+# 2. Load reference for CSV format
+# 3. Analyze
+import pandas as pd
+df = pd.read_csv('experiment_results.csv')
+# Dimensions, dtypes, missing values, statistics, correlations
+# 4. Generate report with:
+# - Data structure
+# - Missing value patterns
+# - Statistical summaries
+# - Correlation matrix
+# - Outlier detection results
+# 5. Save report
+```
+### Example 3: Analyzing microscopy data
+```python
+# User provides: "Analyze cells.nd2"
+# 1. Detect: .nd2 → microscopy_imaging (Nikon format)
+# 2. Read reference for ND2 format
+# Learn: multi-dimensional (XYZCT), requires nd2reader
+# 3. Analyze
+from nd2reader import ND2Reader
+with ND2Reader('cells.nd2') as images:
+    # Extract: dimensions, channels, timepoints, metadata
+    sizes = images.sizes
+    metadata = images.metadata
+    # Calculate: intensity statistics, frame info
+# 4. Generate report with:
+# - Image dimensions (XY, Z-stacks, time, channels)
+# - Channel wavelengths
+# - Pixel size and calibration
+# - Recommendations for image analysis
+# 5. Save report
+```
+## Troubleshooting
+### Missing Libraries
+Many scientific formats require specialized libraries:
+**Problem:** Import error when trying to read a file
+**Solution:** Provide clear installation instructions
+```python
+try:
+    from Bio import SeqIO
+except ImportError:
+    print("Install Biopython: uv pip install biopython")
+```
+Common requirements by category:
+- **Bioinformatics:** `biopython`, `pysam`, `pyBigWig`
+- **Chemistry:** `rdkit`, `mdanalysis`, `cclib`
+- **Microscopy:** `tifffile`, `nd2reader`, `aicsimageio`, `pydicom`
+- **Spectroscopy:** `nmrglue`, `pymzml`, `pyteomics`
+- **General:** `pandas`, `numpy`, `h5py`, `scipy`
+### Unknown File Types
+If a file extension is not in the references:
+1. Ask the user about the file format
+2. Check if it's a vendor-specific variant
+3. Attempt generic analysis based on file structure (text vs binary)
+4. Provide general recommendations
+### Large Files
+For very large files:
+1. Use sampling strategies (first N records)
+2. Use memory-mapped or lazy access (`np.load(..., mmap_mode='r')` for NPY; slice `h5py` datasets instead of loading them fully for HDF5)
+3. Process in chunks (for CSV, FASTQ)
+4. Provide estimates based on samples
+## Script Usage
+The `scripts/eda_analyzer.py` can be used directly:
+```bash
+# Basic usage
+python scripts/eda_analyzer.py data.csv
+# Specify output file
+python scripts/eda_analyzer.py data.csv output_report.md
+# The script will:
+# 1. Auto-detect file type
+# 2. Load format references
+# 3. Perform appropriate analysis
+# 4. Generate markdown report
+```
+The script supports automatic analysis for many common formats, but custom analysis in the conversation provides more flexibility and domain-specific insights.
+## Advanced Usage
+### Multi-File Analysis
+When analyzing multiple related files:
+1. Perform individual EDA on each file
+2. Create a summary comparison report
+3. Identify relationships and dependencies
+4. Suggest integration strategies
+### Quality Control
+For data quality assessment:
+1. Check format compliance
+2. Validate metadata consistency
+3. Assess completeness
+4. Identify outliers and anomalies
+5. Compare to expected ranges/distributions
+### Preprocessing Recommendations
+Based on data characteristics, recommend:
+1. Normalization strategies
+2. Missing value imputation
+3. Outlier handling
+4. Batch correction
+5. Format conversions
+## Resources
+### scripts/
+- `eda_analyzer.py`: Comprehensive analysis script that can be run directly or imported
+### references/
+- `chemistry_molecular_formats.md`: 60+ chemistry/molecular file formats
+- `bioinformatics_genomics_formats.md`: 50+ bioinformatics formats
+- `microscopy_imaging_formats.md`: 45+ imaging formats
+- `spectroscopy_analytical_formats.md`: 35+ spectroscopy formats
+- `proteomics_metabolomics_formats.md`: 30+ omics formats
+- `general_scientific_formats.md`: 30+ general formats
+### assets/
+- `report_template.md`: Comprehensive markdown template for EDA reports

.scider/skills/exploratory-data-analysis/assets/report_template.md ADDED Viewed

	@@ -0,0 +1,196 @@

+# Exploratory Data Analysis Report: {FILENAME}
+**Generated:** {TIMESTAMP}
+---
+## Executive Summary
+This report provides a comprehensive exploratory data analysis of the file `{FILENAME}`. The analysis includes file type identification, format-specific metadata extraction, data quality assessment, and recommendations for downstream analysis.
+---
+## Basic Information
+- **Filename:** `{FILENAME}`
+- **Full Path:** `{FILEPATH}`
+- **File Size:** {FILE_SIZE_HUMAN} ({FILE_SIZE_BYTES} bytes)
+- **Last Modified:** {MODIFIED_DATE}
+- **Extension:** `.{EXTENSION}`
+- **Format Category:** {CATEGORY}
+---
+## File Type Details
+### Format Description
+{FORMAT_DESCRIPTION}
+### Typical Data Content
+{TYPICAL_DATA}
+### Common Use Cases
+{USE_CASES}
+### Python Libraries for Reading
+{PYTHON_LIBRARIES}
+---
+## Data Structure Analysis
+### Overview
+{DATA_STRUCTURE_OVERVIEW}
+### Dimensions
+{DIMENSIONS}
+### Data Types
+{DATA_TYPES}
+---
+## Quality Assessment
+### Completeness
+- **Missing Values:** {MISSING_VALUES}
+- **Data Coverage:** {COVERAGE}
+### Validity
+- **Range Check:** {RANGE_CHECK}
+- **Format Compliance:** {FORMAT_COMPLIANCE}
+- **Consistency:** {CONSISTENCY}
+### Integrity
+- **Checksum/Validation:** {VALIDATION}
+- **File Corruption Check:** {CORRUPTION_CHECK}
+---
+## Statistical Summary
+### Numerical Variables
+{NUMERICAL_STATS}
+### Categorical Variables
+{CATEGORICAL_STATS}
+### Distributions
+{DISTRIBUTIONS}
+---
+## Data Characteristics
+### Temporal Properties (if applicable)
+- **Time Range:** {TIME_RANGE}
+- **Sampling Rate:** {SAMPLING_RATE}
+- **Missing Time Points:** {MISSING_TIMEPOINTS}
+### Spatial Properties (if applicable)
+- **Dimensions:** {SPATIAL_DIMENSIONS}
+- **Resolution:** {SPATIAL_RESOLUTION}
+- **Coordinate System:** {COORDINATE_SYSTEM}
+### Experimental Metadata (if applicable)
+- **Instrument:** {INSTRUMENT}
+- **Method:** {METHOD}
+- **Sample Info:** {SAMPLE_INFO}
+---
+## Key Findings
+1. **Data Volume:** {DATA_VOLUME_FINDING}
+2. **Data Quality:** {DATA_QUALITY_FINDING}
+3. **Notable Patterns:** {PATTERNS_FINDING}
+4. **Potential Issues:** {ISSUES_FINDING}
+---
+## Visualizations
+### Distribution Plots
+{DISTRIBUTION_PLOTS}
+### Correlation Analysis
+{CORRELATION_PLOTS}
+### Time Series (if applicable)
+{TIMESERIES_PLOTS}
+---
+## Recommendations for Further Analysis
+### Immediate Actions
+1. {RECOMMENDATION_1}
+2. {RECOMMENDATION_2}
+3. {RECOMMENDATION_3}
+### Preprocessing Steps
+- {PREPROCESSING_1}
+- {PREPROCESSING_2}
+- {PREPROCESSING_3}
+### Analytical Approaches
+{ANALYTICAL_APPROACHES}
+### Tools and Methods
+- **Recommended Software:** {RECOMMENDED_SOFTWARE}
+- **Statistical Methods:** {STATISTICAL_METHODS}
+- **Visualization Tools:** {VIZ_TOOLS}
+---
+## Data Processing Workflow
+```
+{WORKFLOW_DIAGRAM}
+```
+---
+## Potential Challenges
+1. **Challenge:** {CHALLENGE_1}
+   - **Mitigation:** {MITIGATION_1}
+2. **Challenge:** {CHALLENGE_2}
+   - **Mitigation:** {MITIGATION_2}
+---
+## References and Resources
+### Format Specification
+- {FORMAT_SPEC_LINK}
+### Python Libraries Documentation
+- {LIBRARY_DOCS}
+### Related Analysis Examples
+- {EXAMPLE_LINKS}
+---
+## Appendix
+### Complete File Metadata
+```json
+{COMPLETE_METADATA}
+```
+### Analysis Parameters
+```json
+{ANALYSIS_PARAMETERS}
+```
+### Software Versions
+- Python: {PYTHON_VERSION}
+- Key Libraries: {LIBRARY_VERSIONS}
+---
+*This report was automatically generated by the exploratory-data-analysis skill.*
+*For questions or issues, refer to the skill documentation.*

.scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md ADDED Viewed

	@@ -0,0 +1,664 @@

+# Bioinformatics and Genomics File Formats Reference
+This reference covers file formats used in genomics, transcriptomics, sequence analysis, and related bioinformatics applications.
+## Sequence Data Formats
+### .fasta / .fa / .fna - FASTA Format
+**Description:** Text-based format for nucleotide or protein sequences
+**Typical Data:** DNA, RNA, or protein sequences with headers
+**Use Cases:** Sequence storage, BLAST searches, alignments
+**Python Libraries:**
+- `Biopython`: `SeqIO.parse('file.fasta', 'fasta')`
+- `pyfaidx`: Fast indexed FASTA access
+- `screed`: Fast sequence parsing
+**EDA Approach:**
+- Sequence count and length distribution
+- GC content analysis
+- N content (ambiguous bases)
+- Sequence ID parsing
+- Duplicate detection
+- Quality metrics for assemblies (N50, L50)
+### .fastq / .fq - FASTQ Format
+**Description:** Sequence data with base quality scores
+**Typical Data:** Raw sequencing reads with Phred quality scores
+**Use Cases:** NGS data, quality control, read mapping
+**Python Libraries:**
+- `Biopython`: `SeqIO.parse('file.fastq', 'fastq')`
+- `pysam`: Fast FASTQ/BAM operations
+- `HTSeq`: Sequencing data analysis
+**EDA Approach:**
+- Read count and length distribution
+- Quality score distribution (per-base, per-read)
+- GC content and bias
+- Duplicate rate estimation
+- Adapter contamination detection
+- k-mer frequency analysis
+- Encoding format validation (Phred33/64)
+### .sam - Sequence Alignment/Map
+**Description:** Tab-delimited text format for alignments
+**Typical Data:** Aligned sequencing reads with mapping quality
+**Use Cases:** Read alignment storage, variant calling
+**Python Libraries:**
+- `pysam`: `pysam.AlignmentFile('file.sam', 'r')`
+- `HTSeq`: `HTSeq.SAM_Reader('file.sam')`
+**EDA Approach:**
+- Mapping rate and quality distribution
+- Coverage analysis
+- Insert size distribution (paired-end)
+- Alignment flags distribution
+- CIGAR string patterns
+- Mismatch and indel rates
+- Duplicate and supplementary alignment counts
+### .bam - Binary Alignment/Map
+**Description:** Compressed binary version of SAM
+**Typical Data:** Aligned reads in compressed format
+**Use Cases:** Efficient storage and processing of alignments
+**Python Libraries:**
+- `pysam`: Full BAM support with indexing
+- `bamnostic`: Pure Python BAM reader
+**EDA Approach:**
+- Same as SAM plus:
+- Compression ratio analysis
+- Index file (.bai) validation
+- Chromosome-wise statistics
+- Strand bias detection
+- Read group analysis
+### .cram - CRAM Format
+**Description:** Highly compressed alignment format
+**Typical Data:** Reference-compressed aligned reads
+**Use Cases:** Long-term storage, space-efficient archives
+**Python Libraries:**
+- `pysam`: CRAM support (requires reference)
+- Reference genome must be accessible
+**EDA Approach:**
+- Compression efficiency vs BAM
+- Reference dependency validation
+- Lossy vs lossless compression assessment
+- Decompression performance
+- Similar alignment metrics as BAM
+### .bed - Browser Extensible Data
+**Description:** Tab-delimited format for genomic features
+**Typical Data:** Genomic intervals (chr, start, end) with annotations
+**Use Cases:** Peak calling, variant annotation, genome browsing
+**Python Libraries:**
+- `pybedtools`: `pybedtools.BedTool('file.bed')`
+- `pyranges`: `pyranges.read_bed('file.bed')`
+- `pandas`: Simple BED reading
+**EDA Approach:**
+- Feature count and size distribution
+- Chromosome distribution
+- Strand bias
+- Score distribution (if present)
+- Overlap and proximity analysis
+- Coverage statistics
+- Gap analysis between features
+### .bedGraph - BED with Graph Data
+**Description:** BED format with per-base signal values
+**Typical Data:** Continuous-valued genomic data (coverage, signals)
+**Use Cases:** Coverage tracks, ChIP-seq signals, methylation
+**Python Libraries:**
+- `pyBigWig`: Can convert to bigWig
+- `pybedtools`: BedGraph operations
+**EDA Approach:**
+- Signal distribution statistics
+- Genome coverage percentage
+- Signal dynamics (peaks, valleys)
+- Chromosome-wise signal patterns
+- Quantile analysis
+- Zero-coverage regions
+### .bigWig / .bw - Binary BigWig
+**Description:** Indexed binary format for genome-wide signal data
+**Typical Data:** Continuous genomic signals (compressed and indexed)
+**Use Cases:** Efficient genome browser tracks, large-scale data
+**Python Libraries:**
+- `pyBigWig`: `pyBigWig.open('file.bw')`
+- `pybbi`: BigWig/BigBed interface
+**EDA Approach:**
+- Signal statistics extraction
+- Zoom level analysis
+- Regional signal extraction
+- Efficient genome-wide summaries
+- Compression efficiency
+- Index structure analysis
+### .bigBed / .bb - Binary BigBed
+**Description:** Indexed binary BED format
+**Typical Data:** Genomic features (compressed and indexed)
+**Use Cases:** Large feature sets, genome browsers
+**Python Libraries:**
+- `pybbi`: BigBed reading
+- `pybigtools`: Modern BigBed interface
+**EDA Approach:**
+- Feature density analysis
+- Efficient interval queries
+- Zoom level validation
+- Index performance metrics
+- Feature size statistics
+### .gff / .gff3 - General Feature Format
+**Description:** Tab-delimited format for genomic annotations
+**Typical Data:** Gene models, transcripts, exons, regulatory elements
+**Use Cases:** Genome annotation, gene prediction
+**Python Libraries:**
+- `BCBio.GFF`: GFF parser from the separate `bcbio-gff` package (produces Biopython `SeqRecord` objects)
+- `gffutils`: `gffutils.create_db('file.gff3')`
+- `pyranges`: GFF support
+**EDA Approach:**
+- Feature type distribution (gene, exon, CDS, etc.)
+- Gene structure validation
+- Strand balance
+- Hierarchical relationship validation
+- Phase validation for CDS
+- Attribute completeness
+- Gene model statistics (introns, exons per gene)
+### .gtf - Gene Transfer Format
+**Description:** GFF2-based format for gene annotations
+**Typical Data:** Gene and transcript annotations
+**Use Cases:** RNA-seq analysis, gene quantification
+**Python Libraries:**
+- `pyranges`: `pyranges.read_gtf('file.gtf')`
+- `gffutils`: GTF database creation
+- `HTSeq`: GTF reading for counts
+**EDA Approach:**
+- Transcript isoform analysis
+- Gene structure completeness
+- Exon number distribution
+- Transcript length distribution
+- TSS and TES analysis
+- Biotype distribution
+- Overlapping gene detection
+### .vcf - Variant Call Format
+**Description:** Text format for genetic variants
+**Typical Data:** SNPs, indels, structural variants with annotations
+**Use Cases:** Variant calling, population genetics, GWAS
+**Python Libraries:**
+- `pysam`: `pysam.VariantFile('file.vcf')`
+- `cyvcf2`: Fast VCF parsing
+- `PyVCF`: Older but comprehensive
+**EDA Approach:**
+- Variant count by type (SNP, indel, SV)
+- Quality score distribution
+- Allele frequency spectrum
+- Transition/transversion ratio
+- Heterozygosity rates
+- Missing genotype analysis
+- Hardy-Weinberg equilibrium
+- Annotation completeness (if annotated)
+### .bcf - Binary VCF
+**Description:** Compressed binary variant format
+**Typical Data:** Same as VCF but binary
+**Use Cases:** Efficient variant storage and processing
+**Python Libraries:**
+- `pysam`: Full BCF support
+- `cyvcf2`: Optimized BCF reading
+**EDA Approach:**
+- Same as VCF plus:
+- Compression efficiency
+- Indexing validation
+- Read performance metrics
+### .gvcf - Genomic VCF
+**Description:** VCF with reference confidence blocks
+**Typical Data:** All positions (variant and non-variant)
+**Use Cases:** Joint genotyping workflows, GATK
+**Python Libraries:**
+- `pysam`: GVCF support
+- Standard VCF parsers
+**EDA Approach:**
+- Reference block analysis
+- Coverage uniformity
+- Variant density
+- Genotype quality across genome
+- Reference confidence distribution
+## RNA-Seq and Expression Data
+### .counts - Gene Count Matrix
+**Description:** Tab-delimited gene expression counts
+**Typical Data:** Gene IDs with read counts per sample
+**Use Cases:** RNA-seq quantification, differential expression
+**Python Libraries:**
+- `pandas`: `pd.read_csv('file.counts', sep='\t')`
+- `scanpy` (for single-cell): `sc.read_csv()`
+**EDA Approach:**
+- Library size distribution
+- Detection rate (genes per sample)
+- Zero-inflation analysis
+- Count distribution (log scale)
+- Outlier sample detection
+- Correlation between replicates
+- PCA for sample relationships
+### .tpm / .fpkm - Normalized Expression
+**Description:** Normalized gene expression values
+**Typical Data:** TPM (transcripts per million) or FPKM values
+**Use Cases:** Cross-sample comparison, visualization
+**Python Libraries:**
+- `pandas`: Standard CSV reading
+- `anndata`: For integrated analysis
+**EDA Approach:**
+- Expression distribution
+- Highly expressed gene identification
+- Sample clustering
+- Batch effect detection
+- Coefficient of variation analysis
+- Dynamic range assessment
+### .mtx - Matrix Market Format
+**Description:** Sparse matrix format (common in single-cell)
+**Typical Data:** Sparse count matrices (cells × genes)
+**Use Cases:** Single-cell RNA-seq, large sparse matrices
+**Python Libraries:**
+- `scipy.io`: `scipy.io.mmread('file.mtx')`
+- `scanpy`: `sc.read_mtx('file.mtx')`
+**EDA Approach:**
+- Sparsity analysis
+- Cell and gene filtering thresholds
+- Doublet detection metrics
+- Mitochondrial fraction
+- UMI count distribution
+- Gene detection per cell
+### .h5ad - Anndata Format
+**Description:** HDF5-based annotated data matrix
+**Typical Data:** Expression matrix with metadata (cells, genes)
+**Use Cases:** Single-cell RNA-seq analysis with Scanpy
+**Python Libraries:**
+- `scanpy`: `sc.read_h5ad('file.h5ad')`
+- `anndata`: Direct AnnData manipulation
+**EDA Approach:**
+- Cell and gene counts
+- Metadata completeness
+- Layer availability (raw, normalized)
+- Embedding presence (PCA, UMAP)
+- QC metrics distribution
+- Batch information
+- Cell type annotation coverage
+### .loom - Loom Format
+**Description:** HDF5-based format for omics data
+**Typical Data:** Expression matrices with metadata
+**Use Cases:** Single-cell data, RNA velocity analysis
+**Python Libraries:**
+- `loompy`: `loompy.connect('file.loom')`
+- `scanpy`: Can import loom files
+**EDA Approach:**
+- Layer analysis (spliced, unspliced)
+- Row and column attribute exploration
+- Graph connectivity analysis
+- Cluster assignments
+- Velocity-specific metrics
+### .rds - R Data Serialization
+**Description:** R object storage (often Seurat objects)
+**Typical Data:** R analysis results, especially single-cell
+**Use Cases:** R-Python data exchange
+**Python Libraries:**
+- `pyreadr`: `pyreadr.read_r('file.rds')`
+- `rpy2`: For full R integration
+- Conversion tools to AnnData
+**EDA Approach:**
+- Object type identification
+- Data structure exploration
+- Metadata extraction
+- Conversion validation
+## Alignment and Assembly Formats
+### .maf - Multiple Alignment Format
+**Description:** Text format for multiple sequence alignments
+**Typical Data:** Genome-wide or local multiple alignments
+**Use Cases:** Comparative genomics, conservation analysis
+**Python Libraries:**
+- `Biopython`: `AlignIO.parse('file.maf', 'maf')`
+- `bx-python`: MAF-specific tools
+**EDA Approach:**
+- Alignment block statistics
+- Species coverage
+- Gap analysis
+- Conservation scoring
+- Alignment quality metrics
+- Block length distribution
+### .axt - Pairwise Alignment Format
+**Description:** Pairwise alignment format (UCSC)
+**Typical Data:** Pairwise genomic alignments
+**Use Cases:** Genome comparison, synteny analysis
+**Python Libraries:**
+- Custom parsers (simple format)
+- `bx-python`: AXT support
+**EDA Approach:**
+- Alignment score distribution
+- Identity percentage
+- Syntenic block identification
+- Gap size analysis
+- Coverage statistics
+### .chain - Chain Alignment Format
+**Description:** Genome coordinate mapping chains
+**Typical Data:** Coordinate transformations between genome builds
+**Use Cases:** Liftover, coordinate conversion
+**Python Libraries:**
+- `pyliftover`: Chain file usage
+- Custom parsers for chain format
+**EDA Approach:**
+- Chain score distribution
+- Coverage of source genome
+- Gap analysis
+- Inversion detection
+- Mapping quality assessment
+### .psl - Pattern Space Layout
+**Description:** Tab-delimited alignment format produced by BLAT (UCSC)
+**Typical Data:** Alignment results from BLAT
+**Use Cases:** Transcript mapping, similarity searches
+**Python Libraries:**
+- Custom parsers (tab-delimited)
+- `pybedtools`: Can handle PSL
+**EDA Approach:**
+- Match percentage distribution
+- Gap statistics
+- Query coverage
+- Multiple mapping analysis
+- Alignment quality metrics
+## Genome Assembly and Annotation
+### .agp - Assembly Golden Path
+**Description:** Assembly structure description
+**Typical Data:** Scaffold composition, gap information
+**Use Cases:** Genome assembly representation
+**Python Libraries:**
+- Custom parsers (simple tab-delimited)
+- Assembly analysis tools
+**EDA Approach:**
+- Scaffold statistics (N50, L50)
+- Gap type and size distribution
+- Component length analysis
+- Assembly contiguity metrics
+- Unplaced contig analysis
+### .scaffolds / .contigs - Assembly Sequences
+**Description:** Assembled sequences (usually FASTA)
+**Typical Data:** Assembled genomic sequences
+**Use Cases:** Genome assembly output
+**Python Libraries:**
+- Same as FASTA format
+- Assembly-specific tools (QUAST)
+**EDA Approach:**
+- Assembly statistics (N50, N90, etc.)
+- Length distribution
+- Coverage analysis
+- Gap (N) content
+- Duplication assessment
+- BUSCO completeness (if a suitable lineage dataset is available)
+### .2bit - Compressed Genome Format
+**Description:** UCSC compact genome format
+**Typical Data:** Reference genomes (highly compressed)
+**Use Cases:** Efficient genome storage and access
+**Python Libraries:**
+- `py2bit`: `py2bit.open('file.2bit')`
+- `twobitreader`: Alternative reader
+**EDA Approach:**
+- Compression efficiency
+- Random access performance
+- Sequence extraction validation
+- Masked region analysis
+- N content and distribution
+### .sizes - Chromosome Sizes
+**Description:** Simple format with chromosome lengths
+**Typical Data:** Tab-delimited chromosome names and sizes
+**Use Cases:** Genome browsers, coordinate validation
+**Python Libraries:**
+- Simple file reading with pandas
+- Built into many genomic tools
+**EDA Approach:**
+- Genome size calculation
+- Chromosome count
+- Size distribution
+- Karyotype validation
+- Completeness check against reference
+## Phylogenetics and Evolution
+### .nwk / .newick - Newick Tree Format
+**Description:** Parenthetical tree representation
+**Typical Data:** Phylogenetic trees with branch lengths
+**Use Cases:** Evolutionary analysis, tree visualization
+**Python Libraries:**
+- `Biopython`: `Phylo.read('file.nwk', 'newick')`
+- `ete3`: `ete3.Tree('file.nwk')`
+- `dendropy`: Phylogenetic computing
+**EDA Approach:**
+- Tree structure analysis (tips, internal nodes)
+- Branch length distribution
+- Tree balance metrics
+- Ultrametricity check
+- Bootstrap support analysis
+- Topology validation
+### .nexus - Nexus Format
+**Description:** Rich format for phylogenetic data
+**Typical Data:** Alignments, trees, character matrices
+**Use Cases:** Phylogenetic software interchange
+**Python Libraries:**
+- `Biopython`: Nexus support
+- `dendropy`: Comprehensive Nexus handling
+**EDA Approach:**
+- Data block analysis
+- Character type distribution
+- Tree block validation
+- Taxa consistency
+- Command block parsing
+- Format compliance checking
+### .phylip - PHYLIP Format
+**Description:** Sequence alignment format (strict/relaxed)
+**Typical Data:** Multiple sequence alignments
+**Use Cases:** Phylogenetic analysis input
+**Python Libraries:**
+- `Biopython`: `AlignIO.read('file.phy', 'phylip')`
+- `dendropy`: PHYLIP support
+**EDA Approach:**
+- Alignment dimensions
+- Sequence length uniformity
+- Gap position analysis
+- Informative site calculation
+- Format variant detection (strict vs relaxed)
+### .paml - PAML Output
+**Description:** Output from PAML phylogenetic software
+**Typical Data:** Evolutionary model results, dN/dS ratios
+**Use Cases:** Molecular evolution analysis
+**Python Libraries:**
+- Custom parsers for specific PAML programs
+- `Biopython`: Basic PAML parsing
+**EDA Approach:**
+- Model parameter extraction
+- Likelihood values
+- dN/dS ratio distribution
+- Branch-specific results
+- Convergence assessment
+## Protein and Structure Data
+### .embl - EMBL Format
+**Description:** Rich sequence annotation format
+**Typical Data:** Sequences with extensive annotations
+**Use Cases:** Sequence databases, genome records
+**Python Libraries:**
+- `Biopython`: `SeqIO.read('file.embl', 'embl')`
+**EDA Approach:**
+- Feature annotation completeness
+- Sequence length and type
+- Reference information
+- Cross-reference validation
+- Feature overlap analysis
+### .genbank / .gb / .gbk - GenBank Format
+**Description:** NCBI's sequence annotation format
+**Typical Data:** Annotated sequences with features
+**Use Cases:** Sequence databases, annotation transfer
+**Python Libraries:**
+- `Biopython`: `SeqIO.parse('file.gb', 'genbank')`
+**EDA Approach:**
+- Feature type distribution
+- CDS analysis (start codons, stops)
+- Translation validation
+- Annotation completeness
+- Source organism extraction
+- Reference and publication info
+- Locus tag consistency
+### .sff - Standard Flowgram Format
+**Description:** 454/Roche sequencing data format
+**Typical Data:** Raw pyrosequencing flowgrams
+**Use Cases:** Legacy 454 sequencing data
+**Python Libraries:**
+- `Biopython`: `SeqIO.parse('file.sff', 'sff')`
+- Platform-specific tools
+**EDA Approach:**
+- Read count and length
+- Flowgram signal quality
+- Key sequence detection
+- Adapter trimming validation
+- Quality score distribution
+### .hdf5 (Genomics Specific)
+**Description:** HDF5 for genomics (10X, Hi-C, etc.)
+**Typical Data:** High-throughput genomics data
+**Use Cases:** 10X Genomics, spatial transcriptomics
+**Python Libraries:**
+- `h5py`: Low-level access
+- `scanpy`: For 10X data
+- `cooler`: For Hi-C data
+**EDA Approach:**
+- Dataset structure exploration
+- Barcode statistics
+- UMI counting
+- Feature-barcode matrix analysis
+- Spatial coordinates (if applicable)
+### .cool / .mcool - Cooler Format
+**Description:** HDF5-based Hi-C contact matrices
+**Typical Data:** Chromatin interaction matrices
+**Use Cases:** 3D genome analysis, Hi-C data
+**Python Libraries:**
+- `cooler`: `cooler.Cooler('file.cool')`
+- `hicstraw`: For .hic format
+**EDA Approach:**
+- Resolution analysis
+- Contact matrix statistics
+- Distance decay curves
+- Compartment analysis
+- TAD boundary detection
+- Balance factor validation
+### .hic - Hi-C Binary Format
+**Description:** Juicer binary Hi-C format
+**Typical Data:** Multi-resolution Hi-C matrices
+**Use Cases:** Hi-C analysis with Juicer tools
+**Python Libraries:**
+- `hicstraw`: `hicstraw.HiCFile('file.hic')`
+- `straw`: C++ library with Python bindings
+**EDA Approach:**
+- Available resolutions
+- Normalization methods
+- Contact statistics
+- Chromosomal interactions
+- Quality metrics
+### .bw (ChIP-seq / ATAC-seq specific)
+**Description:** BigWig files for epigenomics
+**Typical Data:** Coverage or enrichment signals
+**Use Cases:** ChIP-seq, ATAC-seq, DNase-seq
+**Python Libraries:**
+- `pyBigWig`: Standard bigWig access
+**EDA Approach:**
+- Peak enrichment patterns
+- Background signal analysis
+- Sample correlation
+- Signal-to-noise ratio
+- Library complexity metrics
+### .narrowPeak / .broadPeak - ENCODE Peak Formats
+**Description:** BED-based formats for peaks
+**Typical Data:** Peak calls with scores and p-values
+**Use Cases:** ChIP-seq peak calling output
+**Python Libraries:**
+- `pybedtools`: BED-compatible
+- Custom parsers for peak-specific fields
+**EDA Approach:**
+- Peak count and width distribution
+- Signal value distribution
+- Q-value and p-value analysis
+- Peak summit analysis
+- Overlap with known features
+- Motif enrichment preparation
+### .wig - Wiggle Format
+**Description:** Dense continuous genomic data
+**Typical Data:** Coverage or signal tracks
+**Use Cases:** Genome browser visualization
+**Python Libraries:**
+- `pyBigWig`: Can convert to bigWig
+- Custom parsers for wiggle format
+**EDA Approach:**
+- Signal statistics
+- Coverage metrics
+- Format variant (fixedStep vs variableStep)
+- Span parameter analysis
+- Conversion efficiency to bigWig
+### .ab1 - Sanger Sequencing Trace
+**Description:** Binary chromatogram format
+**Typical Data:** Sanger sequencing traces
+**Use Cases:** Capillary sequencing validation
+**Python Libraries:**
+- `Biopython`: `SeqIO.read('file.ab1', 'abi')`
+- `tracy` tools: For quality assessment
+**EDA Approach:**
+- Base calling quality
+- Trace quality scores
+- Mixed base detection
+- Primer and vector detection
+- Read length and quality region
+- Heterozygosity detection
+### .scf - Standard Chromatogram Format
+**Description:** Sanger sequencing chromatogram
+**Typical Data:** Base calls and confidence values
+**Use Cases:** Sequencing trace analysis
+**Python Libraries:**
+- `Biopython`: SCF format support
+**EDA Approach:**
+- Similar to AB1 format
+- Quality score profiles
+- Peak height ratios
+- Signal-to-noise metrics
+### .idx - Index Files (Generic)
+**Description:** Index files for various formats
+**Typical Data:** Fast random access indices
+**Use Cases:** Efficient data access (BAM, VCF, etc.)
+**Python Libraries:**
+- Format-specific libraries handle indices
+- `pysam`: Auto-handles BAI, CSI indices
+**EDA Approach:**
+- Index completeness validation
+- Binning strategy analysis
+- Access performance metrics
+- Index size vs data size ratio

.scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md ADDED Viewed

	@@ -0,0 +1,664 @@

+# Chemistry and Molecular File Formats Reference
+This reference covers file formats commonly used in computational chemistry, cheminformatics, molecular modeling, and related fields.
+## Structure File Formats
+### .pdb - Protein Data Bank
+**Description:** Standard format for 3D structures of biological macromolecules
+**Typical Data:** Atomic coordinates, residue information, secondary structure, crystal structure data
+**Use Cases:** Protein structure analysis, molecular visualization, docking studies
+**Python Libraries:**
+- `Biopython`: `Bio.PDB`
+- `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
+- `PyMOL`: `pymol.cmd.load('file.pdb')`
+- `ProDy`: `prody.parsePDB('file.pdb')`
+**EDA Approach:**
+- Structure validation (bond lengths, angles, clashes)
+- Secondary structure analysis
+- B-factor distribution
+- Missing residues/atoms detection
+- Ramachandran plots for validation
+- Surface area and volume calculations
+### .cif - Crystallographic Information File
+**Description:** Structured data format for crystallographic information
+**Typical Data:** Unit cell parameters, atomic coordinates, symmetry operations, experimental data
+**Use Cases:** Crystal structure determination, structural biology, materials science
+**Python Libraries:**
+- `gemmi`: `gemmi.cif.read_file('file.cif')`
+- `PyCifRW`: `CifFile.ReadCif('file.cif')`
+- `Biopython`: `Bio.PDB.MMCIFParser()`
+**EDA Approach:**
+- Data completeness check
+- Resolution and quality metrics
+- Unit cell parameter analysis
+- Symmetry group validation
+- Atomic displacement parameters
+- R-factors and validation metrics
+### .mol - MDL Molfile
+**Description:** Chemical structure file format by MDL/Accelrys
+**Typical Data:** 2D/3D coordinates, atom types, bond orders, charges
+**Use Cases:** Chemical database storage, cheminformatics, drug design
+**Python Libraries:**
+- `RDKit`: `Chem.MolFromMolFile('file.mol')`
+- `Open Babel`: `pybel.readfile('mol', 'file.mol')`
+- `ChemoPy`: For descriptor calculation
+**EDA Approach:**
+- Molecular property calculation (MW, logP, TPSA)
+- Functional group analysis
+- Ring system detection
+- Stereochemistry validation
+- 2D/3D coordinate consistency
+- Valence and charge validation
+### .mol2 - Tripos Mol2
+**Description:** Complete 3D molecular structure format with atom typing
+**Typical Data:** Coordinates, SYBYL atom types, bond types, charges, substructures
+**Use Cases:** Molecular docking, QSAR studies, drug discovery
+**Python Libraries:**
+- `RDKit`: `Chem.MolFromMol2File('file.mol2')`
+- `Open Babel`: `pybel.readfile('mol2', 'file.mol2')`
+- `MDAnalysis`: Can parse mol2 topology
+**EDA Approach:**
+- Atom type distribution
+- Partial charge analysis
+- Bond type statistics
+- Substructure identification
+- Conformational analysis
+- Energy minimization status check
+### .sdf - Structure Data File
+**Description:** Multi-structure file format with associated data
+**Typical Data:** Multiple molecular structures with properties/annotations
+**Use Cases:** Chemical databases, virtual screening, compound libraries
+**Python Libraries:**
+- `RDKit`: `Chem.SDMolSupplier('file.sdf')`
+- `Open Babel`: `pybel.readfile('sdf', 'file.sdf')`
+- `PandasTools` (RDKit): For DataFrame integration
+**EDA Approach:**
+- Dataset size and diversity metrics
+- Property distribution analysis (MW, logP, etc.)
+- Structural diversity (Tanimoto similarity)
+- Missing data assessment
+- Outlier detection in properties
+- Scaffold analysis
+### .xyz - XYZ Coordinates
+**Description:** Simple Cartesian coordinate format
+**Typical Data:** Atom types and 3D coordinates
+**Use Cases:** Quantum chemistry, geometry optimization, molecular dynamics
+**Python Libraries:**
+- `ASE`: `ase.io.read('file.xyz')`
+- `Open Babel`: `pybel.readfile('xyz', 'file.xyz')`
+- `cclib`: For parsing QM outputs with xyz
+**EDA Approach:**
+- Geometry analysis (bond lengths, angles, dihedrals)
+- Center of mass calculation
+- Moment of inertia
+- Molecular size metrics
+- Coordinate validation
+- Symmetry detection
+### .smi / .smiles - SMILES String
+**Description:** Line notation for chemical structures
+**Typical Data:** Text representation of molecular structure
+**Use Cases:** Chemical databases, literature mining, data exchange
+**Python Libraries:**
+- `RDKit`: `Chem.MolFromSmiles(smiles)`
+- `Open Babel`: Can parse SMILES
+- `DeepChem`: For ML on SMILES
+**EDA Approach:**
+- SMILES syntax validation
+- Descriptor calculation from SMILES
+- Fingerprint generation
+- Substructure searching
+- Tautomer enumeration
+- Stereoisomer handling
+### .pdbqt - AutoDock PDBQT
+**Description:** Modified PDB format for AutoDock docking
+**Typical Data:** Coordinates, partial charges, atom types for docking
+**Use Cases:** Molecular docking, virtual screening
+**Python Libraries:**
+- `Meeko`: For PDBQT preparation
+- `Open Babel`: Can read PDBQT
+- `ProDy`: Limited PDBQT support
+**EDA Approach:**
+- Charge distribution analysis
+- Rotatable bond identification
+- Atom type validation
+- Coordinate quality check
+- Hydrogen placement validation
+- Torsion definition analysis
+### .mae - Maestro Format
+**Description:** Schrödinger's proprietary molecular structure format
+**Typical Data:** Structures, properties, annotations from Schrödinger suite
+**Use Cases:** Drug discovery, molecular modeling with Schrödinger tools
+**Python Libraries:**
+- `schrodinger.structure`: Requires Schrödinger installation
+- Custom parsers for basic reading
+**EDA Approach:**
+- Property extraction and analysis
+- Structure quality metrics
+- Conformer analysis
+- Docking score distributions
+- Ligand efficiency metrics
+### .gro - GROMACS Coordinate File
+**Description:** Molecular structure file for GROMACS MD simulations
+**Typical Data:** Atom positions, velocities, box vectors
+**Use Cases:** Molecular dynamics simulations, GROMACS workflows
+**Python Libraries:**
+- `MDAnalysis`: `Universe('file.gro')`
+- `MDTraj`: `mdtraj.load_gro('file.gro')`
+- `GromacsWrapper`: For GROMACS integration
+**EDA Approach:**
+- System composition analysis
+- Box dimension validation
+- Atom position distribution
+- Velocity distribution (if present)
+- Density calculation
+- Solvation analysis
+## Computational Chemistry Output Formats
+### .log - Gaussian Log File
+**Description:** Output from Gaussian quantum chemistry calculations
+**Typical Data:** Energies, geometries, frequencies, orbitals, populations
+**Use Cases:** QM calculations, geometry optimization, frequency analysis
+**Python Libraries:**
+- `cclib`: `cclib.io.ccread('file.log')`
+- `GaussianRunPack`: For Gaussian workflows
+- Custom parsers with regex
+**EDA Approach:**
+- Convergence analysis
+- Energy profile extraction
+- Vibrational frequency analysis
+- Orbital energy levels
+- Population analysis (Mulliken, NBO)
+- Thermochemistry data extraction
+### .out - Quantum Chemistry Output
+**Description:** Generic output file from various QM packages
+**Typical Data:** Calculation results, energies, properties
+**Use Cases:** QM calculations across different software
+**Python Libraries:**
+- `cclib`: Universal parser for QM outputs
+- `ASE`: Can read some output formats
+**EDA Approach:**
+- Software-specific parsing
+- Convergence criteria check
+- Energy and gradient trends
+- Basis set and method validation
+- Computational cost analysis
+### .wfn / .wfx - Wavefunction Files
+**Description:** Wavefunction data for quantum chemical analysis
+**Typical Data:** Molecular orbitals, basis sets, density matrices
+**Use Cases:** Electron density analysis, QTAIM analysis
+**Python Libraries:**
+- `Multiwfn`: Interface via Python
+- `Horton`: For wavefunction analysis
+- Custom parsers for specific formats
+**EDA Approach:**
+- Orbital population analysis
+- Electron density distribution
+- Critical point analysis (QTAIM)
+- Molecular orbital visualization
+- Bonding analysis
+### .fchk - Gaussian Formatted Checkpoint
+**Description:** Formatted checkpoint file from Gaussian
+**Typical Data:** Complete wavefunction data, results, geometry
+**Use Cases:** Post-processing Gaussian calculations
+**Python Libraries:**
+- `cclib`: Can parse fchk files
+- `GaussView` Python API (if available)
+- Custom parsers
+**EDA Approach:**
+- Wavefunction quality assessment
+- Property extraction
+- Basis set information
+- Gradient and Hessian analysis
+- Natural orbital analysis
+### .cube - Gaussian Cube File
+**Description:** Volumetric data on a 3D grid
+**Typical Data:** Electron density, molecular orbitals, ESP on grid
+**Use Cases:** Visualization of volumetric properties
+**Python Libraries:**
+- `cclib`: `cclib.io.ccread('file.cube')`
+- `ase.io`: `ase.io.read('file.cube')`
+- `pyquante`: For cube file manipulation
+**EDA Approach:**
+- Grid dimension and spacing analysis
+- Value distribution statistics
+- Isosurface value determination
+- Integration over volume
+- Comparison between different cubes
+## Molecular Dynamics Formats
+### .dcd - Binary Trajectory
+**Description:** Binary trajectory format (CHARMM, NAMD)
+**Typical Data:** Time series of atomic coordinates
+**Use Cases:** MD trajectory analysis
+**Python Libraries:**
+- `MDAnalysis`: `Universe(topology, 'traj.dcd')`
+- `MDTraj`: `mdtraj.load_dcd('traj.dcd', top='topology.pdb')`
+- `PyTraj` (Amber): Limited support
+**EDA Approach:**
+- RMSD/RMSF analysis
+- Trajectory length and frame count
+- Coordinate range and drift
+- Periodic boundary handling
+- File integrity check
+- Time step validation
+### .xtc - Compressed Trajectory
+**Description:** GROMACS compressed trajectory format
+**Typical Data:** Compressed coordinates from MD simulations
+**Use Cases:** Space-efficient MD trajectory storage
+**Python Libraries:**
+- `MDAnalysis`: `Universe(topology, 'traj.xtc')`
+- `MDTraj`: `mdtraj.load_xtc('traj.xtc', top='topology.pdb')`
+**EDA Approach:**
+- Compression ratio assessment
+- Precision loss evaluation
+- RMSD over time
+- Structural stability metrics
+- Sampling frequency analysis
+### .trr - GROMACS Trajectory
+**Description:** Full precision GROMACS trajectory
+**Typical Data:** Coordinates, velocities, forces from MD
+**Use Cases:** High-precision MD analysis
+**Python Libraries:**
+- `MDAnalysis`: Full support
+- `MDTraj`: Can read trr files
+- `GromacsWrapper`
+**EDA Approach:**
+- Full system dynamics analysis
+- Energy conservation check (with velocities)
+- Force analysis
+- Temperature and pressure validation
+- System equilibration assessment
+### .nc / .netcdf - Amber NetCDF Trajectory
+**Description:** Network Common Data Form trajectory
+**Typical Data:** MD coordinates, velocities, forces
+**Use Cases:** Amber MD simulations, large trajectory storage
+**Python Libraries:**
+- `MDAnalysis`: NetCDF support
+- `PyTraj`: Native Amber analysis
+- `netCDF4`: Low-level access
+**EDA Approach:**
+- Metadata extraction
+- Trajectory statistics
+- Time series analysis
+- Replica exchange analysis
+- Multi-dimensional data extraction
+### .top - GROMACS Topology
+**Description:** Molecular topology for GROMACS
+**Typical Data:** Atom types, bonds, angles, force field parameters
+**Use Cases:** MD simulation setup and analysis
+**Python Libraries:**
+- `ParmEd`: `parmed.load_file('system.top')`
+- `MDAnalysis`: Can parse topology
+- Custom parsers for specific fields
+**EDA Approach:**
+- Force field parameter validation
+- System composition
+- Bond/angle/dihedral distribution
+- Charge neutrality check
+- Molecule type enumeration
+### .psf - Protein Structure File (CHARMM)
+**Description:** Topology file for CHARMM/NAMD
+**Typical Data:** Atom connectivity, types, charges
+**Use Cases:** CHARMM/NAMD MD simulations
+**Python Libraries:**
+- `MDAnalysis`: Native PSF support
+- `ParmEd`: Can read PSF files
+**EDA Approach:**
+- Topology validation
+- Connectivity analysis
+- Charge distribution
+- Atom type statistics
+- Segment analysis
+### .prmtop - Amber Parameter/Topology
+**Description:** Amber topology and parameter file
+**Typical Data:** System topology, force field parameters
+**Use Cases:** Amber MD simulations
+**Python Libraries:**
+- `ParmEd`: `parmed.load_file('system.prmtop')`
+- `PyTraj`: Native Amber support
+**EDA Approach:**
+- Force field completeness
+- Parameter validation
+- System size and composition
+- Periodic box information
+- Atom mask creation for analysis
+### .inpcrd / .rst7 - Amber Coordinates
+**Description:** Amber coordinate/restart file
+**Typical Data:** Atomic coordinates, velocities, box info
+**Use Cases:** Starting coordinates for Amber MD
+**Python Libraries:**
+- `ParmEd`: Works with prmtop
+- `PyTraj`: Amber coordinate reading
+**EDA Approach:**
+- Coordinate validity
+- System initialization check
+- Box vector validation
+- Velocity distribution (if restart)
+- Energy minimization status
+## Spectroscopy and Analytical Data
+### .jcamp / .jdx - JCAMP-DX
+**Description:** Text-based spectral data exchange format defined by the Joint Committee on Atomic and Molecular Physical Data (JCAMP); "DX" stands for Data Exchange
+**Typical Data:** Spectroscopic data (IR, NMR, MS, UV-Vis)
+**Use Cases:** Spectroscopy data exchange and archiving
+**Python Libraries:**
+- `jcamp`: `jcamp.jcamp_readfile('file.jdx')`
+- `nmrglue`: For NMR JCAMP files
+- Custom parsers for specific subtypes
+**EDA Approach:**
+- Peak detection and analysis
+- Baseline correction assessment
+- Signal-to-noise calculation
+- Spectral range validation
+- Integration analysis
+- Comparison with reference spectra
+### .mzML - Mass Spectrometry Markup Language
+**Description:** Standard XML format for mass spectrometry data
+**Typical Data:** MS/MS spectra, chromatograms, metadata
+**Use Cases:** Proteomics, metabolomics, mass spectrometry workflows
+**Python Libraries:**
+- `pymzml`: `pymzml.run.Reader('file.mzML')`
+- `pyteomics`: `pyteomics.mzml.read('file.mzML')`
+- `MSFileReader` wrappers
+**EDA Approach:**
+- Scan count and types
+- MS level distribution
+- Retention time range
+- m/z range and resolution
+- Peak intensity distribution
+- Data completeness
+- Quality control metrics
+### .mzXML - Mass Spectrometry XML
+**Description:** Open XML format for MS data
+**Typical Data:** Mass spectra, retention times, peak lists
+**Use Cases:** Legacy MS data, metabolomics
+**Python Libraries:**
+- `pymzml`: Can read mzXML
+- `pyteomics.mzxml`
+- `lxml` for direct XML parsing
+**EDA Approach:**
+- Similar to mzML
+- Version compatibility check
+- Conversion quality assessment
+- Peak picking validation
+### .raw - Vendor Raw Data
+**Description:** Proprietary instrument data files (Thermo, Bruker, etc.)
+**Typical Data:** Raw instrument signals, unprocessed data
+**Use Cases:** Direct instrument data access
+**Python Libraries:**
+- `pymsfilereader`: For Thermo RAW files
+- `ThermoRawFileParser`: CLI wrapper
+- Vendor-specific APIs (Thermo, Bruker Compass)
+**EDA Approach:**
+- Instrument method extraction
+- Raw signal quality
+- Calibration status
+- Scan function analysis
+- Chromatographic quality metrics
+### .d - Agilent Data Directory
+**Description:** Agilent's data folder structure
+**Typical Data:** LC-MS, GC-MS data and metadata
+**Use Cases:** Agilent instrument data processing
+**Python Libraries:**
+- `agilent-reader`: Community tools
+- `Chemstation` Python integration
+- Custom directory parsing
+**EDA Approach:**
+- Directory structure validation
+- Method parameter extraction
+- Signal file integrity
+- Calibration curve analysis
+- Sequence information extraction
+### .fid - NMR Free Induction Decay
+**Description:** Raw NMR time-domain data
+**Typical Data:** Time-domain NMR signal
+**Use Cases:** NMR processing and analysis
+**Python Libraries:**
+- `nmrglue`: `nmrglue.bruker.read_fid('fid')`
+- `nmrstarlib`: For NMR-STAR files
+**EDA Approach:**
+- Signal decay analysis
+- Noise level assessment
+- Acquisition parameter validation
+- Apodization function selection
+- Zero-filling optimization
+- Phasing parameter estimation
+### .ft - NMR Frequency-Domain Data
+**Description:** Processed NMR spectrum
+**Typical Data:** Frequency-domain NMR data
+**Use Cases:** NMR analysis and interpretation
+**Python Libraries:**
+- `nmrglue`: Comprehensive NMR support
+- `pyNMR`: For processing
+**EDA Approach:**
+- Peak picking and integration
+- Chemical shift calibration
+- Multiplicity analysis
+- Coupling constant extraction
+- Spectral quality metrics
+- Reference compound identification
+### .spc - Spectroscopy File
+**Description:** Thermo Galactic spectroscopy format
+**Typical Data:** IR, Raman, UV-Vis spectra
+**Use Cases:** Spectroscopic data from various instruments
+**Python Libraries:**
+- `spc`: `spc.File('file.spc')`
+- Custom parsers for binary format
+**EDA Approach:**
+- Spectral resolution
+- Wavelength/wavenumber range
+- Baseline characterization
+- Peak identification
+- Derivative spectra calculation
+## Chemical Database Formats
+### .inchi - International Chemical Identifier
+**Description:** Text identifier for chemical substances
+**Typical Data:** Layered chemical structure representation
+**Use Cases:** Chemical database keys, structure searching
+**Python Libraries:**
+- `RDKit`: `Chem.MolFromInchi(inchi)`
+- `Open Babel`: InChI conversion
+**EDA Approach:**
+- InChI validation
+- Layer analysis
+- Stereochemistry verification
+- InChI key generation
+- Structure round-trip validation
+### .cdx / .cdxml - ChemDraw Exchange
+**Description:** ChemDraw drawing file format
+**Typical Data:** 2D chemical structures with annotations
+**Use Cases:** Chemical drawing, publication figures
+**Python Libraries:**
+- `RDKit`: Can import some CDXML
+- `Open Babel`: Limited support
+- `ChemDraw` Python API (commercial)
+**EDA Approach:**
+- Structure extraction
+- Annotation preservation
+- Style consistency
+- 2D coordinate validation
+### .cml - Chemical Markup Language
+**Description:** XML-based chemical structure format
+**Typical Data:** Chemical structures, reactions, properties
+**Use Cases:** Semantic chemical data representation
+**Python Libraries:**
+- `RDKit`: CML support
+- `Open Babel`: Good CML support
+- `lxml`: For XML parsing
+**EDA Approach:**
+- XML schema validation
+- Namespace handling
+- Property extraction
+- Reaction scheme analysis
+- Metadata completeness
+### .rxn - MDL Reaction File
+**Description:** Chemical reaction structure file
+**Typical Data:** Reactants, products, reaction arrows
+**Use Cases:** Reaction databases, synthesis planning
+**Python Libraries:**
+- `RDKit`: `AllChem.ReactionFromRxnFile('file.rxn')` (from `rdkit.Chem import AllChem`)
+- `Open Babel`: Reaction support
+**EDA Approach:**
+- Reaction balancing validation
+- Atom mapping analysis
+- Reagent identification
+- Stereochemistry changes
+- Reaction classification
+### .rdf - Reaction Data File
+**Description:** Multi-reaction file format
+**Typical Data:** Multiple reactions with data
+**Use Cases:** Reaction databases
+**Python Libraries:**
+- `RDKit`: RDF reading capabilities
+- Custom parsers
+**EDA Approach:**
+- Reaction yield statistics
+- Condition analysis
+- Success rate patterns
+- Reagent frequency analysis
+## Computational Output and Data
+### .hdf5 / .h5 - Hierarchical Data Format
+**Description:** Container for scientific data arrays
+**Typical Data:** Large arrays, metadata, hierarchical organization
+**Use Cases:** Large dataset storage, computational results
+**Python Libraries:**
+- `h5py`: `h5py.File('file.h5', 'r')`
+- `PyTables` (`import tables`): Advanced HDF5 interface
+- `pandas`: Can read HDF5
+**EDA Approach:**
+- Dataset structure exploration
+- Array shape and dtype analysis
+- Metadata extraction
+- Memory-efficient data sampling
+- Chunk optimization analysis
+- Compression ratio assessment
+### .pkl / .pickle - Python Pickle
+**Description:** Serialized Python objects
+**Typical Data:** Any Python object (molecules, dataframes, models)
+**Use Cases:** Intermediate data storage, model persistence
+**Python Libraries:**
+- `pickle`: Built-in serialization
+- `joblib`: Enhanced pickling for large arrays
+- `dill`: Extended pickle support
+**EDA Approach:**
+- Object type inspection
+- Size and complexity analysis
+- Version compatibility check
+- Security validation (trusted source)
+- Deserialization testing
+### .npy / .npz - NumPy Arrays
+**Description:** NumPy array binary format
+**Typical Data:** Numerical arrays (coordinates, features, matrices)
+**Use Cases:** Fast numerical data I/O
+**Python Libraries:**
+- `numpy`: `np.load('file.npy')`
+- Direct memory mapping for large files
+**EDA Approach:**
+- Array shape and dimensions
+- Data type and precision
+- Statistical summary (mean, std, range)
+- Missing value detection
+- Outlier identification
+- Memory footprint analysis
+### .mat - MATLAB Data File
+**Description:** MATLAB workspace data
+**Typical Data:** Arrays, structures from MATLAB
+**Use Cases:** MATLAB-Python data exchange
+**Python Libraries:**
+- `scipy.io`: `scipy.io.loadmat('file.mat')`
+- `h5py`: For v7.3 MAT files
+**EDA Approach:**
+- Variable extraction and types
+- Array dimension analysis
+- Structure field exploration
+- MATLAB version compatibility
+- Data type conversion validation
+### .csv - Comma-Separated Values
+**Description:** Tabular data in text format
+**Typical Data:** Chemical properties, experimental data, descriptors
+**Use Cases:** Data exchange, analysis, machine learning
+**Python Libraries:**
+- `pandas`: `pd.read_csv('file.csv')`
+- `csv`: Built-in module
+- `polars`: Fast CSV reading
+**EDA Approach:**
+- Data types inference
+- Missing value patterns
+- Statistical summaries
+- Correlation analysis
+- Distribution visualization
+- Outlier detection
+### .json - JavaScript Object Notation
+**Description:** Structured text data format
+**Typical Data:** Chemical properties, metadata, API responses
+**Use Cases:** Data interchange, configuration, web APIs
+**Python Libraries:**
+- `json`: Built-in JSON support
+- `pandas`: `pd.read_json()`
+- `ujson`: Faster JSON parsing
+**EDA Approach:**
+- Schema validation
+- Nesting depth analysis
+- Key-value distribution
+- Data type consistency
+- Array length statistics
+### .parquet - Apache Parquet
+**Description:** Columnar storage format
+**Typical Data:** Large tabular datasets stored efficiently by column
+**Use Cases:** Big data, efficient columnar analytics
+**Python Libraries:**
+- `pandas`: `pd.read_parquet('file.parquet')`
+- `pyarrow`: Direct parquet access
+- `fastparquet`: Alternative implementation
+**EDA Approach:**
+- Column statistics from metadata
+- Partition analysis
+- Compression efficiency
+- Row group structure
+- Fast sampling for large files
+- Schema evolution tracking

.scider/skills/exploratory-data-analysis/references/general_scientific_formats.md ADDED Viewed

	@@ -0,0 +1,518 @@

+# General Scientific Data Formats Reference
+This reference covers general-purpose scientific data formats used across multiple disciplines.
+## Numerical and Array Data
+### .npy - NumPy Array
+**Description:** Binary NumPy array format
+**Typical Data:** N-dimensional arrays of any data type
+**Use Cases:** Fast I/O for numerical data, intermediate results
+**Python Libraries:**
+- `numpy`: `np.load('file.npy')`, `np.save()`
+- Memory-mapped access: `np.load('file.npy', mmap_mode='r')`
+**EDA Approach:**
+- Array shape and dimensionality
+- Data type and precision
+- Statistical summary (mean, std, min, max, percentiles)
+- Missing or invalid values (NaN, inf)
+- Memory footprint
+- Value distribution and histogram
+- Sparsity analysis
+- Correlation structure (if 2D)
+### .npz - Compressed NumPy Archive
+**Description:** Multiple NumPy arrays in one file
+**Typical Data:** Collections of related arrays
+**Use Cases:** Saving multiple arrays together, compressed storage
+**Python Libraries:**
+- `numpy`: `np.load('file.npz')` returns dict-like object
+- `np.savez()` or `np.savez_compressed()`
+**EDA Approach:**
+- List of contained arrays
+- Individual array analysis
+- Relationships between arrays
+- Total file size and compression ratio
+- Naming conventions
+- Data consistency checks
+### .csv - Comma-Separated Values
+**Description:** Plain text tabular data
+**Typical Data:** Experimental measurements, results tables
+**Use Cases:** Universal data exchange, spreadsheet export
+**Python Libraries:**
+- `pandas`: `pd.read_csv('file.csv')`
+- `csv`: Built-in module
+- `polars`: High-performance CSV reading
+- `numpy`: `np.loadtxt()` or `np.genfromtxt()`
+**EDA Approach:**
+- Row and column counts
+- Data type inference
+- Missing value patterns and frequency
+- Column statistics (numeric: mean, std; categorical: frequencies)
+- Outlier detection
+- Correlation matrix
+- Duplicate row detection
+- Header and index validation
+- Encoding issues detection
+### .tsv / .tab - Tab-Separated Values
+**Description:** Tab-delimited tabular data
+**Typical Data:** Similar to CSV but tab-separated
+**Use Cases:** Bioinformatics, text processing output
+**Python Libraries:**
+- `pandas`: `pd.read_csv('file.tsv', sep='\t')`
+**EDA Approach:**
+- Same as CSV format
+- Tab vs space validation
+- Quote handling
+### .xlsx / .xls - Excel Spreadsheets
+**Description:** Microsoft Excel binary/XML formats
+**Typical Data:** Tabular data with formatting, formulas
+**Use Cases:** Lab notebooks, data entry, reports
+**Python Libraries:**
+- `pandas`: `pd.read_excel('file.xlsx')`
+- `openpyxl`: Full Excel file manipulation
+- `xlrd`: Reading .xls (legacy)
+**EDA Approach:**
+- Sheet enumeration and names
+- Per-sheet data analysis
+- Formula evaluation
+- Merged cells handling
+- Hidden rows/columns
+- Data validation rules
+- Named ranges
+- Formatting-only cells detection
+### .json - JavaScript Object Notation
+**Description:** Hierarchical text data format
+**Typical Data:** Nested data structures, metadata
+**Use Cases:** API responses, configuration, results
+**Python Libraries:**
+- `json`: Built-in module
+- `pandas`: `pd.read_json()`
+- `ujson`: Faster JSON parsing
+**EDA Approach:**
+- Schema inference
+- Nesting depth
+- Key-value distribution
+- Array lengths
+- Data type consistency
+- Missing keys
+- Duplicate detection
+- Size and complexity metrics
+### .xml - Extensible Markup Language
+**Description:** Hierarchical markup format
+**Typical Data:** Structured data with metadata
+**Use Cases:** Standards-based data exchange, APIs
+**Python Libraries:**
+- `lxml`: `lxml.etree.parse()`
+- `xml.etree.ElementTree`: Built-in XML
+- `xmltodict`: Convert XML to dict
+**EDA Approach:**
+- Schema/DTD validation
+- Element hierarchy and depth
+- Namespace handling
+- Attribute vs element content
+- CDATA sections
+- Text content extraction
+- Sibling and child counts
+### .yaml / .yml - YAML
+**Description:** Human-readable data serialization
+**Typical Data:** Configuration, metadata, parameters
+**Use Cases:** Experiment configurations, pipelines
+**Python Libraries:**
+- `yaml` (PyYAML): `yaml.safe_load()` (preferred); `yaml.load()` requires an explicit `Loader` and is unsafe on untrusted input
+- `ruamel.yaml`: YAML 1.2 support
+**EDA Approach:**
+- Configuration structure
+- Data type handling
+- List and dict depth
+- Anchor and alias usage
+- Multi-document files
+- Comments preservation
+- Validation against schema
+### .toml - TOML Configuration
+**Description:** Configuration file format
+**Typical Data:** Settings, parameters
+**Use Cases:** Python package configuration, settings
+**Python Libraries:**
+- `tomli` / `tomllib`: TOML reading (tomllib in Python 3.11+)
+- `toml`: Reading and writing
+**EDA Approach:**
+- Section structure
+- Key-value pairs
+- Data type inference
+- Nested table validation
+- Required vs optional fields
+### .ini - INI Configuration
+**Description:** Simple configuration format
+**Typical Data:** Application settings
+**Use Cases:** Legacy configurations, simple settings
+**Python Libraries:**
+- `configparser`: Built-in INI parser
+**EDA Approach:**
+- Section enumeration
+- Key-value extraction
+- Type conversion
+- Comment handling
+- Case sensitivity
+## Binary and Compressed Data
+### .hdf5 / .h5 - Hierarchical Data Format 5
+**Description:** Container for large scientific datasets
+**Typical Data:** Multi-dimensional arrays, metadata, groups
+**Use Cases:** Large datasets, multi-modal data, parallel I/O
+**Python Libraries:**
+- `h5py`: `h5py.File('file.h5', 'r')`
+- `PyTables` (`import tables`): Advanced HDF5 interface
+- `pandas`: HDF5 storage via HDFStore
+**EDA Approach:**
+- Group and dataset hierarchy
+- Dataset shapes and dtypes
+- Attributes and metadata
+- Compression and chunking strategy
+- Memory-efficient sampling
+- Dataset relationships
+- File size and efficiency
+- Access patterns optimization
+### .zarr - Chunked Array Storage
+**Description:** Cloud-optimized chunked arrays
+**Typical Data:** Large N-dimensional arrays
+**Use Cases:** Cloud storage, parallel computing, streaming
+**Python Libraries:**
+- `zarr`: `zarr.open('file.zarr')`
+- `xarray`: Zarr backend support
+**EDA Approach:**
+- Array metadata and dimensions
+- Chunk size optimization
+- Compression codec and ratio
+- Synchronizer and store type
+- Multi-scale hierarchies
+- Parallel access performance
+- Attribute metadata
+### .gz / .gzip - Gzip Compressed
+**Description:** Compressed data files
+**Typical Data:** Any compressed text or binary
+**Use Cases:** Compression for storage/transfer
+**Python Libraries:**
+- `gzip`: Built-in gzip module
+- `pandas`: Automatic gzip handling in read functions
+**EDA Approach:**
+- Compression ratio
+- Original file type detection
+- Decompression validation
+- Header information
+- Multi-member archives
+### .bz2 - Bzip2 Compressed
+**Description:** Bzip2 compression
+**Typical Data:** Highly compressed files
+**Use Cases:** Storage and archiving where a higher compression ratio than gzip is needed
+**Python Libraries:**
+- `bz2`: Built-in bz2 module
+- Automatic handling in pandas
+**EDA Approach:**
+- Compression efficiency
+- Decompression time
+- Content validation
+### .zip - ZIP Archive
+**Description:** Archive with multiple files
+**Typical Data:** Collections of files
+**Use Cases:** File distribution, archiving
+**Python Libraries:**
+- `zipfile`: Built-in ZIP support
+- `pandas`: Can read zipped CSVs
+**EDA Approach:**
+- Archive member listing
+- Compression method per file
+- Total vs compressed size
+- Directory structure
+- File type distribution
+- Extraction validation
+### .tar / .tar.gz - TAR Archive
+**Description:** Unix tape archive
+**Typical Data:** Multiple files and directories
+**Use Cases:** Software distribution, backups
+**Python Libraries:**
+- `tarfile`: Built-in TAR support
+**EDA Approach:**
+- Member file listing
+- Compression (if .tar.gz, .tar.bz2)
+- Directory structure
+- Permissions preservation
+- Extraction testing
+## Time Series and Waveform Data
+### .wav - Waveform Audio
+**Description:** Audio waveform data
+**Typical Data:** Acoustic signals, audio recordings
+**Use Cases:** Acoustic analysis, ultrasound, signal processing
+**Python Libraries:**
+- `scipy.io.wavfile`: `scipy.io.wavfile.read()`
+- `wave`: Built-in module
+- `soundfile`: Enhanced audio I/O
+**EDA Approach:**
+- Sample rate and duration
+- Bit depth and channels
+- Amplitude distribution
+- Spectral analysis (FFT)
+- Signal-to-noise ratio
+- Clipping detection
+- Frequency content
+### .mat - MATLAB Data
+**Description:** MATLAB workspace variables
+**Typical Data:** Arrays, structures, cells
+**Use Cases:** MATLAB-Python interoperability
+**Python Libraries:**
+- `scipy.io`: `scipy.io.loadmat()`
+- `h5py`: For MATLAB v7.3 files (HDF5-based)
+- `mat73`: Loads v7.3 files into Python dicts/arrays (built on `h5py`)
+**EDA Approach:**
+- Variable names and types
+- Array dimensions
+- Structure field exploration
+- Cell array handling
+- Sparse matrix detection
+- MATLAB version compatibility
+- Metadata extraction
+### .edf - European Data Format
+**Description:** Time series data (especially medical)
+**Typical Data:** EEG, physiological signals
+**Use Cases:** Medical signal storage
+**Python Libraries:**
+- `pyedflib`: EDF/EDF+ reading and writing
+- `mne`: Neurophysiology data (supports EDF)
+**EDA Approach:**
+- Signal count and names
+- Sampling frequencies
+- Signal ranges and units
+- Recording duration
+- Annotation events
+- Data quality (saturation, noise)
+- Patient/study information
+### .csv (Time Series)
+**Description:** CSV with timestamp column
+**Typical Data:** Time-indexed measurements
+**Use Cases:** Sensor data, monitoring, experiments
+**Python Libraries:**
+- `pandas`: `pd.read_csv()` with `parse_dates`
+**EDA Approach:**
+- Temporal range and resolution
+- Sampling regularity
+- Missing time points
+- Trend and seasonality
+- Stationarity tests
+- Autocorrelation
+- Anomaly detection
+## Geospatial and Environmental Data
+### .shp - Shapefile
+**Description:** Geospatial vector data
+**Typical Data:** Geographic features (points, lines, polygons)
+**Use Cases:** GIS analysis, spatial data
+**Python Libraries:**
+- `geopandas`: `gpd.read_file('file.shp')`
+- `fiona`: Lower-level shapefile access
+- `pyshp`: Pure Python shapefile reader
+**EDA Approach:**
+- Geometry type and count
+- Coordinate reference system
+- Bounding box
+- Attribute table analysis
+- Geometry validity
+- Spatial distribution
+- Multi-part features
+- Associated files (.shx, .dbf, .prj)
+### .geojson - GeoJSON
+**Description:** JSON format for geographic data
+**Typical Data:** Features with geometry and properties
+**Use Cases:** Web mapping, spatial analysis
+**Python Libraries:**
+- `geopandas`: Native GeoJSON support
+- `json`: Parse as JSON then process
+**EDA Approach:**
+- Feature count and types
+- CRS specification
+- Bounding box calculation
+- Property schema
+- Geometry complexity
+- Nesting structure
+### .tif / .tiff (Geospatial)
+**Description:** GeoTIFF with spatial reference
+**Typical Data:** Satellite imagery, DEMs, rasters
+**Use Cases:** Remote sensing, terrain analysis
+**Python Libraries:**
+- `rasterio`: `rasterio.open('file.tif')`
+- `gdal`: Geospatial Data Abstraction Library (imported as `from osgeo import gdal`)
+- `xarray` with `rioxarray`: N-D geospatial arrays
+**EDA Approach:**
+- Raster dimensions and resolution
+- Band count and descriptions
+- Coordinate reference system
+- Geotransform parameters
+- NoData value handling
+- Pixel value distribution
+- Histogram analysis
+- Overviews and pyramids
+### .nc / .netcdf - Network Common Data Form
+**Description:** Self-describing array-based data
+**Typical Data:** Climate, atmospheric, oceanographic data
+**Use Cases:** Scientific datasets, model output
+**Python Libraries:**
+- `netCDF4`: `netCDF4.Dataset('file.nc')`
+- `xarray`: `xr.open_dataset('file.nc')`
+**EDA Approach:**
+- Variable enumeration
+- Dimension analysis
+- Time series properties
+- Spatial coverage
+- Attribute metadata (CF conventions)
+- Coordinate systems
+- Chunking and compression
+- Data quality flags
+### .grib / .grib2 - Gridded Binary
+**Description:** Meteorological data format
+**Typical Data:** Weather forecasts, climate data
+**Use Cases:** Numerical weather prediction
+**Python Libraries:**
+- `pygrib`: GRIB file reading
+- `xarray` with `cfgrib`: GRIB to xarray
+**EDA Approach:**
+- Message inventory
+- Parameter and level types
+- Spatial grid specification
+- Temporal coverage
+- Ensemble members
+- Forecast vs analysis
+- Data packing and precision
+### .hdf / .hdf4 - HDF4 Format
+**Description:** Older HDF format
+**Typical Data:** NASA Earth Science data
+**Use Cases:** Satellite data (MODIS, etc.)
+**Python Libraries:**
+- `pyhdf`: HDF4 access
+- `gdal`: Can read HDF4
+**EDA Approach:**
+- Scientific dataset listing
+- Vdata and attributes
+- Dimension scales
+- Metadata extraction
+- Quality flags
+- Conversion to HDF5 or NetCDF
+## Specialized Scientific Formats
+### .fits - Flexible Image Transport System
+**Description:** Astronomy data format
+**Typical Data:** Images, tables, spectra from telescopes
+**Use Cases:** Astronomical observations
+**Python Libraries:**
+- `astropy.io.fits`: `fits.open('file.fits')`
+- `fitsio`: Alternative FITS library
+**EDA Approach:**
+- HDU (Header Data Unit) structure
+- Image dimensions and WCS
+- Header keyword analysis
+- Table column descriptions
+- Data type and scaling
+- FITS convention compliance
+- Checksum validation
+### .asdf - Advanced Scientific Data Format
+**Description:** Next-gen data format for astronomy
+**Typical Data:** Complex hierarchical scientific data
+**Use Cases:** James Webb Space Telescope data
+**Python Libraries:**
+- `asdf`: `asdf.open('file.asdf')`
+**EDA Approach:**
+- Tree structure exploration
+- Schema validation
+- Internal vs external arrays
+- Compression methods
+- YAML metadata
+- Version compatibility
+### .root - ROOT Data Format
+**Description:** CERN ROOT framework format
+**Typical Data:** High-energy physics data
+**Use Cases:** Particle physics experiments
+**Python Libraries:**
+- `uproot`: Pure Python ROOT reading
+- `ROOT`: Official PyROOT bindings
+**EDA Approach:**
+- TTree structure
+- Branch types and entries
+- Histogram inventory
+- Event loop statistics
+- File compression
+- Split level analysis
+### .txt - Plain Text Data
+**Description:** Generic text-based data
+**Typical Data:** Tab/space-delimited, custom formats
+**Use Cases:** Simple data exchange, logs
+**Python Libraries:**
+- `pandas`: `pd.read_csv()` with custom delimiters
+- `numpy`: `np.loadtxt()`, `np.genfromtxt()`
+- Built-in file reading
+**EDA Approach:**
+- Format detection (delimiter, header)
+- Data type inference
+- Comment line handling
+- Missing value codes
+- Column alignment
+- Encoding detection
+### .dat - Generic Data File
+**Description:** Binary or text data
+**Typical Data:** Instrument output, custom formats
+**Use Cases:** Various scientific instruments
+**Python Libraries:**
+- Format-specific: requires knowledge of structure
+- `numpy`: `np.fromfile()` for binary
+- `struct`: Parse binary structures
+**EDA Approach:**
+- Binary vs text determination
+- Header detection
+- Record structure inference
+- Endianness
+- Data type patterns
+- Validation with documentation
+### .log - Log Files
+**Description:** Text logs from software/instruments
+**Typical Data:** Timestamped events, messages
+**Use Cases:** Troubleshooting, experiment tracking
+**Python Libraries:**
+- Built-in file reading
+- `pandas`: Structured log parsing
+- Regular expressions for parsing
+**EDA Approach:**
+- Log level distribution
+- Timestamp parsing
+- Error and warning frequency
+- Event sequencing
+- Pattern recognition
+- Anomaly detection
+- Session boundaries

.scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md ADDED Viewed

	@@ -0,0 +1,620 @@

+# Microscopy and Imaging File Formats Reference
+This reference covers file formats used in microscopy, medical imaging, remote sensing, and scientific image analysis.
+## Microscopy-Specific Formats
+### .tif / .tiff - Tagged Image File Format
+**Description:** Flexible image format supporting multiple pages and metadata
+**Typical Data:** Microscopy images, z-stacks, time series, multi-channel
+**Use Cases:** Fluorescence microscopy, confocal imaging, biological imaging
+**Python Libraries:**
+- `tifffile`: `tifffile.imread('file.tif')` - Microscopy TIFF support
+- `PIL/Pillow`: `Image.open('file.tif')` - Basic TIFF
+- `scikit-image`: `io.imread('file.tif')`
+- `AICSImageIO`: Multi-format microscopy reader
+**EDA Approach:**
+- Image dimensions and bit depth
+- Multi-page/z-stack analysis
+- Metadata extraction (OME-TIFF)
+- Channel analysis and intensity distributions
+- Temporal dynamics (time-lapse)
+- Pixel size and spatial calibration
+- Histogram analysis per channel
+- Dynamic range utilization
+### .nd2 - Nikon NIS-Elements
+**Description:** Proprietary Nikon microscope format
+**Typical Data:** Multi-dimensional microscopy (XYZCT)
+**Use Cases:** Nikon microscope data, confocal, widefield
+**Python Libraries:**
+- `nd2reader`: `ND2Reader('file.nd2')`
+- `pims`: `pims.ND2_Reader('file.nd2')`
+- `AICSImageIO`: Universal reader
+**EDA Approach:**
+- Experiment metadata extraction
+- Channel configurations
+- Time-lapse frame analysis
+- Z-stack depth and spacing
+- XY stage positions
+- Laser settings and power
+- Pixel binning information
+- Acquisition timestamps
+### .lif - Leica Image Format
+**Description:** Leica microscope proprietary format
+**Typical Data:** Multi-experiment, multi-dimensional images
+**Use Cases:** Leica confocal and widefield data
+**Python Libraries:**
+- `readlif`: `readlif.reader.LifFile('file.lif')`
+- `AICSImageIO`: LIF support
+- `python-bioformats`: Via Bio-Formats
+**EDA Approach:**
+- Multiple experiment detection
+- Image series enumeration
+- Metadata per experiment
+- Channel and timepoint structure
+- Physical dimensions extraction
+- Objective and detector information
+- Scan settings analysis
+### .czi - Carl Zeiss Image
+**Description:** Zeiss microscope format
+**Typical Data:** Multi-dimensional microscopy with rich metadata
+**Use Cases:** Zeiss confocal, lightsheet, widefield
+**Python Libraries:**
+- `czifile`: `czifile.CziFile('file.czi')`
+- `AICSImageIO`: CZI support
+- `pylibCZIrw`: Official Zeiss library
+**EDA Approach:**
+- Scene and position analysis
+- Mosaic tile structure
+- Channel wavelength information
+- Acquisition mode detection
+- Scaling and calibration
+- Instrument configuration
+- ROI definitions
+### .oib / .oif - Olympus Image Format
+**Description:** Olympus microscope formats
+**Typical Data:** Confocal and multiphoton imaging
+**Use Cases:** Olympus FluoView data
+**Python Libraries:**
+- `AICSImageIO`: OIB/OIF support
+- `python-bioformats`: Via Bio-Formats
+**EDA Approach:**
+- Directory structure validation (OIF)
+- Metadata file parsing
+- Channel configuration
+- Scan parameters
+- Objective and filter information
+- PMT settings
+### .vsi - Olympus VSI
+**Description:** Olympus slide scanner format
+**Typical Data:** Whole slide imaging, large mosaics
+**Use Cases:** Virtual microscopy, pathology
+**Python Libraries:**
+- `python-bioformats`: Via Bio-Formats (note: OpenSlide does not support Olympus VSI)
+- `AICSImageIO`: VSI support
+**EDA Approach:**
+- Pyramid level analysis
+- Tile structure and overlap
+- Macro and label images
+- Magnification levels
+- Whole slide statistics
+- Region detection
+### .ims - Imaris Format
+**Description:** Bitplane Imaris HDF5-based format
+**Typical Data:** Large 3D/4D microscopy datasets
+**Use Cases:** 3D rendering, time-lapse analysis
+**Python Libraries:**
+- `h5py`: Direct HDF5 access
+- `imaris_ims_file_reader`: Specialized reader
+**EDA Approach:**
+- Resolution level analysis
+- Time point structure
+- Channel organization
+- Dataset hierarchy
+- Thumbnail generation
+- Memory-mapped access strategies
+- Chunking optimization
+### .lsm - Zeiss LSM
+**Description:** Legacy Zeiss confocal format
+**Typical Data:** Confocal laser scanning microscopy
+**Use Cases:** Older Zeiss confocal data
+**Python Libraries:**
+- `tifffile`: LSM support (TIFF-based)
+- `python-bioformats`: LSM reading
+**EDA Approach:**
+- Similar to TIFF with LSM-specific metadata
+- Scan speed and resolution
+- Laser lines and power
+- Detector gain and offset
+- LUT information
+### .stk - MetaMorph Stack
+**Description:** MetaMorph image stack format
+**Typical Data:** Time-lapse or z-stack sequences
+**Use Cases:** MetaMorph software output
+**Python Libraries:**
+- `tifffile`: STK is TIFF-based
+- `python-bioformats`: STK support
+**EDA Approach:**
+- Stack dimensionality
+- Plane metadata
+- Timing information
+- Stage positions
+- UIC tags parsing
+### .dv - DeltaVision
+**Description:** Applied Precision DeltaVision format
+**Typical Data:** Deconvolution microscopy
+**Use Cases:** DeltaVision microscope data
+**Python Libraries:**
+- `mrc`: Can read DV (MRC-related)
+- `AICSImageIO`: DV support
+**EDA Approach:**
+- Wave information (channels)
+- Extended header analysis
+- Lens and magnification
+- Deconvolution status
+- Time stamps per section
+### .mrc - Medical Research Council
+**Description:** Electron microscopy format
+**Typical Data:** EM images, cryo-EM, tomography
+**Use Cases:** Structural biology, electron microscopy
+**Python Libraries:**
+- `mrcfile`: `mrcfile.open('file.mrc')`
+- `EMAN2`: EM-specific tools
+**EDA Approach:**
+- Volume dimensions
+- Voxel size and units
+- Origin and map statistics
+- Symmetry information
+- Extended header analysis
+- Density statistics
+- Header consistency validation
+### .dm3 / .dm4 - Gatan Digital Micrograph
+**Description:** Gatan TEM/STEM format
+**Typical Data:** Transmission electron microscopy
+**Use Cases:** TEM imaging and analysis
+**Python Libraries:**
+- `hyperspy`: `hs.load('file.dm3')`
+- `ncempy`: `ncempy.io.dm.dmReader('file.dm3')`
+**EDA Approach:**
+- Microscope parameters
+- Energy dispersive spectroscopy data
+- Diffraction patterns
+- Calibration information
+- Tag structure analysis
+- Image series handling
+### .eer - Electron Event Representation
+**Description:** Direct electron detector format
+**Typical Data:** Electron counting data from detectors
+**Use Cases:** Cryo-EM data collection
+**Python Libraries:**
+- `tifffile`: Reads EER (a TIFF-based container; frame decoding requires `imagecodecs`)
+- Vendor-specific tools (Gatan, TFS)
+**EDA Approach:**
+- Event counting statistics
+- Frame rate and dose
+- Detector configuration
+- Motion correction assessment
+- Gain reference validation
+### .ser - TIA Series
+**Description:** FEI/TFS TIA format
+**Typical Data:** EM image series
+**Use Cases:** FEI/Thermo Fisher EM data
+**Python Libraries:**
+- `hyperspy`: SER support
+- `ncempy`: TIA reader
+**EDA Approach:**
+- Series structure
+- Calibration data
+- Acquisition metadata
+- Time stamps
+- Multi-dimensional data organization
+## Medical and Biological Imaging
+### .dcm - DICOM
+**Description:** Digital Imaging and Communications in Medicine
+**Typical Data:** Medical images with patient/study metadata
+**Use Cases:** Clinical imaging, radiology, CT, MRI, PET
+**Python Libraries:**
+- `pydicom`: `pydicom.dcmread('file.dcm')`
+- `SimpleITK`: `sitk.ReadImage('file.dcm')`
+- `nibabel`: Limited DICOM support
+**EDA Approach:**
+- Patient metadata extraction (anonymization check)
+- Modality-specific analysis
+- Series and study organization
+- Slice thickness and spacing
+- Window/level settings
+- Hounsfield units (CT)
+- Image orientation and position
+- Multi-frame analysis
+### .nii / .nii.gz - NIfTI
+**Description:** Neuroimaging Informatics Technology Initiative
+**Typical Data:** Brain imaging, fMRI, structural MRI
+**Use Cases:** Neuroimaging research, brain analysis
+**Python Libraries:**
+- `nibabel`: `nibabel.load('file.nii')`
+- `nilearn`: Neuroimaging with ML
+- `SimpleITK`: NIfTI support
+**EDA Approach:**
+- Volume dimensions and voxel size
+- Affine transformation matrix
+- Time series analysis (fMRI)
+- Intensity distribution
+- Brain extraction quality
+- Registration assessment
+- Orientation validation
+- Header information consistency
+### .mnc - MINC Format
+**Description:** Medical Imaging NetCDF (MINC1 is NetCDF-based; MINC2 is HDF5-based)
+**Typical Data:** Medical imaging (predecessor to NIfTI)
+**Use Cases:** Legacy neuroimaging data
+**Python Libraries:**
+- `pyminc`: MINC-specific tools
+- `nibabel`: MINC support
+**EDA Approach:**
+- Similar to NIfTI
+- NetCDF structure exploration
+- Dimension ordering
+- Metadata extraction
+### .nrrd - Nearly Raw Raster Data
+**Description:** Medical imaging format with an attached header (`.nrrd`) or a detached header (`.nhdr` plus a separate data file)
+**Typical Data:** Medical images, research imaging
+**Use Cases:** 3D Slicer, ITK-based applications
+**Python Libraries:**
+- `pynrrd`: `nrrd.read('file.nrrd')`
+- `SimpleITK`: NRRD support
+**EDA Approach:**
+- Header field analysis
+- Encoding format
+- Dimension and spacing
+- Orientation matrix
+- Compression assessment
+- Endianness handling
+### .mha / .mhd - MetaImage
+**Description:** MetaImage format (ITK)
+**Typical Data:** Medical/scientific 3D images
+**Use Cases:** ITK/SimpleITK applications
+**Python Libraries:**
+- `SimpleITK`: Native MHA/MHD support
+- `itk`: Direct ITK integration
+**EDA Approach:**
+- Header-data file pairing (MHD)
+- Transform matrix
+- Element spacing
+- Compression format
+- Data type and dimensions
+### .hdr / .img - Analyze Format
+**Description:** Legacy medical imaging format
+**Typical Data:** Brain imaging (pre-NIfTI)
+**Use Cases:** Old neuroimaging datasets
+**Python Libraries:**
+- `nibabel`: Analyze support
+- Conversion to NIfTI recommended
+**EDA Approach:**
+- Header-image pairing validation
+- Byte order issues
+- Conversion to modern formats
+- Metadata limitations
+## Scientific Image Formats
+### .png - Portable Network Graphics
+**Description:** Lossless compressed image format
+**Typical Data:** 2D images, screenshots, processed data
+**Use Cases:** Publication figures, lossless storage
+**Python Libraries:**
+- `PIL/Pillow`: `Image.open('file.png')`
+- `scikit-image`: `io.imread('file.png')`
+- `imageio`: `imageio.imread('file.png')`
+**EDA Approach:**
+- Bit depth analysis (8-bit, 16-bit)
+- Color mode (grayscale, RGB, palette)
+- Metadata (PNG chunks)
+- Transparency handling
+- Compression efficiency
+- Histogram analysis
+### .jpg / .jpeg - Joint Photographic Experts Group
+**Description:** Lossy compressed image format
+**Typical Data:** Natural images, photos
+**Use Cases:** Visualization, web graphics (not raw data)
+**Python Libraries:**
+- `PIL/Pillow`: Standard JPEG support
+- `scikit-image`: JPEG reading
+**EDA Approach:**
+- Compression artifacts detection
+- Quality factor estimation
+- Color space (RGB, grayscale)
+- EXIF metadata
+- Quantization table analysis
+- Note: Not suitable for quantitative analysis
+### .bmp - Bitmap Image
+**Description:** Uncompressed raster image
+**Typical Data:** Simple images, screenshots
+**Use Cases:** Compatibility, simple storage
+**Python Libraries:**
+- `PIL/Pillow`: BMP support
+- `scikit-image`: BMP reading
+**EDA Approach:**
+- Color depth
+- Palette analysis (if indexed)
+- File size efficiency
+- Pixel format validation
+### .gif - Graphics Interchange Format
+**Description:** Image format with animation support
+**Typical Data:** Animated images, simple graphics
+**Use Cases:** Animations, time-lapse visualization
+**Python Libraries:**
+- `PIL/Pillow`: GIF support
+- `imageio`: Better GIF animation support
+**EDA Approach:**
+- Frame count and timing
+- Palette limitations (256 colors)
+- Loop count
+- Disposal method
+- Transparency handling
+### .svg - Scalable Vector Graphics
+**Description:** XML-based vector graphics
+**Typical Data:** Vector drawings, plots, diagrams
+**Use Cases:** Publication-quality figures, plots
+**Python Libraries:**
+- `svgpathtools`: Path manipulation
+- `cairosvg`: Rasterization
+- `lxml`: XML parsing
+**EDA Approach:**
+- Element structure analysis
+- Style information
+- Viewbox and dimensions
+- Path complexity
+- Text element extraction
+- Layer organization
+### .eps - Encapsulated PostScript
+**Description:** Vector graphics format
+**Typical Data:** Publication figures
+**Use Cases:** Legacy publication graphics
+**Python Libraries:**
+- `PIL/Pillow`: Basic EPS rasterization
+- `ghostscript` via subprocess
+**EDA Approach:**
+- Bounding box information
+- Preview image validation
+- Font embedding
+- Conversion to modern formats
+### .pdf (Images)
+**Description:** Portable Document Format with images
+**Typical Data:** Publication figures, multi-page documents
+**Use Cases:** Publication, data presentation
+**Python Libraries:**
+- `PyMuPDF/fitz`: `fitz.open('file.pdf')`
+- `pdf2image`: Rasterization
+- `pdfplumber`: Text and layout extraction
+**EDA Approach:**
+- Page count
+- Image extraction
+- Resolution and DPI
+- Embedded fonts and metadata
+- Compression methods
+- Image vs vector content
+### .fig - MATLAB Figure
+**Description:** MATLAB figure file
+**Typical Data:** MATLAB plots and figures
+**Use Cases:** MATLAB data visualization
+**Python Libraries:**
+- Custom parsers (MAT file structure)
+- Conversion to other formats
+**EDA Approach:**
+- Figure structure
+- Data extraction from plots
+- Axes and label information
+- Plot type identification
+### .hdf5 (Imaging Specific)
+**Description:** HDF5 for large imaging datasets
+**Typical Data:** High-content screening, large microscopy
+**Use Cases:** BigDataViewer, large-scale imaging
+**Python Libraries:**
+- `h5py`: Universal HDF5 access
+- Imaging-specific readers (BigDataViewer)
+**EDA Approach:**
+- Dataset hierarchy
+- Chunk and compression strategy
+- Multi-resolution pyramid
+- Metadata organization
+- Memory-mapped access
+- Parallel I/O performance
+### .zarr - Chunked Array Storage
+**Description:** Cloud-optimized array storage
+**Typical Data:** Large imaging datasets, OME-ZARR
+**Use Cases:** Cloud microscopy, large-scale analysis
+**Python Libraries:**
+- `zarr`: `zarr.open('file.zarr')`
+- `ome-zarr-py`: OME-ZARR support
+**EDA Approach:**
+- Chunk size optimization
+- Compression codec analysis
+- Multi-scale representation
+- Array dimensions and dtype
+- Metadata structure (OME)
+- Cloud access patterns
+### .raw - Raw Image Data
+**Description:** Unformatted binary pixel data
+**Typical Data:** Raw detector output
+**Use Cases:** Custom imaging systems
+**Python Libraries:**
+- `numpy`: `np.fromfile()` with dtype
+- `imageio`: Raw format plugins
+**EDA Approach:**
+- Dimensions determination (external info needed)
+- Byte order and data type
+- Header presence detection
+- Pixel value range
+- Noise characteristics
+### .bin - Binary Image Data
+**Description:** Generic binary image format
+**Typical Data:** Raw or custom-formatted images
+**Use Cases:** Instrument-specific outputs
+**Python Libraries:**
+- `numpy`: Custom binary reading
+- `struct`: For structured binary data
+**EDA Approach:**
+- Format specification required
+- Header parsing (if present)
+- Data type inference
+- Dimension extraction
+- Validation with known parameters
+## Image Analysis Formats
+### .roi - ImageJ ROI
+**Description:** ImageJ region of interest format
+**Typical Data:** Geometric ROIs, selections
+**Use Cases:** ImageJ/Fiji analysis workflows
+**Python Libraries:**
+- `read-roi`: `read_roi.read_roi_file('file.roi')`
+- `roifile`: ROI manipulation
+**EDA Approach:**
+- ROI type analysis (rectangle, polygon, etc.)
+- Coordinate extraction
+- ROI properties (area, perimeter)
+- Group analysis (ROI sets)
+- Z-position and time information
+### .zip (ROI sets)
+**Description:** ZIP archive of ImageJ ROIs
+**Typical Data:** Multiple ROI files
+**Use Cases:** Batch ROI analysis
+**Python Libraries:**
+- `read-roi`: `read_roi.read_roi_zip('file.zip')`
+- Standard `zipfile` module
+**EDA Approach:**
+- ROI count in set
+- ROI type distribution
+- Spatial distribution
+- Overlapping ROI detection
+- Naming conventions
+### .ome.tif / .ome.tiff - OME-TIFF
+**Description:** TIFF with OME-XML metadata
+**Typical Data:** Standardized microscopy with rich metadata
+**Use Cases:** Bio-Formats compatible storage
+**Python Libraries:**
+- `tifffile`: OME-TIFF support
+- `AICSImageIO`: OME reading
+- `python-bioformats`: Bio-Formats integration
+**EDA Approach:**
+- OME-XML validation
+- Physical dimensions extraction
+- Channel naming and wavelengths
+- Plane positions (Z, C, T)
+- Instrument metadata
+- Bio-Formats compatibility
+### .ome.zarr - OME-ZARR
+**Description:** OME-NGFF specification on ZARR
+**Typical Data:** Next-generation file format for bioimaging
+**Use Cases:** Cloud-native imaging, large datasets
+**Python Libraries:**
+- `ome-zarr-py`: Official implementation
+- `zarr`: Underlying array storage
+**EDA Approach:**
+- Multiscale resolution levels
+- Metadata compliance with OME-NGFF spec
+- Coordinate transformations
+- Label and ROI handling
+- Cloud storage optimization
+- Chunk access patterns
+### .klb - Keller Lab Block
+**Description:** Fast microscopy format for large data
+**Typical Data:** Lightsheet microscopy, time-lapse
+**Use Cases:** High-throughput imaging
+**Python Libraries:**
+- `pyklb`: KLB reading and writing
+**EDA Approach:**
+- Compression efficiency
+- Block structure
+- Multi-resolution support
+- Read performance benchmarking
+- Metadata extraction
+### .vsi - Whole Slide Imaging
+**Description:** Virtual slide format (multiple vendors)
+**Typical Data:** Pathology slides, large mosaics
+**Use Cases:** Digital pathology
+**Python Libraries:**
+- `openslide-python`: Multi-format WSI
+- `tiffslide`: Pure Python alternative
+**EDA Approach:**
+- Pyramid level count
+- Downsampling factors
+- Associated images (macro, label)
+- Tile size and overlap
+- MPP (microns per pixel)
+- Background detection
+- Tissue segmentation
+### .ndpi - Hamamatsu NanoZoomer
+**Description:** Hamamatsu slide scanner format
+**Typical Data:** Whole slide pathology images
+**Use Cases:** Digital pathology workflows
+**Python Libraries:**
+- `openslide-python`: NDPI support
+**EDA Approach:**
+- Multi-resolution pyramid
+- Lens and objective information
+- Scan area and magnification
+- Focal plane information
+- Tissue detection
+### .svs - Aperio ScanScope
+**Description:** Aperio whole slide format
+**Typical Data:** Digital pathology slides
+**Use Cases:** Pathology image analysis
+**Python Libraries:**
+- `openslide-python`: SVS support
+**EDA Approach:**
+- Pyramid structure
+- MPP calibration
+- Label and macro images
+- Compression quality
+- Thumbnail generation
+### .scn - Leica SCN
+**Description:** Leica slide scanner format
+**Typical Data:** Whole slide imaging
+**Use Cases:** Digital pathology
+**Python Libraries:**
+- `openslide-python`: SCN support
+**EDA Approach:**
+- Tile structure analysis
+- Collection organization
+- Metadata extraction
+- Magnification levels

.scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md ADDED Viewed

	@@ -0,0 +1,517 @@

+# Proteomics and Metabolomics File Formats Reference
+This reference covers file formats specific to proteomics, metabolomics, lipidomics, and related omics workflows.
+## Mass Spectrometry-Based Proteomics
+### .mzML - Mass Spectrometry Markup Language
+**Description:** Standard XML format for MS data
+**Typical Data:** MS1 and MS2 spectra, retention times, intensities
+**Use Cases:** Proteomics, metabolomics pipelines
+**Python Libraries:**
+- `pymzml`: `pymzml.run.Reader('file.mzML')`
+- `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
+- `pyopenms`: OpenMS Python bindings
+**EDA Approach:**
+- Scan count and MS level distribution
+- Total ion chromatogram (TIC) analysis
+- Base peak chromatogram (BPC)
+- m/z coverage and resolution
+- Retention time range
+- Precursor selection patterns
+- Data completeness
+- Quality control metrics (lock mass, standards)
+### .mzXML - Legacy MS XML Format
+**Description:** Older XML-based MS format
+**Typical Data:** Mass spectra with metadata
+**Use Cases:** Legacy proteomics data
+**Python Libraries:**
+- `pyteomics.mzxml`
+- `pymzml`: Can read mzXML
+**EDA Approach:**
+- Similar to mzML
+- Format version compatibility
+- Conversion quality validation
+- Metadata preservation check
+### .mzIdentML - Peptide Identification Format
+**Description:** PSI standard for peptide identifications
+**Typical Data:** Peptide-spectrum matches, proteins, scores
+**Use Cases:** Search engine results, proteomics workflows
+**Python Libraries:**
+- `pyteomics.mzid`
+- `pyopenms`: MzIdentML support
+**EDA Approach:**
+- PSM count and score distribution
+- FDR calculation and filtering
+- Modification analysis
+- Missed cleavage statistics
+- Protein inference results
+- Search parameters validation
+- Decoy hit analysis
+- Rank-1 vs lower ranks
+### .pepXML - Trans-Proteomic Pipeline Peptide XML
+**Description:** TPP format for peptide identifications
+**Typical Data:** Search results with statistical validation
+**Use Cases:** Proteomics database search output
+**Python Libraries:**
+- `pyteomics.pepxml`
+**EDA Approach:**
+- Search engine comparison
+- Score distributions (XCorr, expect value, etc.)
+- Charge state analysis
+- Modification frequencies
+- PeptideProphet probabilities
+- Protein coverage
+- Spectral counting
+### .protXML - Protein Inference Results
+**Description:** TPP protein-level identifications
+**Typical Data:** Protein groups, probabilities, peptides
+**Use Cases:** Protein-level analysis
+**Python Libraries:**
+- `pyteomics.protxml`
+**EDA Approach:**
+- Protein group statistics
+- Parsimonious protein sets
+- ProteinProphet probabilities
+- Coverage and peptide count per protein
+- Unique vs shared peptides
+- Protein molecular weight distribution
+- GO term enrichment preparation
+### .pride.xml - PRIDE XML Format
+**Description:** Proteomics Identifications Database format
+**Typical Data:** Complete proteomics experiment data
+**Use Cases:** Public data deposition (legacy)
+**Python Libraries:**
+- `pyteomics.pride`
+- Custom XML parsers
+**EDA Approach:**
+- Experiment metadata extraction
+- Identification completeness
+- Cross-linking to spectra
+- Protocol information
+- Instrument details
+### .tsv / .csv (Proteomics)
+**Description:** Tab or comma-separated proteomics results
+**Typical Data:** Peptide or protein quantification tables
+**Use Cases:** MaxQuant, Proteome Discoverer, Skyline output
+**Python Libraries:**
+- `pandas`: `pd.read_csv()` or `pd.read_table()`
+**EDA Approach:**
+- Identification counts
+- Quantitative value distributions
+- Missing value patterns
+- Intensity-based analysis
+- Label-free quantification assessment
+- Isobaric tag ratio analysis
+- Coefficient of variation
+- Batch effects
+### .msf - Thermo MSF Database
+**Description:** Proteome Discoverer results database
+**Typical Data:** SQLite database with search results
+**Use Cases:** Thermo Proteome Discoverer workflows
+**Python Libraries:**
+- `sqlite3`: Database access
+- Custom MSF parsers
+**EDA Approach:**
+- Database schema exploration
+- Peptide and protein tables
+- Score thresholds
+- Quantification data
+- Processing node information
+- Confidence levels
+### .pdResult - Proteome Discoverer Result
+**Description:** Proteome Discoverer study results
+**Typical Data:** Comprehensive search and quantification
+**Use Cases:** PD study exports
+**Python Libraries:**
+- Vendor tools for conversion
+- Export to TSV for Python analysis
+**EDA Approach:**
+- Study design validation
+- Result filtering criteria
+- Quantitative comparison groups
+- Imputation strategies
+### .pep.xml - Peptide Summary
+**Description:** Compact peptide identification format
+**Typical Data:** Peptide sequences, modifications, scores
+**Use Cases:** Downstream analysis input
+**Python Libraries:**
+- `pyteomics`: XML parsing
+**EDA Approach:**
+- Unique peptide counting
+- PTM site localization
+- Retention time predictability
+- Charge state preferences
+## Quantitative Proteomics
+### .sky - Skyline Document
+**Description:** Skyline targeted proteomics document
+**Typical Data:** Transition lists, chromatograms, results
+**Use Cases:** Targeted proteomics (SRM/MRM/PRM)
+**Python Libraries:**
+- `skyline`: Python API (limited)
+- Export to CSV for analysis
+**EDA Approach:**
+- Transition selection validation
+- Chromatographic peak quality
+- Interference detection
+- Retention time consistency
+- Calibration curve assessment
+- Replicate correlation
+- LOD/LOQ determination
+### .sky.zip - Zipped Skyline Document
+**Description:** Skyline document with external files
+**Typical Data:** Complete Skyline analysis
+**Use Cases:** Sharing Skyline projects
+**Python Libraries:**
+- `zipfile`: Extract for processing
+**EDA Approach:**
+- Document structure
+- External file references
+- Result export and analysis
+### .wiff - SCIEX WIFF Format
+**Description:** SCIEX instrument data with quantitation
+**Typical Data:** LC-MS/MS with MRM transitions
+**Use Cases:** SCIEX QTRAP, TripleTOF data
+**Python Libraries:**
+- Vendor tools (limited Python access)
+- Conversion to mzML
+**EDA Approach:**
+- MRM transition performance
+- Dwell time optimization
+- Cycle time analysis
+- Peak integration quality
+### .raw (Thermo)
+**Description:** Thermo raw instrument file
+**Typical Data:** Full MS data from Orbitrap, Q Exactive
+**Use Cases:** Label-free and TMT quantification
+**Python Libraries:**
+- `pymsfilereader`: Thermo RawFileReader
+- `ThermoRawFileParser`: Cross-platform CLI
+**EDA Approach:**
+- MS1 and MS2 acquisition rates
+- AGC target and fill times
+- Resolution settings
+- Isolation window validation
+- SPS ion selection (TMT)
+- Contamination assessment
+### .d (Agilent)
+**Description:** Agilent data directory
+**Typical Data:** LC-MS and GC-MS data
+**Use Cases:** Agilent instrument workflows
+**Python Libraries:**
+- Community parsers
+- Export to mzML
+**EDA Approach:**
+- Method consistency
+- Calibration status
+- Sequence run information
+- Retention time stability
+## Metabolomics and Lipidomics
+### .mzML (Metabolomics)
+**Description:** Standard MS format for metabolomics
+**Typical Data:** Full scan MS, targeted MS/MS
+**Use Cases:** Untargeted and targeted metabolomics
+**Python Libraries:**
+- Same as proteomics mzML tools
+**EDA Approach:**
+- Feature detection quality
+- Mass accuracy assessment
+- Retention time alignment
+- Blank subtraction
+- QC sample consistency
+- Isotope pattern validation
+- Adduct formation analysis
+- In-source fragmentation check
+### .cdf / .netCDF - ANDI-MS
+**Description:** Analytical Data Interchange for MS
+**Typical Data:** GC-MS, LC-MS chromatography data
+**Use Cases:** Metabolomics, GC-MS workflows
+**Python Libraries:**
+- `netCDF4`: Low-level access
+- `pyopenms`: CDF support
+- `xcms` via R integration
+**EDA Approach:**
+- TIC and extracted ion chromatograms
+- Peak detection across samples
+- Retention index calculation
+- Mass spectral matching
+- Library search preparation
+### .msp - Mass Spectral Format (NIST)
+**Description:** NIST spectral library format
+**Typical Data:** Reference mass spectra
+**Use Cases:** Metabolite identification, library matching
+**Python Libraries:**
+- `matchms`: Spectral matching
+- Custom MSP parsers
+**EDA Approach:**
+- Library coverage
+- Metadata completeness (InChI, SMILES)
+- Spectral quality metrics
+- Collision energy standardization
+- Precursor type annotation
+### .mgf (Metabolomics)
+**Description:** Mascot Generic Format for MS/MS
+**Typical Data:** MS/MS spectra for metabolite ID
+**Use Cases:** Spectral library searching
+**Python Libraries:**
+- `matchms`: Metabolomics spectral analysis
+- `pyteomics.mgf`
+**EDA Approach:**
+- Spectrum quality filtering
+- Precursor isolation purity
+- Fragment m/z accuracy
+- Neutral loss patterns
+- MS/MS completeness
+### .nmrML - NMR Markup Language
+**Description:** Standard XML format for NMR metabolomics
+**Typical Data:** 1D/2D NMR spectra with metadata
+**Use Cases:** NMR-based metabolomics
+**Python Libraries:**
+- `nmrml2isa`: Format conversion
+- Custom XML parsers
+**EDA Approach:**
+- Spectral quality metrics
+- Binning consistency
+- Reference compound validation
+- pH and temperature effects
+- Metabolite identification confidence
+### .json (Metabolomics)
+**Description:** JSON format for metabolomics results
+**Typical Data:** Feature tables, annotations, metadata
+**Use Cases:** GNPS, MetaboAnalyst, web tools
+**Python Libraries:**
+- `json`: Standard library
+- `pandas`: JSON normalization
+**EDA Approach:**
+- Feature annotation coverage
+- GNPS clustering results
+- Molecular networking statistics
+- Adduct and in-source fragment linkage
+- Putative identification confidence
+### .txt (Metabolomics Tables)
+**Description:** Tab-delimited feature tables
+**Typical Data:** m/z, RT, intensities across samples
+**Use Cases:** MZmine, XCMS, MS-DIAL output
+**Python Libraries:**
+- `pandas`: Text file reading
+**EDA Approach:**
+- Feature count and quality
+- Missing value imputation
+- Data normalization assessment
+- Batch correction validation
+- PCA and clustering for QC
+- Fold change calculations
+- Statistical test preparation
+### .featureXML - OpenMS Feature Format
+**Description:** OpenMS detected features
+**Typical Data:** LC-MS features with quality scores
+**Use Cases:** OpenMS workflows
+**Python Libraries:**
+- `pyopenms`: FeatureXML support
+**EDA Approach:**
+- Feature detection parameters
+- Quality metrics per feature
+- Isotope pattern fitting
+- Charge state assignment
+- FWHM and asymmetry
+### .consensusXML - OpenMS Consensus Features
+**Description:** Linked features across samples
+**Typical Data:** Aligned features with group info
+**Use Cases:** Multi-sample LC-MS analysis
+**Python Libraries:**
+- `pyopenms`: ConsensusXML reading
+**EDA Approach:**
+- Feature correspondence quality
+- Retention time alignment
+- Missing value patterns
+- Intensity normalization needs
+- Batch-wise feature agreement
+### .idXML - OpenMS Identification Format
+**Description:** Peptide/metabolite identifications
+**Typical Data:** MS/MS identifications with scores
+**Use Cases:** OpenMS ID workflows
+**Python Libraries:**
+- `pyopenms`: IdXML support
+**EDA Approach:**
+- Identification rate
+- Score distribution
+- Spectral match quality
+- False discovery assessment
+- Annotation transfer validation
+## Lipidomics-Specific Formats
+### .lcb - LipidCreator Batch
+**Description:** LipidCreator transition list
+**Typical Data:** Lipid transitions for targeted MS
+**Use Cases:** Targeted lipidomics
+**Python Libraries:**
+- Export to CSV for processing
+**EDA Approach:**
+- Transition coverage per lipid class
+- Retention time prediction
+- Collision energy optimization
+- Class-specific fragmentation patterns
+### .mzTab - Proteomics/Metabolomics Tabular Format
+**Description:** PSI tabular summary format
+**Typical Data:** Protein/peptide/metabolite quantification
+**Use Cases:** Publication and data sharing
+**Python Libraries:**
+- `pyteomics.mztab`
+- `pandas` for TSV-like structure
+**EDA Approach:**
+- Data completeness
+- Metadata section validation
+- Quantification method
+- Identification confidence
+- Software and parameters
+- Quality metrics summary
+### .csv (LipidSearch, LipidMatch)
+**Description:** Lipid identification results
+**Typical Data:** Lipid annotations, grades, intensities
+**Use Cases:** Lipidomics software output
+**Python Libraries:**
+- `pandas`: CSV reading
+**EDA Approach:**
+- Lipid class distribution
+- Identification grade/confidence
+- Fatty acid composition analysis
+- Double bond and chain length patterns
+- Intensity correlations
+- Normalization to internal standards
+### .sdf (Metabolomics)
+**Description:** Structure data file for metabolites
+**Typical Data:** Chemical structures with properties
+**Use Cases:** Metabolite database creation
+**Python Libraries:**
+- `RDKit`: `Chem.SDMolSupplier('file.sdf')`
+**EDA Approach:**
+- Structure validation
+- Property calculation (logP, MW, TPSA)
+- Molecular formula consistency
+- Tautomer enumeration
+- Retention time prediction features
+### .mol (Metabolomics)
+**Description:** Single molecule structure files
+**Typical Data:** Metabolite chemical structure
+**Use Cases:** Structure-based searches
+**Python Libraries:**
+- `RDKit`: `Chem.MolFromMolFile('file.mol')`
+**EDA Approach:**
+- Structure correctness
+- Stereochemistry validation
+- Charge state
+- Implicit hydrogen handling
+## Data Processing and Analysis
+### .h5 / .hdf5 (Omics)
+**Description:** HDF5 for large omics datasets
+**Typical Data:** Feature matrices, spectra, metadata
+**Use Cases:** Large-scale studies, cloud computing
+**Python Libraries:**
+- `h5py`: HDF5 access
+- `anndata`: For single-cell proteomics
+**EDA Approach:**
+- Dataset organization
+- Chunking and compression
+- Metadata structure
+- Efficient data access patterns
+- Sample and feature annotations
+### .Rdata / .rds - R Objects
+**Description:** Serialized R analysis objects
+**Typical Data:** Processed omics results from R packages
+**Use Cases:** xcms, CAMERA, MSnbase workflows
+**Python Libraries:**
+- `pyreadr`: `pyreadr.read_r('file.Rdata')`
+- `rpy2`: R-Python integration
+**EDA Approach:**
+- Object structure exploration
+- Data extraction
+- Method parameter review
+- Conversion to Python-native formats
+### .mzTab-M - Metabolomics mzTab
+**Description:** mzTab specific to metabolomics
+**Typical Data:** Small molecule quantification
+**Use Cases:** Metabolomics data sharing
+**Python Libraries:**
+- `pyteomics.mztab`: Can parse mzTab-M
+**EDA Approach:**
+- Small molecule evidence
+- Feature quantification
+- Database references (HMDB, KEGG, etc.)
+- Adduct and charge annotation
+- MS level information
+### .parquet (Omics)
+**Description:** Columnar storage for large tables
+**Typical Data:** Feature matrices, metadata
+**Use Cases:** Efficient big data omics
+**Python Libraries:**
+- `pandas`: `pd.read_parquet()`
+- `pyarrow`: Direct parquet access
+**EDA Approach:**
+- Compression efficiency
+- Column-wise statistics
+- Partition structure
+- Schema validation
+- Fast filtering and aggregation
+### .pkl (Omics Models)
+**Description:** Pickled Python objects
+**Typical Data:** ML models, processed data
+**Use Cases:** Workflow intermediate storage
+**Python Libraries:**
+- `pickle`: Standard serialization (only unpickle files from trusted sources; loading can execute arbitrary code)
+- `joblib`: Enhanced pickling
+**EDA Approach:**
+- Object type and structure
+- Model parameters
+- Feature importance (if ML model)
+- Data shapes and types
+- Deserialization validation
+### .zarr (Omics)
+**Description:** Chunked, compressed array storage
+**Typical Data:** Multi-dimensional omics data
+**Use Cases:** Cloud-optimized analysis
+**Python Libraries:**
+- `zarr`: Array storage
+**EDA Approach:**
+- Chunk optimization
+- Compression codecs
+- Multi-scale data
+- Parallel access patterns
+- Metadata annotations

.scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md ADDED Viewed

	@@ -0,0 +1,633 @@

+# Spectroscopy and Analytical Chemistry File Formats Reference
+This reference covers file formats used in various spectroscopic techniques and analytical chemistry instrumentation.
+## NMR Spectroscopy
+### .fid - NMR Free Induction Decay
+**Description:** Raw time-domain NMR data from Bruker, Agilent, JEOL
+**Typical Data:** Complex time-domain signal
+**Use Cases:** NMR spectroscopy, structure elucidation
+**Python Libraries:**
+- `nmrglue`: `nmrglue.bruker.read('experiment_dir')` (Bruker; pass the directory that contains `fid`) or `nmrglue.varian.read_fid('fid')` (Agilent/Varian)
+- `nmrstarlib`: NMR data handling
+**EDA Approach:**
+- Time-domain signal decay
+- Sampling rate and acquisition time
+- Number of data points
+- Signal-to-noise ratio estimation
+- Baseline drift assessment
+- Digital filter effects
+- Acquisition parameter validation
+- Apodization function selection
+### .ft / .ft1 / .ft2 - NMR Frequency Domain
+**Description:** Fourier-transformed NMR spectrum
+**Typical Data:** Processed frequency-domain data
+**Use Cases:** NMR analysis, peak integration
+**Python Libraries:**
+- `nmrglue`: Frequency domain reading
+- Custom processing pipelines
+**EDA Approach:**
+- Peak picking and integration
+- Chemical shift range
+- Baseline correction quality
+- Phase correction assessment
+- Reference peak identification
+- Spectral resolution
+- Artifact detection
+- Multiplicity analysis
+### .1r / .2rr - Bruker NMR Processed Data
+**Description:** Bruker processed spectrum (real part)
+**Typical Data:** 1D or 2D processed NMR spectra
+**Use Cases:** NMR data analysis with Bruker software
+**Python Libraries:**
+- `nmrglue`: Bruker format support
+**EDA Approach:**
+- Processing parameters review
+- Window function effects
+- Zero-filling assessment
+- Linear prediction validation
+- Spectral artifacts
+### .dx - NMR JCAMP-DX
+**Description:** JCAMP-DX format for NMR
+**Typical Data:** Standardized NMR spectrum
+**Use Cases:** Data exchange between software
+**Python Libraries:**
+- `jcamp`: JCAMP reader
+- `nmrglue`: Can import JCAMP
+**EDA Approach:**
+- Format compliance
+- Metadata completeness
+- Peak table validation
+- Integration values
+- Compound identification info
+### .mnova - Mnova Format
+**Description:** Mestrelab Research Mnova format
+**Typical Data:** NMR data with processing info
+**Use Cases:** Mnova software workflows
+**Python Libraries:**
+- `nmrglue`: No native `.mnova` reader; export from Mnova to JCAMP-DX or another open format first
+- Conversion tools to standard formats
+**EDA Approach:**
+- Multi-spectrum handling
+- Processing pipeline review
+- Quantification data
+- Structure assignment
+## Mass Spectrometry
+### .mzML - Mass Spectrometry Markup Language
+**Description:** Standard XML-based MS format
+**Typical Data:** MS spectra, chromatograms, metadata
+**Use Cases:** Proteomics, metabolomics, lipidomics
+**Python Libraries:**
+- `pymzml`: `pymzml.run.Reader('file.mzML')`
+- `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
+- `MSFileReader`: Various wrappers
+**EDA Approach:**
+- Scan count and MS level distribution
+- Retention time range and TIC
+- m/z range and resolution
+- Precursor ion selection
+- Fragmentation patterns
+- Instrument configuration
+- Quality control metrics
+- Data completeness
+### .mzXML - Mass Spectrometry XML
+**Description:** Legacy XML MS format
+**Typical Data:** Mass spectra and chromatograms
+**Use Cases:** Proteomics workflows (older)
+**Python Libraries:**
+- `pyteomics.mzxml`
+- `pymzml`: Can read mzXML
+**EDA Approach:**
+- Similar to mzML
+- Version compatibility
+- Conversion quality assessment
+### .mzData - mzData Format
+**Description:** Legacy PSI MS format
+**Typical Data:** Mass spectrometry data
+**Use Cases:** Legacy data archives
+**Python Libraries:**
+- `pyteomics`: Limited support
+- Conversion to mzML recommended
+**EDA Approach:**
+- Format conversion validation
+- Data completeness
+- Metadata extraction
+### .raw - Vendor Raw Files (Thermo, Agilent, Bruker)
+**Description:** Proprietary instrument data
+**Typical Data:** Raw mass spectra and metadata
+**Use Cases:** Direct instrument output
+**Python Libraries:**
+- `pymsfilereader`: Thermo RAW files
+- `ThermoRawFileParser`: CLI wrapper
+- Vendor-specific APIs
+**EDA Approach:**
+- Method parameter extraction
+- Instrument performance metrics
+- Calibration status
+- Scan function analysis
+- MS/MS quality metrics
+- Dynamic exclusion evaluation
+### .d - Agilent Data Directory
+**Description:** Agilent MS data folder
+**Typical Data:** LC-MS, GC-MS with methods
+**Use Cases:** Agilent MassHunter workflows
+**Python Libraries:**
+- Community parsers
+- ChemStation integration
+**EDA Approach:**
+- Directory structure validation
+- Method parameters
+- Calibration curves
+- Sequence metadata
+- Signal quality metrics
+### .wiff - AB SCIEX Data
+**Description:** AB SCIEX/SCIEX instrument format
+**Typical Data:** Mass spectrometry data
+**Use Cases:** SCIEX instrument workflows
+**Python Libraries:**
+- Vendor SDKs (limited Python support)
+- Conversion tools
+**EDA Approach:**
+- Experiment type identification
+- Scan properties
+- Quantitation data
+- Multi-experiment structure
+### .mgf - Mascot Generic Format
+**Description:** Peak list format for MS/MS
+**Typical Data:** Precursor and fragment masses
+**Use Cases:** Peptide identification, database searches
+**Python Libraries:**
+- `pyteomics.mgf`: `pyteomics.mgf.read('file.mgf')`
+- `pyopenms`: MGF support
+**EDA Approach:**
+- Spectrum count
+- Charge state distribution
+- Precursor m/z and intensity
+- Fragment peak count
+- Mass accuracy
+- Title and metadata parsing
+### .pkl - Peak List (Binary)
+**Description:** Binary peak list format
+**Typical Data:** Serialized MS/MS spectra
+**Use Cases:** Software-specific storage
+**Python Libraries:**
+- `pickle`: Standard deserialization
+- `pyteomics`: PKL support
+**EDA Approach:**
+- Data structure inspection
+- Conversion to standard formats
+- Metadata preservation
+### .ms1 / .ms2 - MS1/MS2 Formats
+**Description:** Simple text format for MS data
+**Typical Data:** MS1 and MS2 scans
+**Use Cases:** Database searching, proteomics
+**Python Libraries:**
+- `pyteomics.ms1` and `pyteomics.ms2`
+- Simple text parsing
+**EDA Approach:**
+- Scan count by level
+- Retention time series
+- Charge state analysis
+- m/z range coverage
+### .pepXML - Peptide XML
+**Description:** TPP peptide identification format
+**Typical Data:** Peptide-spectrum matches
+**Use Cases:** Proteomics search results
+**Python Libraries:**
+- `pyteomics.pepxml`
+**EDA Approach:**
+- Search result statistics
+- Score distribution
+- Modification analysis
+- FDR assessment
+- Enzyme specificity
+### .protXML - Protein XML
+**Description:** TPP protein inference format
+**Typical Data:** Protein identifications
+**Use Cases:** Proteomics protein-level results
+**Python Libraries:**
+- `pyteomics.protxml`
+**EDA Approach:**
+- Protein group analysis
+- Coverage statistics
+- Confidence scoring
+- Parsimony analysis
+### .msp - NIST MS Search Format
+**Description:** NIST spectral library format
+**Typical Data:** Reference mass spectra
+**Use Cases:** Spectral library searching
+**Python Libraries:**
+- `matchms`: Spectral library handling
+- Custom parsers
+**EDA Approach:**
+- Library size and coverage
+- Metadata completeness
+- Peak count statistics
+- Compound annotation quality
+## Infrared and Raman Spectroscopy
+### .spc - Galactic SPC
+**Description:** Thermo Galactic spectroscopy format
+**Typical Data:** IR, Raman, UV-Vis spectra
+**Use Cases:** Various spectroscopy instruments
+**Python Libraries:**
+- `spc`: `spc.File('file.spc')`
+- `specio`: Multi-format reader
+**EDA Approach:**
+- Wavenumber/wavelength range
+- Data point density
+- Multi-spectrum handling
+- Baseline characteristics
+- Peak identification
+- Absorbance/transmittance mode
+- Instrument information
+### .spa - Thermo Nicolet
+**Description:** Thermo Fisher FTIR format
+**Typical Data:** FTIR spectra
+**Use Cases:** OMNIC software data
+**Python Libraries:**
+- Custom binary parsers
+- Conversion to JCAMP or SPC
+**EDA Approach:**
+- Interferogram vs spectrum
+- Background spectrum validation
+- Atmospheric compensation
+- Resolution and scan number
+- Sample information
+### .0 - Bruker OPUS
+**Description:** Bruker OPUS FTIR format (numbered files)
+**Typical Data:** FTIR spectra and metadata
+**Use Cases:** Bruker FTIR instruments
+**Python Libraries:**
+- `brukeropusreader`: OPUS format parser
+- `specio`: OPUS support
+**EDA Approach:**
+- Multiple block types (AB, ScSm, etc.)
+- Sample and reference spectra
+- Instrument parameters
+- Optical path configuration
+- Beam splitter and detector info
+### .dpt - Data Point Table
+**Description:** Simple XY data format
+**Typical Data:** Generic spectroscopic data
+**Use Cases:** Renishaw Raman, generic exports
+**Python Libraries:**
+- `pandas`: CSV-like reading
+- Text parsing
+**EDA Approach:**
+- X-axis type (wavelength, wavenumber, Raman shift)
+- Y-axis units (intensity, absorbance, etc.)
+- Data point spacing
+- Header information
+- Multi-column data handling
+### .wdf - Renishaw Raman
+**Description:** Renishaw WiRE data format
+**Typical Data:** Raman spectra and maps
+**Use Cases:** Renishaw Raman microscopy
+**Python Libraries:**
+- `renishawWiRE`: WDF reader
+- Custom parsers for WDF format
+**EDA Approach:**
+- Spectral vs mapping data
+- Laser wavelength
+- Accumulation and exposure time
+- Spatial coordinates (mapping)
+- Z-scan data
+- Baseline and cosmic ray correction
+### .txt (Spectroscopy)
+**Description:** Generic text export from instruments
+**Typical Data:** Wavelength/wavenumber and intensity
+**Use Cases:** Universal data exchange
+**Python Libraries:**
+- `pandas`: Text file reading
+- `numpy`: Simple array loading
+**EDA Approach:**
+- Delimiter and format detection
+- Header parsing
+- Units identification
+- Multiple spectrum handling
+- Metadata extraction from comments
+## UV-Visible Spectroscopy
+### .asd / .asc - ASD Binary/ASCII
+**Description:** ASD FieldSpec spectroradiometer
+**Typical Data:** Hyperspectral UV-Vis-NIR data
+**Use Cases:** Remote sensing, reflectance spectroscopy
+**Python Libraries:**
+- `spectral.io.asd`: ASD format support
+- Custom parsers
+**EDA Approach:**
+- Wavelength range (UV to NIR)
+- Reference spectrum validation
+- Dark current correction
+- Integration time
+- GPS metadata (if present)
+- Reflectance vs radiance
+### .sp - Perkin Elmer
+**Description:** Perkin Elmer UV/Vis format
+**Typical Data:** UV-Vis spectrophotometer data
+**Use Cases:** PE Lambda instruments
+**Python Libraries:**
+- Custom parsers
+- Conversion to standard formats
+**EDA Approach:**
+- Scan parameters
+- Baseline correction
+- Multi-wavelength scans
+- Time-based measurements
+- Sample/reference handling
+### .csv (Spectroscopy)
+**Description:** CSV export from UV-Vis instruments
+**Typical Data:** Wavelength and absorbance/transmittance
+**Use Cases:** Universal format for UV-Vis data
+**Python Libraries:**
+- `pandas`: Native CSV support
+**EDA Approach:**
+- Lambda max identification
+- Beer's law compliance
+- Baseline offset
+- Path length correction
+- Concentration calculations
+## X-ray and Diffraction
+### .cif - Crystallographic Information File
+**Description:** Crystal structure and diffraction data
+**Typical Data:** Unit cell, atomic positions, structure factors
+**Use Cases:** Crystallography, materials science
+**Python Libraries:**
+- `gemmi`: `gemmi.cif.read_file('file.cif')`
+- `PyCifRW`: CIF reading/writing
+- `pymatgen`: Materials structure analysis
+**EDA Approach:**
+- Crystal system and space group
+- Unit cell parameters
+- Atomic positions and occupancy
+- Thermal parameters
+- R-factors and refinement quality
+- Completeness and redundancy
+- Structure validation
+### .hkl - Reflection Data
+**Description:** Miller indices and intensities
+**Typical Data:** Integrated diffraction intensities
+**Use Cases:** Crystallographic refinement
+**Python Libraries:**
+- Custom parsers (format dependent)
+- Crystallography packages (CCP4, etc.)
+**EDA Approach:**
+- Resolution range
+- Completeness by shell
+- I/sigma distribution
+- Systematic absences
+- Twinning detection
+- Wilson plot
+### .mtz - MTZ Format (CCP4)
+**Description:** Binary crystallographic data
+**Typical Data:** Reflections, phases, structure factors
+**Use Cases:** Macromolecular crystallography
+**Python Libraries:**
+- `gemmi`: MTZ support
+- `cctbx`: Comprehensive crystallography
+**EDA Approach:**
+- Column types and data
+- Resolution limits
+- R-factors (Rwork, Rfree)
+- Phase probability distribution
+- Map coefficients
+- Batch information
+### .xy / .xye - Powder Diffraction
+**Description:** 2-theta vs intensity data
+**Typical Data:** Powder X-ray diffraction patterns
+**Use Cases:** Phase identification, Rietveld refinement
+**Python Libraries:**
+- `pandas`: Simple XY reading
+- `pymatgen`: XRD pattern analysis
+**EDA Approach:**
+- 2-theta range
+- Peak positions and intensities
+- Background modeling
+- Peak width analysis (strain/size)
+- Phase identification via matching
+- Preferred orientation effects
+### .raw (XRD)
+**Description:** Vendor-specific XRD raw data
+**Typical Data:** XRD patterns with metadata
+**Use Cases:** Bruker, PANalytical, Rigaku instruments
+**Python Libraries:**
+- Vendor-specific parsers
+- Conversion tools
+**EDA Approach:**
+- Scan parameters (step size, time)
+- Sample alignment
+- Incident beam setup
+- Detector configuration
+- Background scan validation
+### .gsa / .gsas - GSAS Format
+**Description:** General Structure Analysis System
+**Typical Data:** Powder diffraction for Rietveld
+**Use Cases:** Rietveld refinement
+**Python Libraries:**
+- GSAS-II Python interface
+- Custom parsers
+**EDA Approach:**
+- Histogram data
+- Instrument parameters
+- Phase information
+- Refinement constraints
+- Profile function parameters
+## Electron Spectroscopy
+### .vms - VG Scienta
+**Description:** VG Scienta spectrometer format
+**Typical Data:** XPS, UPS, ARPES spectra
+**Use Cases:** Photoelectron spectroscopy
+**Python Libraries:**
+- Custom parsers for VMS
+- `specio`: Multi-format support
+**EDA Approach:**
+- Binding energy calibration
+- Pass energy and resolution
+- Photoelectron line identification
+- Satellite peak analysis
+- Background subtraction quality
+- Fermi edge position
+### .spe - WinSpec/SPE Format
+**Description:** Princeton Instruments/Roper Scientific
+**Typical Data:** CCD spectra, Raman, PL
+**Use Cases:** Spectroscopy with CCD detectors
+**Python Libraries:**
+- `spe2py`: SPE file reader
+- `spe_loader`: Alternative parser
+**EDA Approach:**
+- CCD frame analysis
+- Wavelength calibration
+- Dark frame subtraction
+- Cosmic ray identification
+- Readout noise
+- Accumulation statistics
+### .pxt - Princeton PTI
+**Description:** Photon Technology International
+**Typical Data:** Fluorescence, phosphorescence spectra
+**Use Cases:** Fluorescence spectroscopy
+**Python Libraries:**
+- Custom parsers
+- Text-based format variants
+**EDA Approach:**
+- Excitation and emission spectra
+- Quantum yield calculations
+- Time-resolved measurements
+- Temperature-dependent data
+- Correction factors applied
+### .dat (Spectroscopy Generic)
+**Description:** Generic binary or text spectroscopy data
+**Typical Data:** Various spectroscopic measurements
+**Use Cases:** Many instruments use .dat extension
+**Python Libraries:**
+- Format-specific identification needed
+- `numpy`, `pandas` for known formats
+**EDA Approach:**
+- Format detection (binary vs text)
+- Header identification
+- Data structure inference
+- Units and axis labels
+- Instrument signature detection
+## Chromatography
+### .chrom - Chromatogram Data
+**Description:** Generic chromatography format
+**Typical Data:** Retention time vs signal
+**Use Cases:** HPLC, GC, LC-MS
+**Python Libraries:**
+- Vendor-specific parsers
+- `pandas` for text exports
+**EDA Approach:**
+- Retention time range
+- Peak detection and integration
+- Baseline drift
+- Resolution between peaks
+- Signal-to-noise ratio
+- Tailing factor
+### .ch - ChemStation
+**Description:** Agilent ChemStation format
+**Typical Data:** Chromatograms and method parameters
+**Use Cases:** Agilent HPLC and GC systems
+**Python Libraries:**
+- `agilent-chemstation`: Community tools
+- Binary format parsers
+**EDA Approach:**
+- Method validation
+- Integration parameters
+- Calibration curve
+- Sample sequence information
+- Instrument status
+### .arw - Empower (Waters)
+**Description:** Waters Empower format
+**Typical Data:** UPLC/HPLC chromatograms
+**Use Cases:** Waters instrument data
+**Python Libraries:**
+- Vendor tools (limited Python access)
+- Database extraction tools
+**EDA Approach:**
+- Audit trail information
+- Processing methods
+- Compound identification
+- Quantitation results
+- System suitability tests
+### .lcd - Shimadzu LabSolutions
+**Description:** Shimadzu chromatography format
+**Typical Data:** GC/HPLC data
+**Use Cases:** Shimadzu instruments
+**Python Libraries:**
+- Vendor-specific parsers
+**EDA Approach:**
+- Method parameters
+- Peak purity analysis
+- Spectral data (if PDA)
+- Quantitative results
+## Other Analytical Techniques
+### .dta - DSC/TGA Data
+**Description:** Thermal analysis data (TA Instruments)
+**Typical Data:** Temperature vs heat flow or mass
+**Use Cases:** Differential scanning calorimetry, thermogravimetry
+**Python Libraries:**
+- Custom parsers for TA formats
+- `pandas` for exported data
+**EDA Approach:**
+- Transition temperature identification
+- Enthalpy calculations
+- Mass loss steps
+- Heating rate effects
+- Baseline determination
+- Purity assessment
+### .run - ICP-MS/ICP-OES
+**Description:** Elemental analysis data
+**Typical Data:** Element concentrations or counts
+**Use Cases:** Inductively coupled plasma MS/OES
+**Python Libraries:**
+- Vendor-specific tools
+- Custom parsers
+**EDA Approach:**
+- Element detection and quantitation
+- Internal standard performance
+- Spike recovery
+- Dilution factor corrections
+- Isotope ratios
+- LOD/LOQ calculations
+### .exp - Electrochemistry Data
+**Description:** Electrochemical experiment data
+**Typical Data:** Potential vs current or charge
+**Use Cases:** Cyclic voltammetry, chronoamperometry
+**Python Libraries:**
+- Custom parsers per instrument (CHI, Gamry, etc.)
+- `galvani`: Biologic EC-Lab files
+**EDA Approach:**
+- Redox peak identification
+- Peak potential and current
+- Scan rate effects
+- Electron transfer kinetics
+- Background subtraction
+- Capacitance calculations

.scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py ADDED Viewed

	@@ -0,0 +1,548 @@

+#!/usr/bin/env python3
+"""
+Exploratory Data Analysis Analyzer
+Analyzes scientific data files and generates comprehensive markdown reports
+"""
+import json
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+def detect_file_type(filepath):
+    """
+    Detect the file type based on extension and content.
+    Returns:
+        tuple: (extension, file_category, reference_file)
+    """
+    file_path = Path(filepath)
+    extension = file_path.suffix.lower()
+    name = file_path.name.lower()
+    # Map extensions to categories and reference files
+    extension_map = {
+        # Chemistry/Molecular
+        "pdb": ("chemistry_molecular", "Protein Data Bank"),
+        "cif": ("chemistry_molecular", "Crystallographic Information File"),
+        "mol": ("chemistry_molecular", "MDL Molfile"),
+        "mol2": ("chemistry_molecular", "Tripos Mol2"),
+        "sdf": ("chemistry_molecular", "Structure Data File"),
+        "xyz": ("chemistry_molecular", "XYZ Coordinates"),
+        "smi": ("chemistry_molecular", "SMILES String"),
+        "smiles": ("chemistry_molecular", "SMILES String"),
+        "pdbqt": ("chemistry_molecular", "AutoDock PDBQT"),
+        "mae": ("chemistry_molecular", "Maestro Format"),
+        "gro": ("chemistry_molecular", "GROMACS Coordinate File"),
+        "log": ("chemistry_molecular", "Gaussian Log File"),
+        "out": ("chemistry_molecular", "Quantum Chemistry Output"),
+        "wfn": ("chemistry_molecular", "Wavefunction Files"),
+        "wfx": ("chemistry_molecular", "Wavefunction Files"),
+        "fchk": ("chemistry_molecular", "Gaussian Formatted Checkpoint"),
+        "cube": ("chemistry_molecular", "Gaussian Cube File"),
+        "dcd": ("chemistry_molecular", "Binary Trajectory"),
+        "xtc": ("chemistry_molecular", "Compressed Trajectory"),
+        "trr": ("chemistry_molecular", "GROMACS Trajectory"),
+        "nc": ("chemistry_molecular", "Amber NetCDF Trajectory"),
+        "netcdf": ("chemistry_molecular", "Amber NetCDF Trajectory"),
+        # Bioinformatics/Genomics
+        "fasta": ("bioinformatics_genomics", "FASTA Format"),
+        "fa": ("bioinformatics_genomics", "FASTA Format"),
+        "fna": ("bioinformatics_genomics", "FASTA Format"),
+        "fastq": ("bioinformatics_genomics", "FASTQ Format"),
+        "fq": ("bioinformatics_genomics", "FASTQ Format"),
+        "sam": ("bioinformatics_genomics", "Sequence Alignment/Map"),
+        "bam": ("bioinformatics_genomics", "Binary Alignment/Map"),
+        "cram": ("bioinformatics_genomics", "CRAM Format"),
+        "bed": ("bioinformatics_genomics", "Browser Extensible Data"),
+        "bedgraph": ("bioinformatics_genomics", "BED with Graph Data"),
+        "bigwig": ("bioinformatics_genomics", "Binary BigWig"),
+        "bw": ("bioinformatics_genomics", "Binary BigWig"),
+        "bigbed": ("bioinformatics_genomics", "Binary BigBed"),
+        "bb": ("bioinformatics_genomics", "Binary BigBed"),
+        "gff": ("bioinformatics_genomics", "General Feature Format"),
+        "gff3": ("bioinformatics_genomics", "General Feature Format"),
+        "gtf": ("bioinformatics_genomics", "Gene Transfer Format"),
+        "vcf": ("bioinformatics_genomics", "Variant Call Format"),
+        "bcf": ("bioinformatics_genomics", "Binary VCF"),
+        "gvcf": ("bioinformatics_genomics", "Genomic VCF"),
+        # Microscopy/Imaging
+        "tif": ("microscopy_imaging", "Tagged Image File Format"),
+        "tiff": ("microscopy_imaging", "Tagged Image File Format"),
+        "nd2": ("microscopy_imaging", "Nikon NIS-Elements"),
+        "lif": ("microscopy_imaging", "Leica Image Format"),
+        "czi": ("microscopy_imaging", "Carl Zeiss Image"),
+        "oib": ("microscopy_imaging", "Olympus Image Format"),
+        "oif": ("microscopy_imaging", "Olympus Image Format"),
+        "vsi": ("microscopy_imaging", "Olympus VSI"),
+        "ims": ("microscopy_imaging", "Imaris Format"),
+        "lsm": ("microscopy_imaging", "Zeiss LSM"),
+        "stk": ("microscopy_imaging", "MetaMorph Stack"),
+        "dv": ("microscopy_imaging", "DeltaVision"),
+        "mrc": ("microscopy_imaging", "Medical Research Council"),
+        "dm3": ("microscopy_imaging", "Gatan Digital Micrograph"),
+        "dm4": ("microscopy_imaging", "Gatan Digital Micrograph"),
+        "dcm": ("microscopy_imaging", "DICOM"),
+        "nii": ("microscopy_imaging", "NIfTI"),
+        "nrrd": ("microscopy_imaging", "Nearly Raw Raster Data"),
+        # Spectroscopy/Analytical
+        "fid": ("spectroscopy_analytical", "NMR Free Induction Decay"),
+        "mzml": ("spectroscopy_analytical", "Mass Spectrometry Markup Language"),
+        "mzxml": ("spectroscopy_analytical", "Mass Spectrometry XML"),
+        "raw": ("spectroscopy_analytical", "Vendor Raw Files"),
+        "d": ("spectroscopy_analytical", "Agilent Data Directory"),
+        "mgf": ("spectroscopy_analytical", "Mascot Generic Format"),
+        "spc": ("spectroscopy_analytical", "Galactic SPC"),
+        "jdx": ("spectroscopy_analytical", "JCAMP-DX"),
+        "jcamp": ("spectroscopy_analytical", "JCAMP-DX"),
+        # Proteomics/Metabolomics
+        "pepxml": ("proteomics_metabolomics", "Trans-Proteomic Pipeline Peptide XML"),
+        "protxml": ("proteomics_metabolomics", "Protein Inference Results"),
+        "mzid": ("proteomics_metabolomics", "Peptide Identification Format"),
+        "mztab": ("proteomics_metabolomics", "Proteomics/Metabolomics Tabular Format"),
+        # General Scientific
+        "npy": ("general_scientific", "NumPy Array"),
+        "npz": ("general_scientific", "Compressed NumPy Archive"),
+        "csv": ("general_scientific", "Comma-Separated Values"),
+        "tsv": ("general_scientific", "Tab-Separated Values"),
+        "xlsx": ("general_scientific", "Excel Spreadsheets"),
+        "xls": ("general_scientific", "Excel Spreadsheets"),
+        "json": ("general_scientific", "JavaScript Object Notation"),
+        "xml": ("general_scientific", "Extensible Markup Language"),
+        "hdf5": ("general_scientific", "Hierarchical Data Format 5"),
+        "h5": ("general_scientific", "Hierarchical Data Format 5"),
+        "h5ad": ("bioinformatics_genomics", "Anndata Format"),
+        "zarr": ("general_scientific", "Chunked Array Storage"),
+        "parquet": ("general_scientific", "Apache Parquet"),
+        "mat": ("general_scientific", "MATLAB Data"),
+        "fits": ("general_scientific", "Flexible Image Transport System"),
+    }
+    ext_clean = extension.lstrip(".")
+    if ext_clean in extension_map:
+        category, description = extension_map[ext_clean]
+        return ext_clean, category, description
+    return ext_clean, "unknown", "Unknown Format"
+def get_file_basic_info(filepath):
+    """Get basic file information."""
+    file_path = Path(filepath)
+    stat = file_path.stat()
+    return {
+        "filename": file_path.name,
+        "path": str(file_path.absolute()),
+        "size_bytes": stat.st_size,
+        "size_human": format_bytes(stat.st_size),
+        "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
+        "extension": file_path.suffix.lower(),
+    }
+def format_bytes(size):
+    """Convert bytes to human-readable format."""
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        if size < 1024.0:
+            return f"{size:.2f} {unit}"
+        size /= 1024.0
+    return f"{size:.2f} PB"
+def load_reference_info(category, extension):
+    """
+    Load reference information for the file type.
+    Args:
+        category: File category (e.g., 'chemistry_molecular')
+        extension: File extension
+    Returns:
+        dict: Reference information
+    """
+    # Map categories to reference files
+    category_files = {
+        "chemistry_molecular": "chemistry_molecular_formats.md",
+        "bioinformatics_genomics": "bioinformatics_genomics_formats.md",
+        "microscopy_imaging": "microscopy_imaging_formats.md",
+        "spectroscopy_analytical": "spectroscopy_analytical_formats.md",
+        "proteomics_metabolomics": "proteomics_metabolomics_formats.md",
+        "general_scientific": "general_scientific_formats.md",
+    }
+    if category not in category_files:
+        return None
+    # Get the reference file path
+    script_dir = Path(__file__).parent
+    ref_file = script_dir.parent / "references" / category_files[category]
+    if not ref_file.exists():
+        return None
+    # Parse the reference file for the specific extension
+    # This is a simplified parser - could be more sophisticated
+    try:
+        with open(ref_file, "r") as f:
+            content = f.read()
+        # Extract section for this file type
+        # Look for the extension heading
+        import re
+        pattern = rf"### \.{extension}[^#]*?(?=###|\Z)"
+        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
+        if match:
+            section = match.group(0)
+            return {"raw_section": section, "reference_file": category_files[category]}
+    except Exception as e:
+        print(f"Error loading reference: {e}", file=sys.stderr)
+    return None
+def analyze_file(filepath):
+    """
+    Main analysis function that routes to specific analyzers.
+    Returns:
+        dict: Analysis results
+    """
+    basic_info = get_file_basic_info(filepath)
+    extension, category, description = detect_file_type(filepath)
+    analysis = {
+        "basic_info": basic_info,
+        "file_type": {"extension": extension, "category": category, "description": description},
+        "reference_info": load_reference_info(category, extension),
+        "data_analysis": {},
+    }
+    # Try to perform data-specific analysis based on file type
+    try:
+        if category == "general_scientific":
+            analysis["data_analysis"] = analyze_general_scientific(filepath, extension)
+        elif category == "bioinformatics_genomics":
+            analysis["data_analysis"] = analyze_bioinformatics(filepath, extension)
+        elif category == "microscopy_imaging":
+            analysis["data_analysis"] = analyze_imaging(filepath, extension)
+        # Add more specific analyzers as needed
+    except Exception as e:
+        analysis["data_analysis"]["error"] = str(e)
+    return analysis
+def analyze_general_scientific(filepath, extension):
+    """Analyze general scientific data formats."""
+    results = {}
+    try:
+        if extension in ["npy"]:
+            import numpy as np
+            data = np.load(filepath)
+            results = {
+                "shape": data.shape,
+                "dtype": str(data.dtype),
+                "size": data.size,
+                "ndim": data.ndim,
+                "statistics": {
+                    "min": float(np.min(data)) if np.issubdtype(data.dtype, np.number) else None,
+                    "max": float(np.max(data)) if np.issubdtype(data.dtype, np.number) else None,
+                    "mean": float(np.mean(data)) if np.issubdtype(data.dtype, np.number) else None,
+                    "std": float(np.std(data)) if np.issubdtype(data.dtype, np.number) else None,
+                },
+            }
+        elif extension in ["npz"]:
+            import numpy as np
+            data = np.load(filepath)
+            results = {
+                "arrays": list(data.files),
+                "array_count": len(data.files),
+                "array_shapes": {name: data[name].shape for name in data.files},
+            }
+        elif extension in ["csv", "tsv"]:
+            import pandas as pd
+            sep = "\t" if extension == "tsv" else ","
+            df = pd.read_csv(filepath, sep=sep, nrows=10000)  # Sample first 10k rows
+            results = {
+                "shape": df.shape,
+                "columns": list(df.columns),
+                "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+                "missing_values": df.isnull().sum().to_dict(),
+                "summary_statistics": (
+                    df.describe().to_dict()
+                    if len(df.select_dtypes(include="number").columns) > 0
+                    else {}
+                ),
+            }
+        elif extension in ["json"]:
+            with open(filepath, "r") as f:
+                data = json.load(f)
+            results = {
+                "type": type(data).__name__,
+                "keys": list(data.keys()) if isinstance(data, dict) else None,
+                "length": len(data) if isinstance(data, (list, dict)) else None,
+            }
+        elif extension in ["h5", "hdf5"]:
+            import h5py
+            with h5py.File(filepath, "r") as f:
+                def get_structure(group, prefix=""):
+                    items = {}
+                    for key in group.keys():
+                        path = f"{prefix}/{key}"
+                        if isinstance(group[key], h5py.Dataset):
+                            items[path] = {
+                                "type": "dataset",
+                                "shape": group[key].shape,
+                                "dtype": str(group[key].dtype),
+                            }
+                        elif isinstance(group[key], h5py.Group):
+                            items[path] = {"type": "group"}
+                            items.update(get_structure(group[key], path))
+                    return items
+                results = {"structure": get_structure(f), "attributes": dict(f.attrs)}
+    except ImportError as e:
+        results["error"] = f"Required library not installed: {e}"
+    except Exception as e:
+        results["error"] = f"Analysis error: {e}"
+    return results
+def analyze_bioinformatics(filepath, extension):
+    """Analyze bioinformatics/genomics formats."""
+    results = {}
+    try:
+        if extension in ["fasta", "fa", "fna"]:
+            from Bio import SeqIO
+            sequences = list(SeqIO.parse(filepath, "fasta"))
+            lengths = [len(seq) for seq in sequences]
+            results = {
+                "sequence_count": len(sequences),
+                "total_length": sum(lengths),
+                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
+                "min_length": min(lengths) if lengths else 0,
+                "max_length": max(lengths) if lengths else 0,
+                "sequence_ids": [seq.id for seq in sequences[:10]],  # First 10
+            }
+        elif extension in ["fastq", "fq"]:
+            from Bio import SeqIO
+            sequences = []
+            for i, seq in enumerate(SeqIO.parse(filepath, "fastq")):
+                sequences.append(seq)
+                if i >= 9999:  # Sample first 10k
+                    break
+            lengths = [len(seq) for seq in sequences]
+            qualities = [
+                sum(seq.letter_annotations["phred_quality"]) / len(seq) for seq in sequences
+            ]
+            results = {
+                "read_count_sampled": len(sequences),
+                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
+                "mean_quality": sum(qualities) / len(qualities) if qualities else 0,
+                "min_length": min(lengths) if lengths else 0,
+                "max_length": max(lengths) if lengths else 0,
+            }
+    except ImportError as e:
+        results["error"] = f"Required library not installed (try: pip install biopython): {e}"
+    except Exception as e:
+        results["error"] = f"Analysis error: {e}"
+    return results
+def analyze_imaging(filepath, extension):
+    """Analyze microscopy/imaging formats."""
+    results = {}
+    try:
+        if extension in ["tif", "tiff", "png", "jpg", "jpeg"]:
+            import numpy as np
+            from PIL import Image
+            img = Image.open(filepath)
+            img_array = np.array(img)
+            results = {
+                "size": img.size,
+                "mode": img.mode,
+                "format": img.format,
+                "shape": img_array.shape,
+                "dtype": str(img_array.dtype),
+                "value_range": [int(img_array.min()), int(img_array.max())],
+                "mean_intensity": float(img_array.mean()),
+            }
+            # Check for multi-page TIFF
+            if extension in ["tif", "tiff"]:
+                try:
+                    frame_count = 0
+                    while True:
+                        img.seek(frame_count)
+                        frame_count += 1
+                except EOFError:
+                    results["page_count"] = frame_count
+    except ImportError as e:
+        results["error"] = f"Required library not installed (try: pip install pillow): {e}"
+    except Exception as e:
+        results["error"] = f"Analysis error: {e}"
+    return results
+def generate_markdown_report(analysis, output_path=None):
+    """
+    Generate a comprehensive markdown report from analysis results.
+    Args:
+        analysis: Analysis results dictionary
+        output_path: Path to save the report (if None, prints to stdout)
+    """
+    lines = []
+    # Title
+    filename = analysis["basic_info"]["filename"]
+    lines.append(f"# Exploratory Data Analysis Report: {filename}\n")
+    lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+    lines.append("---\n")
+    # Basic Information
+    lines.append("## Basic Information\n")
+    basic = analysis["basic_info"]
+    lines.append(f"- **Filename:** `{basic['filename']}`")
+    lines.append(f"- **Full Path:** `{basic['path']}`")
+    lines.append(f"- **File Size:** {basic['size_human']} ({basic['size_bytes']:,} bytes)")
+    lines.append(f"- **Last Modified:** {basic['modified']}")
+    lines.append(f"- **Extension:** `.{analysis['file_type']['extension']}`\n")
+    # File Type Information
+    lines.append("## File Type\n")
+    ft = analysis["file_type"]
+    lines.append(f"- **Category:** {ft['category'].replace('_', ' ').title()}")
+    lines.append(f"- **Description:** {ft['description']}\n")
+    # Reference Information
+    if analysis.get("reference_info"):
+        lines.append("## Format Reference\n")
+        ref = analysis["reference_info"]
+        if "raw_section" in ref:
+            lines.append(ref["raw_section"])
+            lines.append(f"\n*Reference: {ref['reference_file']}*\n")
+    # Data Analysis
+    if analysis.get("data_analysis"):
+        lines.append("## Data Analysis\n")
+        data = analysis["data_analysis"]
+        if "error" in data:
+            lines.append(f"⚠️ **Analysis Error:** {data['error']}\n")
+        else:
+            # Format the data analysis based on what's present
+            lines.append("### Summary Statistics\n")
+            lines.append("```json")
+            lines.append(json.dumps(data, indent=2, default=str))
+            lines.append("```\n")
+    # Recommendations
+    lines.append("## Recommendations for Further Analysis\n")
+    lines.append(
+        f"Based on the file type (`.{analysis['file_type']['extension']}`), consider the following analyses:\n"
+    )
+    # Add specific recommendations based on category
+    category = analysis["file_type"]["category"]
+    if category == "general_scientific":
+        lines.append("- Statistical distribution analysis")
+        lines.append("- Missing value imputation strategies")
+        lines.append("- Correlation analysis between variables")
+        lines.append("- Outlier detection and handling")
+        lines.append("- Dimensionality reduction (PCA, t-SNE)")
+    elif category == "bioinformatics_genomics":
+        lines.append("- Sequence quality control and filtering")
+        lines.append("- GC content analysis")
+        lines.append("- Read alignment and mapping statistics")
+        lines.append("- Variant calling and annotation")
+        lines.append("- Differential expression analysis")
+    elif category == "microscopy_imaging":
+        lines.append("- Image quality assessment")
+        lines.append("- Background correction and normalization")
+        lines.append("- Segmentation and object detection")
+        lines.append("- Colocalization analysis")
+        lines.append("- Intensity measurements and quantification")
+    lines.append("")
+    # Footer
+    lines.append("---")
+    lines.append("*This report was generated by the exploratory-data-analysis skill.*")
+    report = "\n".join(lines)
+    if output_path:
+        with open(output_path, "w") as f:
+            f.write(report)
+        print(f"Report saved to: {output_path}")
+    else:
+        print(report)
+    return report
+def main():
+    """Main CLI interface."""
+    if len(sys.argv) < 2:
+        print("Usage: python eda_analyzer.py <filepath> [output.md]")
+        print("  filepath: Path to the data file to analyze")
+        print("  output.md: Optional output path for markdown report")
+        sys.exit(1)
+    filepath = sys.argv[1]
+    output_path = sys.argv[2] if len(sys.argv) > 2 else None
+    if not os.path.exists(filepath):
+        print(f"Error: File not found: {filepath}")
+        sys.exit(1)
+    # If no output path specified, use the input filename
+    if output_path is None:
+        input_path = Path(filepath)
+        output_path = input_path.parent / f"{input_path.stem}_eda_report.md"
+    print(f"Analyzing: {filepath}")
+    analysis = analyze_file(filepath)
+    print(f"\nGenerating report...")
+    generate_markdown_report(analysis, output_path)
+    print(f"\n✓ Analysis complete!")
+if __name__ == "__main__":
+    main()

.scider/skills/literature-review-agent/SKILL.md ADDED Viewed

	@@ -0,0 +1,357 @@

+---
+name: literature-review-agent
+description: Step 3 of the PaperOrchestra pipeline (arXiv:2604.05018). Execute the literature search strategy from outline.json — discover candidate papers via web search, verify them through Semantic Scholar (Levenshtein > 70 fuzzy title match, temporal cutoff, dedup by paperId), build a BibTeX file, and draft Introduction + Related Work using ≥90% of the verified pool. Runs in parallel with the plotting-agent. TRIGGER when the orchestrator delegates Step 3 or when the user asks to "find citations for my paper", "draft the related work", or "build the bibliography".
+allowed_agents: [writing]
+---
+# Literature Review Agent (Step 3)
+Faithful implementation of the Hybrid Literature Agent from PaperOrchestra
+(Song et al., 2026, arXiv:2604.05018, §4 Step 3, App. D.3, App. F.1 p.46).
+**Cost: ~20–30 LLM calls.** This is one of the two longest steps (the other is
+plotting). Wall-time floor is set by Semantic Scholar's 1 QPS verification
+limit.
+## Inputs
+- `workspace/outline.json` — specifically `intro_related_work_plan` with the
+  Introduction search directions and the 2-4 Related Work methodology
+  clusters
+- `workspace/inputs/conference_guidelines.md` — used to derive `cutoff_date`
+- `workspace/inputs/idea.md`, `workspace/inputs/experimental_log.md` — for
+  framing the Intro and grounding the Related Work positioning
+## Outputs
+- `workspace/citation_pool.json` — verified Semantic Scholar metadata for
+  every paper that survived verification
+- `workspace/refs.bib` — BibTeX file generated from the verified pool
+- `workspace/drafts/intro_relwork.tex` — drafted Introduction and Related
+  Work sections, written into the template, with the rest of the template
+  preserved verbatim
+## Two-phase pipeline (App. D.3)
+```
+PHASE 1 — Parallel Candidate Discovery
+   For each search direction in introduction_strategy.search_directions:
+   For each limitation_search_query in each related_work cluster:
+     - Use the host's web search tool to discover up to ~10 candidate papers.
+     - Run up to 10 discovery queries in parallel (host-permitting).
+     - Collect (title, snippet, url) tuples — no verification yet.
+   → PRE-DEDUP before Phase 2 (see Step 1.5 below)
+PHASE 2 — Sequential Citation Verification (1 QPS, with cache)
+   For each candidate (after pre-dedup), sequentially:
+     0. Check s2_cache.json first (scripts/s2_cache.py --check).
+        If HIT: use cached response, skip live S2 call. No throttle needed.
+        If MISS: proceed with live request below.
+     1. Query Semantic Scholar by title:
+          GET https://api.semanticscholar.org/graph/v1/paper/search?query=<title>
+              &fields=title,abstract,year,authors,venue,externalIds&limit=5
+        (Public endpoint; no API key required. Throttle to 1 QPS for live requests only.)
+     2. Store the S2 response in cache: s2_cache.py --store.
+     3. Pick the top hit. Check Levenshtein title ratio against the original
+        candidate title. If ratio < 70: discard.
+     4. Bonus: if year and venue exactly align with hints, add a +5 point
+        match-quality bonus.
+     5. Require: abstract is non-empty.
+     6. Require: paper.year (or month if known) strictly predates cutoff_date.
+        Months default to day-1: e.g., "October 2024" → 2024-10-01.
+     7. If all checks pass, add to verified pool.
+   After all candidates are verified, dedup by Semantic Scholar paperId.
+```
+The host agent does the LLM/web work; the deterministic helpers in `scripts/`
+do the math.
+## Step-by-step
+### 0. Derive `cutoff_date`
+Parse `conference_guidelines.md` for the submission deadline. The paper aligns
+the research cutoff with the venue's submission deadline (App. D.1):
+| Venue | Cutoff |
+|---|---|
+| CVPR 2025 | Nov 2024 |
+| ICLR 2025 | Oct 2024 |
+| Other | One month before the stated submission deadline |
+Encode as `YYYY-MM-DD`. A month-only date defaults to the first day of that month (e.g., `2024-10-01`).
+### 1. Phase 1: Parallel Candidate Discovery
+From `outline.json`:
+- All `introduction_strategy.search_directions` (3-5 queries)
+- For each cluster in `related_work_strategy.subsections`:
+  - The cluster's `sota_investigation_mission` becomes a search query
+  - All `limitation_search_queries` (1-3 each)
+For each query, **use your host's web search tool** (e.g., `WebSearch` in
+Claude Code, `@web` in Cursor, the search tool in Antigravity). Collect the
+top ~10 candidates per query: title, abstract snippet, source URL.
+If your host supports parallel sub-tasks, fire up to 10 concurrent search
+queries. If not, run sequentially — slower but functionally equivalent.
+#### Optional: Exa as a Phase 1 backend
+If your host has no native web search, OR you want a research-paper-focused
+backend with better signal-to-noise, you can use [Exa](https://exa.ai) via
+the bundled `scripts/exa_search.py` helper. It is **opt-in** and reads
+`EXA_API_KEY` from the environment — the repo never commits a key.
+```bash
+export EXA_API_KEY="your-key-here"   # get one at https://dashboard.exa.ai/
+python skills/literature-review-agent/scripts/exa_search.py \
+    --query "Sparse attention long context transformers" \
+    --num-results 15 \
+    --discovered-for "related_work[2.1]"
+```
+Output is a normalized candidate list ready to merge into
+`raw_candidates.json`. Phase 2 verification (Semantic Scholar fuzzy match,
+cutoff, dedup) is unchanged. See `references/exa-search-cookbook.md` for
+the full recipe, query patterns, cost estimates, and security notes.
+Combine all discovered candidates into a single working list. Tag each with
+the originating query ID so you can later attribute it to "intro" vs
+"related_work[i]".
+### 1.5. Pre-dedup before Phase 2
+**Always run this before starting Phase 2.** Multiple search queries routinely
+return the same papers (e.g., "Attention is All You Need" appears in almost
+every NLP discovery query). Verifying duplicates wastes 30-40% of S2 quota
+at 1 QPS.
+```bash
+python skills/literature-review-agent/scripts/pre_dedup_candidates.py \
+    --in workspace/raw_candidates.json \
+    --out workspace/deduped_candidates.json
+# Prints: "150 candidates → 97 unique (53 duplicates removed)"
+```
+Use `workspace/deduped_candidates.json` as input to Phase 2.
+### 2. Phase 2: Sequential Verification via Semantic Scholar (with cache)
+For each candidate in `deduped_candidates.json`, in **sequential** order:
+**Step A — check cache first** (no S2 call, no throttle needed):
+```bash
+python skills/literature-review-agent/scripts/s2_cache.py \
+    --cache workspace/cache/s2_cache.json \
+    --check "<candidate title>"
+# exit 0 + prints JSON → use cached response, skip Step B
+# exit 1 → proceed to Step B
+```
+**Step B — live S2 request** (cache MISS only, throttle to 1 QPS):
+**Preferred:** use the bundled `scripts/s2_search.py` helper — it handles
+auth, retries, and 429 back-off automatically:
+```bash
+python skills/literature-review-agent/scripts/s2_search.py \
+    --query "<URL-decoded candidate title>" --limit 5
+# If SEMANTIC_SCHOLAR_API_KEY is set the key is forwarded automatically.
+# If not, the public unauthenticated endpoint is used (≤1 QPS, still works).
+```
+Check whether the key is configured before starting Phase 2:
+```bash
+python skills/literature-review-agent/scripts/s2_search.py --check-key
+```
+**Fallback:** if you prefer your host's URL fetch tool, GET:
+```
+https://api.semanticscholar.org/graph/v1/paper/search?query=<URL-encoded title>&limit=5&fields=title,abstract,year,authors,venue,externalIds
+```
+Add header `x-api-key: <SEMANTIC_SCHOLAR_API_KEY>` if the env var is set.
+Be polite: ≤1 request per second for live requests. Cache hits are free.
+**Step C — store in cache** (after every successful live request):
+```bash
+python skills/literature-review-agent/scripts/s2_cache.py \
+    --cache workspace/cache/s2_cache.json \
+    --store "<candidate title>" \
+    --response '<full S2 JSON response>'
+```
+For the top hit:
+```bash
+python skills/literature-review-agent/scripts/levenshtein_match.py \
+    --candidate "Original candidate title" \
+    --found "S2 returned title"
+# prints integer 0-100. Discard if < 70.
+```
+Then check the temporal cutoff:
+```bash
+python skills/literature-review-agent/scripts/check_cutoff.py \
+    --paper-year 2024 \
+    --paper-month 9 \
+    --cutoff 2024-10-01
+# exit 0 if strictly predates, exit 1 if not
+```
+If both checks pass AND the abstract is non-empty, append the paper's full
+S2 metadata to the verified pool.
+### 3. Dedup and assemble the pool
+After all candidates are verified:
+```bash
+python skills/literature-review-agent/scripts/dedupe_by_id.py \
+    --in raw_pool.json \
+    --out workspace/citation_pool.json
+```
+The dedupe script keys on `paperId` (Semantic Scholar's internal unique ID),
+falling back to `externalIds.DOI`, then `externalIds.ArXiv`, then a
+normalized title.
+The script also computes and writes `min_cite_paper_count` =
+`floor(0.9 * len(papers))` — the minimum number of papers the writing step
+must cite (the paper's ≥90% integration rule, App. D.3).
+**Immediately after dedupe_by_id.py**, validate and auto-fix the pool schema:
+```bash
+python skills/literature-review-agent/scripts/validate_pool.py \
+    --pool workspace/citation_pool.json --fix
+# Catches and fixes authors-as-strings, reports missing required fields.
+# Must pass before proceeding to Step 4.
+```
+### 4. Build the BibTeX file
+```bash
+python skills/literature-review-agent/scripts/bibtex_format.py \
+    --pool workspace/citation_pool.json \
+    --out workspace/refs.bib
+```
+The script generates citation keys deterministically from `firstauthor + year
++ first significant word of title` (e.g., `vaswani2017attention`). It writes
+out only `@article` / `@inproceedings` / `@misc` entries — never invents
+fields. It also writes the canonical `bibtex_key` back into each paper record
+in `citation_pool.json`.
+**Immediately after bibtex_format.py**, sync keys in `intro_relwork.tex`:
+```bash
+python skills/literature-review-agent/scripts/sync_keys.py \
+    --pool workspace/citation_pool.json \
+    --tex  workspace/drafts/intro_relwork.tex \
+    --inplace
+# Replaces every \cite{agent_key} with \cite{canonical_bibtex_key}.
+# Eliminates citation_coverage gate failures caused by key mismatch.
+```
+These two steps replace the manual Python snippets that were previously
+required. The pipeline is now:
+```
+dedupe_by_id → validate_pool --fix → bibtex_format → sync_keys
+```
+### 5. Draft Introduction + Related Work
+This is where you (the host agent) actually write text. Load the
+**verbatim Literature Review Agent prompt** at `references/prompt.md`.
+Substitute the template placeholders:
+| Placeholder | Value |
+|---|---|
+| `intro_related_work_plan` | full JSON object from `outline.json` |
+| `project_idea` | contents of `idea.md` |
+| `project_experimental_log` | contents of `experimental_log.md` |
+| `citation_checklist` | the BibTeX keys from `refs.bib` |
+| `collected_papers` | list of `{key, title, abstract}` from `citation_pool.json` |
+| `paper_count` | `len(citation_pool.papers)` |
+| `min_cite_paper_count` | from `citation_pool.json` |
+| `cutoff_date` | the date you derived in Step 0 |
+**Also prepend the Anti-Leakage Prompt** from
+`../paper-orchestra/references/anti-leakage-prompt.md`.
+Run your LLM with the combined prompt against `template.tex`. The agent's
+job is to fill in the empty Introduction and Related Work sections of the
+template **and leave everything else untouched**. Output: the full
+`template.tex` with those two sections filled. Save to
+`workspace/drafts/intro_relwork.tex`.
+### 6. Verify ≥90% citation coverage
+```bash
+python skills/literature-review-agent/scripts/citation_coverage.py \
+    --tex workspace/drafts/intro_relwork.tex \
+    --pool workspace/citation_pool.json
+# exit 0 if ≥90% of pool is cited; exit 1 otherwise
+```
+If the gate fails, re-prompt the writing step explicitly listing the missing
+keys and asking the agent to integrate them where contextually appropriate.
+## Critical rules from the prompt
+These are excerpted from `references/prompt.md`. The host agent MUST honor
+them on the writing call:
+- **Cite ONLY from `collected_papers`.** Never invent BibTeX keys, never
+  reference papers not in the pool.
+- **Cite at least `min_cite_paper_count` of them** in Intro + Related Work
+  combined.
+- **TIMELINE RULE**: Do not treat any papers published after `cutoff_date`
+  as prior baselines to beat. They are concurrent work only.
+- **EVALUATION RULE**: Do not claim our method beats / achieves SOTA over a
+  specific cited paper UNLESS that paper is explicitly evaluated against in
+  `experimental_log.md`. Frame other recent papers strictly as concurrent,
+  orthogonal, or conceptual work.
+- **Output format**: return the full code for the updated `template.tex`,
+  with the two empty sections (Introduction and Related Work) filled in,
+  and **all the other code** (packages, styles, other sections) **identical
+  to the original** template.tex.
+- Wrap output in ```` ```latex ... ``` ```` fences.
+- Do not change `\usepackage[capitalize]{cleveref}` to `cleverref` (there is
+  no `cleverref.sty`).
+## Degraded mode (no web search)
+If your host has no web search tool, switch to degraded mode:
+1. If the user has placed a pre-built `workspace/inputs/refs.bib` in the
+   workspace, load it directly into `workspace/refs.bib` and skip Phase 1
+   and Phase 2.
+2. Otherwise, emit `workspace/drafts/intro_relwork.tex` containing the
+   template with two TODO markers in the Intro and Related Work sections,
+   and tell the user the pipeline cannot complete Step 3 without web search.
+## Resources
+- `references/prompt.md` — verbatim Literature Review Agent prompt from App. F.1
+- `references/discovery-pipeline.md` — Phase 1 + Phase 2 explained in detail
+- `references/verification-rules.md` — Levenshtein cutoff, year alignment, dedup
+- `references/citation-density-rule.md` — the ≥90% integration rule
+- `references/s2-api-cookbook.md` — Semantic Scholar URLs, fields, rate limits
+- `references/exa-search-cookbook.md` — optional Exa backend for Phase 1 (research-paper-focused web search)
+- `scripts/pre_dedup_candidates.py` — **NEW** dedup Phase 1 candidates before Phase 2 (saves 30-40% S2 quota)
+- `scripts/s2_cache.py` — **NEW** persistent S2 response cache (eliminates re-verification on re-runs)
+- `scripts/validate_pool.py` — **NEW** validate & auto-fix citation_pool.json schema (authors format)
+- `scripts/sync_keys.py` — **NEW** sync cite keys in .tex with canonical bibtex_keys after bibtex_format.py
+- `scripts/levenshtein_match.py` — fuzzy title match (ratio > 70)
+- `scripts/check_cutoff.py` — cutoff-date comparison (a year+month date defaults to day 1 of that month)
+- `scripts/dedupe_by_id.py` — dedup verified pool by S2 paperId
+- `scripts/bibtex_format.py` — build refs.bib from JSON pool
+- `scripts/citation_coverage.py` — ≥90% citation coverage gate
+- `scripts/s2_search.py` — **NEW** Semantic Scholar title-search helper; reads `SEMANTIC_SCHOLAR_API_KEY` from env (optional — falls back to unauthenticated)
+- `scripts/exa_search.py` — optional Exa Phase 1 backend (reads `EXA_API_KEY` from env)

.scider/skills/literature-review-agent/references/citation-density-rule.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# Citation Density Rule
+Source: arXiv:2604.05018, App. D.3.
+## The 90% rule
+> ...the system strictly constrains the model to cite only the provided
+> verified papers, explicitly mandating that at least 90% of the gathered
+> literature pool must be actively integrated and cited when synthesizing
+> the Introduction and Related Work sections.
+Why: this is the paper's core defense against citation inflation. The
+literature review pool is built once via the rigorous discovery →
+verification → dedup pipeline. The writing step must then *use* almost all
+of it. This prevents the agent from gathering 50 papers and citing only the
+3 most famous ones, which would defeat the entire literature search.
+## Implementation
+After the Lit Review writing call produces `intro_relwork.tex`:
+```bash
+python scripts/citation_coverage.py \
+    --tex workspace/drafts/intro_relwork.tex \
+    --pool workspace/citation_pool.json \
+    --threshold 0.90
+```
+The script:
+1. Reads `citation_pool.json` and counts `papers[]` (= N).
+2. Computes `min_required = floor(0.90 * N)`.
+3. Greps `intro_relwork.tex` for all `\cite{KEY}`, `\citep{KEY}`, `\citet{KEY}`,
+   `\autocite{KEY}`, `\citeauthor{KEY}`, etc.
+4. Counts the **unique** keys actually cited.
+5. Reports `cited / N` and exits non-zero if `cited < min_required`.
+## What to do on failure
+The script prints the missing keys grouped by `discovered_for` cluster:
+```
+FAIL: 17/22 papers cited (77.3%, need ≥90%)
+Uncited papers (5):
+  - vaswani2017attention      [discovered_for: intro]       (Attention Is All You Need)
+  - he2016deep                [discovered_for: intro]       (Deep Residual Learning ...)
+  - liu2024video              [discovered_for: related_work[2.1]]  (Long Video Generation ...)
+  - chen2024sparse            [discovered_for: related_work[2.2]]  (Sparse Attention Surveys ...)
+  - kim2024transformer        [discovered_for: related_work[2.2]]  (Transformer Scaling Laws ...)
+```
+The host agent should then re-call the Lit Review writing step with an
+appended instruction:
+```
+The previous draft cited only 17 out of 22 verified papers (77.3%, threshold
+is 90%). You MUST integrate the following 5 papers into the appropriate
+sections:
+  - vaswani2017attention (intro): foundational attention reference
+  - he2016deep (intro): foundational ResNet reference
+  - liu2024video (related work 2.1): direct competing approach for long video
+  - chen2024sparse (related work 2.2): sparse attention survey, group with [...]
+  - kim2024transformer (related work 2.2): scaling-laws context
+Do not remove any existing citations. Add new ones where contextually
+appropriate. Re-emit the full template.tex with both sections updated.
+```
+After 2-3 re-prompts, if coverage still falls short, the pipeline should
+emit a warning and proceed — the paper does not specify a hard halt on this,
+only a strong constraint.

.scider/skills/literature-review-agent/references/discovery-pipeline.md ADDED Viewed

	@@ -0,0 +1,151 @@

+# Discovery Pipeline (Phase 1 + Phase 2)
+Source: arXiv:2604.05018, App. D.3 ("Citation Verification") and App. B
+(LLM-call distribution).
+## Phase 1 — Parallel Candidate Discovery
+The paper uses 10 concurrent workers to fan out search-grounded LLM calls
+("Gemini-3-Flash with Google Search grounding"). For our host-agent
+implementation, the equivalent is: spawn up to 10 concurrent search queries
+using the host's native web search tool.
+### Inputs
+From `outline.json`:
+```
+introduction_strategy:
+  search_directions: [q1, q2, q3]              # 3-5 queries
+related_work_strategy:
+  subsections:
+    - methodology_cluster: "..."
+      sota_investigation_mission: "..."        # 1 derived query
+      limitation_search_queries: [q4, q5]      # 1-3 queries
+    - ...
+```
+Total query budget: typically 10-20 queries per paper.
+### Per-query procedure
+For each search query, instruct your host's search tool:
+```
+search("<query>", num_results=10)
+```
+Or, if you've enabled the optional Exa backend (see `exa-search-cookbook.md`):
+```bash
+python scripts/exa_search.py --query "<query>" --num-results 10
+```
+Both paths produce the same normalized candidate format. Collect the top
+10 results per query. Each result should yield:
+- `title` — the paper's title from the search snippet
+- `snippet` — the abstract preview from the search snippet
+- `source_url` — the result URL (often the arXiv abstract page)
+Tag each result with `discovered_for: ["intro"]` or
+`discovered_for: ["related_work[2.1]"]` so you can later trace which cluster
+each citation supports.
+Combine all results across all queries into a single `raw_candidates.json`:
+```json
+{
+  "candidates": [
+    {
+      "title": "Attention Is All You Need",
+      "snippet": "The dominant sequence transduction models...",
+      "source_url": "https://arxiv.org/abs/1706.03762",
+      "discovered_for": ["intro"]
+    },
+    ...
+  ]
+}
+```
+## Phase 2 — Sequential Verification via Semantic Scholar
+The paper enforces strict sequential verification at ≤1 QPS via the public
+Semantic Scholar API. We follow the same constraint.
+### Per-candidate procedure
+1. **Search S2 by title**. Use the host's URL fetch tool:
+   ```
+   GET https://api.semanticscholar.org/graph/v1/paper/search
+       ?query=<URL-encoded(title)>
+       &limit=5
+       &fields=title,abstract,year,authors,venue,externalIds
+   ```
+   No API key required for the public endpoint. Be polite: 1 QPS.
+2. **Take the top hit**. Compare `title` to the candidate `title` via the
+   helper:
+   ```bash
+   python scripts/levenshtein_match.py --candidate "..." --found "..."
+   ```
+   The helper prints an integer 0-100 (the Levenshtein ratio).
+   - **≤ 70 → discard the candidate.** Move on.
+   - **> 70 → continue to checks 3-5.**
+3. **Check abstract presence**. If `abstract` is null or empty → discard.
+   The paper requires every cited entity to have a retrievable abstract for
+   downstream context enrichment in the Section Writing Agent.
+4. **Check temporal cutoff**:
+   ```bash
+   python scripts/check_cutoff.py \
+       --paper-year <year> \
+       --paper-month <month or omit> \
+       --cutoff <YYYY-MM-DD>
+   ```
+   Exit 0 if strictly predates; exit 1 if not. Discard on exit 1.
+5. **Year-alignment bonus**. If the candidate's `discovered_for` query
+   mentioned a specific year and the S2 hit's year matches exactly, record
+   `match_score = ratio + 5`. (This is a soft bonus used for tie-breaking
+   when two candidates dedup to similar entries.)
+6. **Append to verified pool** if all checks pass. Record:
+   ```json
+   {
+     "paperId": "abc123...",
+     "title": "...",
+     "abstract": "...",
+     "year": 2017,
+     "venue": "NeurIPS",
+     "authors": [{"name": "A. Vaswani"}, ...],
+     "externalIds": {"DOI": "...", "ArXiv": "1706.03762"},
+     "match_score": 100,
+     "discovered_for": ["intro"]
+   }
+   ```
+### Rate-limit etiquette
+The S2 public endpoint enforces ~1 QPS without an API key. If you receive
+HTTP 429, sleep 5 seconds and retry. Do not parallelize Phase 2 — verification
+must be strictly sequential.
+For a sense of scale, the paper measures ~20-30 LLM/API calls total per
+Lit Review Agent invocation. At 1 QPS, verifying ~30 candidates takes roughly
+30 seconds of wall-time; verifying 100 candidates takes ~100 seconds.
+## Why two phases
+The split exists because:
+- **Discovery is high-throughput, low-stakes**. You want to cast a wide net
+  fast. Search APIs accept high concurrency.
+- **Verification is low-throughput, high-stakes**. The S2 API protects
+  itself with QPS limits, and the verification step is what keeps the paper
+  honest. Faking a citation is trivially easy without it.
+The paper's design "successfully combines the high-concurrency tolerance of
+the LLM API with the strict throughput limits of the Semantic Scholar API to
+prevent quota-induced latency" (App. B).

.scider/skills/literature-review-agent/references/exa-search-cookbook.md ADDED Viewed

	@@ -0,0 +1,245 @@

+# Exa Search Cookbook (optional Phase 1 backend)
+[Exa](https://exa.ai) is a search engine optimized for finding academic
+papers and other high-quality content. The `literature-review-agent` can
+use Exa as an **OPTIONAL** backend for Phase 1 candidate discovery — useful
+when your host coding agent has no native web search tool, or when you
+want a research-paper-focused search backend with better signal-to-noise
+than general web search.
+> **Exa is opt-in.** The literature-review-agent's default Phase 1 path is
+> "use your host agent's native web search tool" (`WebSearch` in Claude
+> Code, `@web` in Cursor, the search tool in Antigravity, etc.). That
+> requires zero configuration and no API key. Use Exa only if you want
+> to.
+## Why use it
+Exa fills three gaps:
+1. **Hosts with no built-in search.** Aider, OpenCode, and generic CLI
+   agents often lack a native web search tool. Exa gives them one.
+2. **Research-paper-focused results.** Exa's `category: "research paper"`
+   filter returns results with a higher signal-to-noise ratio than general
+   web search for academic queries. For example, the query
+   "PaperOrchestra" returns arXiv pages, conference proceedings, and
+   academic tools rather than general SEO content.
+3. **Batch / non-interactive runs.** When you want a deterministic,
+   scriptable backend rather than going through the host agent's tool
+   interface.
+Exa returns up to 20 results per call (the helper clamps `--num-results` to 1–20), and
+each result includes a `title`, `url`, optional `publishedDate`, and a
+list of `highlights` (snippets) which the helper joins into a `snippet`
+field consumable by the rest of the Phase 1 pipeline.
+## Get a key
+1. Sign up at <https://dashboard.exa.ai/>.
+2. Copy your API key (format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`).
+3. Set it in your environment:
+   ```bash
+   export EXA_API_KEY="paste-key-here"
+   ```
+   Or put it in a `.env` file (which is gitignored — the repo `.gitignore`
+   blocks `*.env` and `.env*` patterns) and source it:
+   ```bash
+   set -a; source .env; set +a
+   ```
+**This repo never commits a key.** The helper reads `EXA_API_KEY` from the
+environment at runtime. The key is your responsibility to provision and
+secure.
+## Run the helper
+```bash
+python skills/literature-review-agent/scripts/exa_search.py \
+    --query "Sparse attention long context transformers" \
+    --num-results 15 \
+    --discovered-for "related_work[2.1]"
+```
+Output (default — normalized to the literature-review-agent candidate
+format):
+```json
+{
+  "candidates": [
+    {
+      "title": "Longformer: The Long-Document Transformer",
+      "snippet": "We present the Longformer, a self-attention mechanism that scales linearly with sequence length...",
+      "source_url": "https://arxiv.org/abs/2004.05150",
+      "discovered_for": ["related_work[2.1]"],
+      "_exa_id": "https://arxiv.org/abs/2004.05150",
+      "_exa_published_date": "2020-04-10T00:00:00.000Z"
+    },
+    ...
+  ]
+}
+```
+This JSON can be merged directly into `workspace/raw_candidates.json`
+before the Phase 2 sequential verification step.
+### Useful flags
+| Flag | Default | Purpose |
+|---|---|---|
+| `--query` | (required) | Search query string |
+| `--num-results` | `10` | 1–20; the helper clamps to this range |
+| `--category` | `"research paper"` | Pass `""` to disable category filtering for broader results |
+| `--highlight-chars` | `4000` | Max characters per highlight (Exa parameter) |
+| `--discovered-for` | `"intro"` | Tag attached to each candidate; use `"related_work[2.1]"` for cluster queries |
+| `--raw` | off | Print the full Exa response JSON instead of normalized candidates |
+## Direct curl recipe
+If you'd rather not use the Python helper (for one-off testing, or to
+invoke from a host agent's `Bash` / `WebFetch` tool directly):
+```bash
+curl -X POST https://api.exa.ai/search \
+  --header "content-type: application/json" \
+  --header "x-api-key: $EXA_API_KEY" \
+  --data '{
+    "query": "PaperOrchestra automated paper writing",
+    "category": "research paper",
+    "numResults": 10,
+    "type": "auto",
+    "contents": {
+      "highlights": {
+        "maxCharacters": 4000
+      }
+    }
+  }'
+```
+The `$EXA_API_KEY` reference assumes the key is in your shell env. **Do
+not** paste the literal key into the curl command in shell history or
+chat — use the env var.
+## Response shape
+```json
+{
+  "requestId": "52fcb70256224863b33f356fdae37c7f",
+  "resolvedSearchType": "neural",
+  "results": [
+    {
+      "id": "https://arxiv.org/abs/2604.05018",
+      "title": "PaperOrchestra: A Multi-Agent Framework for ...",
+      "url": "https://arxiv.org/abs/2604.05018",
+      "publishedDate": "2026-04-06T00:00:00.000Z",
+      "highlights": ["...", "..."],
+      "highlightScores": [0.4, 0.3],
+      "image": "https://...",
+      "favicon": "https://..."
+    }
+  ],
+  "searchTime": 975.2,
+  "costDollars": {
+    "total":  0.007,
+    "search": {"neural": 0.007}
+  }
+}
+```
+## Mapping Exa → literature-review-agent candidate format
+Phase 2 verification (Semantic Scholar fuzzy match → cutoff check → dedup)
+expects candidates in this shape:
+```json
+{
+  "title":          "...",
+  "snippet":        "...",
+  "source_url":     "...",
+  "discovered_for": ["intro"]
+}
+```
+`exa_search.py --normalize` (the default mode) does this mapping:
+| Exa field | Candidate field |
+|---|---|
+| `result.title` | `title` |
+| `result.url` (fallback `result.id`) | `source_url` |
+| `result.highlights` joined and capped at 1500 chars | `snippet` |
+| `--discovered-for` flag | `discovered_for` |
+| `result.id` | `_exa_id` (preserved for debugging) |
+| `result.publishedDate` | `_exa_published_date` (preserved for tie-breaking) |
+Phase 2 verification still goes through Semantic Scholar regardless of
+whether the candidate came from Exa or from the host's native search.
+Exa is ONLY a discovery backend; the verification chain
+(`levenshtein_match.py` → `check_cutoff.py` → `dedupe_by_id.py` →
+`bibtex_format.py` → `citation_coverage.py`) is unchanged.
+## Query patterns
+Match the literature-review-agent's outline-driven query design. Run one
+Exa call per query, then merge all candidate lists:
+| Query type | Source in `outline.json` | Example query | `--discovered-for` |
+|---|---|---|---|
+| Macro context | `introduction_strategy.search_directions[i]` | `"Survey of long-context attention mechanisms 2020-2024"` | `"intro"` |
+| Foundational | same | `"Foundational papers transformer self-attention scaling laws"` | `"intro"` |
+| SOTA scan | `related_work_strategy.subsections[i].sota_investigation_mission` | `"Recent SOTA sparse attention transformers 2024"` | `"related_work[2.1]"` |
+| Limitation hunt | `related_work_strategy.subsections[i].limitation_search_queries[j]` | `"Block-sparse attention failure modes long sequences"` | `"related_work[2.1]"` |
+For the related-work cluster queries, the `--discovered-for` tag matters
+— the downstream `citation_coverage.py` gate uses it to attribute each
+citation to the right cluster when reporting which papers were not yet
+integrated.
+## Cost and rate limits
+Exa pricing is per-query (~$0.007 per neural search at the time of
+writing). For a typical paper with ~15-20 search queries (3-5 intro
+queries + 10-15 related-work queries), one full Lit Review Agent run
+costs ~$0.10-$0.15. Check <https://exa.ai/pricing> for current rates.
+Exa's rate limits are generous; the paper's 10-worker parallel discovery
+pattern is well within them. The pipeline's wall-time floor is still set
+by Semantic Scholar's 1 QPS verification limit, not by Exa.
+## Security
+- **NEVER commit `EXA_API_KEY` to git.** The repo's `.gitignore` blocks
+  `.env`, `*.env`, and `secrets.json` patterns. Keep your key in your
+  shell environment or your secrets manager (the 1Password CLI `op`,
+  Doppler, etc.).
+- The helper reads the key from the environment only. It does NOT accept
+  the key as a command-line argument (which would expose it in shell
+  history).
+- Exa logs requests for billing and quality. Assume Exa can see and retain
+  your queries; don't include sensitive draft text in them.
+## Troubleshooting
+| Symptom | Likely cause | Fix |
+|---|---|---|
+| `ERROR: EXA_API_KEY environment variable not set` | env var missing | `export EXA_API_KEY="..."` |
+| `ERROR: Exa HTTP 401` | invalid or expired key | check the dashboard for the current key |
+| `ERROR: Exa HTTP 429` | rate-limited | back off, lower concurrency |
+| `WARN: Exa returned 0 results` | query too narrow or odd category | broaden the query or try `--category ""` |
+| `Exa network error` | no internet, DNS issue | check your connection; the helper uses urllib stdlib only, no proxy support |
+## When to prefer Exa vs the host's native search
+| Use case | Recommended backend |
+|---|---|
+| Claude Code, Cursor, Antigravity (have native web search) | host's native search (free, integrated) |
+| Aider, OpenCode, generic CLI agents | Exa (gives them search) |
+| Batch reproducible runs | Exa (deterministic backend) |
+| Research-paper-heavy queries | Exa (better academic signal) |
+| One-off interactive runs | host's native search (less friction) |
+You can also mix: use the host's web search for the broad intro queries
+and Exa for the narrow limitation-search queries where the
+research-paper-category filter helps the most.

.scider/skills/literature-review-agent/references/prompt.md ADDED Viewed

	@@ -0,0 +1,77 @@

+# Literature Review Agent — verbatim prompt
+**Source: arXiv:2604.05018, Appendix F.1, page 46 (verbatim).**
+This is the exact prompt used by the Literature Review Agent in the paper.
+Use it as your system message when drafting Introduction and Related Work.
+Substitute the placeholders before sending. The Anti-Leakage Prompt
+(`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be prepended.
+---
+```
+Role: Senior AI Researcher.
+Task: Write the introduction and related work section of a paper.
+You will be given a template.tex, this is the initial skeleton we outlined for
+you. Your job is to fill in two sections: Introduction and Related Work.
+Leave all the other sections untouched.
+Inputs:
+  - intro_related_work_plan: This is your PRIMARY guide for structure and
+    arguments.
+  - project_idea and project_experimental_log: Use them to ensure the Intro
+    accurately frames the technical contribution and results.
+  - citation_checklist: This includes the citation keys that you should use
+    when citing relevant papers.
+  - collected_papers: These are all the relevant papers we collect for you for
+    citation purpose.
+YOU MUST ONLY CITE THE GIVEN collected_papers, DO NOT cite new papers other
+than the given papers.
+Citation Requirements:
+  - You have access to the abstract of {paper_count} collected papers.
+  - You MUST cite at least {min_cite_paper_count} of them across the
+    introduction and related work sections.
+  - Introduction: Cite key statistics, foundational models (CLIP, etc.), and
+    broad problem statements.
+  - Related Work: Do deep comparative citations. Group distinct works (e.g.,
+    "Several methods [A, B, C]...").
+  - Ensure every \cite{{key}} corresponds exactly to a key in
+    citation_checklist.
+  - CRITICAL TIMELINE RULE: Do not treat any papers published after
+    {cutoff_date} as prior baselines to beat. Treat them strictly as
+    concurrent work.
+  - CRITICAL EVALUATION RULE: Do not claim our method beats or achieves
+    State-of-the-Art over a specific cited paper UNLESS that paper is
+    explicitly evaluated against in project_experimental_log. Frame other
+    recent papers strictly as concurrent, orthogonal, or conceptual work.
+  - You need to return the full code for the new template.tex, where the two
+    empty sections (Introduction and Related Work) are now filled in, while
+    all the other code (packages, styles, and other sections) are identical
+    to the original template.tex.
+Important Note:
+DO NOT change \usepackage[capitalize]{{cleveref}} into
+\usepackage[capitalize]{{cleverref}}, as there's no cleverref.sty.
+Output Format:
+You must return the code for the updated template.tex. Make sure to wrap the
+code with ```latex content ```.
+```
+---
+## Placeholder substitution table
+| Placeholder | Source |
+|---|---|
+| `{paper_count}` | `len(citation_pool.papers)` from `workspace/citation_pool.json` |
+| `{min_cite_paper_count}` | `floor(0.9 * paper_count)` — the ≥90% rule |
+| `{cutoff_date}` | Derived from `conference_guidelines.md` — see App. D.1 of the paper |
+The other placeholders (`intro_related_work_plan`, `project_idea`,
+`project_experimental_log`, `citation_checklist`, `collected_papers`) are
+substituted by passing their full file/JSON contents into the user message.

.scider/skills/literature-review-agent/references/s2-api-cookbook.md ADDED Viewed

	@@ -0,0 +1,138 @@

+# Semantic Scholar API Cookbook
+How to verify a candidate paper via the Semantic Scholar Graph API.
+Base: `https://api.semanticscholar.org/graph/v1`
+Reference: <https://api.semanticscholar.org/api-docs/graph>
+## API key (optional)
+The pipeline uses the **public, unauthenticated endpoint** by default — no key
+required.  If you have a Semantic Scholar API key you can pass it via the
+`x-api-key` header to get higher rate limits (useful for large batches).
+Get a free key at <https://api.semanticscholar.org/> then export it once:
+```bash
+export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
+```
+The bundled `scripts/s2_search.py` helper picks this up automatically.  If the
+variable is not set the script falls back to the unauthenticated endpoint — the
+pipeline works fine either way; just keep to ≤1 QPS on live requests.
+```bash
+# check whether the key is configured
+python skills/literature-review-agent/scripts/s2_search.py --check-key
+# search by title (key used automatically if set)
+python skills/literature-review-agent/scripts/s2_search.py \
+    --query "Attention is All You Need" --limit 5
+# print the raw S2 JSON
+python skills/literature-review-agent/scripts/s2_search.py \
+    --query "BERT pre-training" --raw
+```
+The repo never commits a key.  Key management is your responsibility (shell
+environment, 1Password, doppler, etc.).
+## Endpoint 1 — Search by title
+```
+GET /paper/search
+    ?query=<URL-encoded title>
+    &limit=5
+    &fields=title,abstract,year,authors,venue,externalIds
+```
+Example:
+```
+GET https://api.semanticscholar.org/graph/v1/paper/search?query=Attention%20Is%20All%20You%20Need&limit=5&fields=title,abstract,year,authors,venue,externalIds
+```
+Response (truncated):
+```json
+{
+  "total": 12345,
+  "data": [
+    {
+      "paperId": "204e3073870fae3d05bcbc2f6a8e263d9b72e776",
+      "title": "Attention is All you Need",
+      "abstract": "The dominant sequence transduction models are based on...",
+      "year": 2017,
+      "venue": "NeurIPS",
+      "authors": [{"name": "Ashish Vaswani"}, ...],
+      "externalIds": {
+        "DBLP": "conf/nips/VaswaniSPUJGKP17",
+        "ArXiv": "1706.03762",
+        "DOI": "10.5555/3295222.3295349"
+      }
+    },
+    ...
+  ]
+}
+```
+## Endpoint 2 — Get a specific paper by ID
+```
+GET /paper/<paperId>?fields=title,abstract,year,authors,venue,externalIds,citationCount
+```
+## Useful identifiers
+You can pass these as `<paperId>`:
+- S2 internal: `204e3073870fae3d05bcbc2f6a8e263d9b72e776`
+- DOI: `DOI:10.18653/v1/N18-3011`
+- ArXiv: `ARXIV:1706.03762`
+- Corpus ID: `CorpusId:13756489`
+- URL: `URL:https://arxiv.org/abs/1706.03762`
+## Rate limits
+- Unauthenticated: ~1 QPS sustained. Bursts will get 429.
+- Per the paper, "the strict throughput limits of the Semantic Scholar API
+  (1 query per second)" — App. B.
+If you get HTTP 429, sleep 5 seconds before retrying. Don't loop tightly.
+## Fields cheat sheet
+| Field | Type | Required by our pipeline? |
+|---|---|---|
+| `paperId` | string | yes (dedup key) |
+| `title` | string | yes (Levenshtein match) |
+| `abstract` | string | yes (rule 2: must exist) |
+| `year` | int | yes (cutoff check) |
+| `authors[].name` | string | yes (BibTeX author field) |
+| `venue` | string | recommended (BibTeX journal/booktitle) |
+| `externalIds.DOI` | string | recommended (dedup fallback, BibTeX doi) |
+| `externalIds.ArXiv` | string | recommended (dedup fallback) |
+| `publicationDate` | string `YYYY-MM-DD` | optional (more precise cutoff check) |
+| `citationCount` | int | optional (could inform tie-breaking) |
+Always pass `fields=...` explicitly — the default response is minimal and
+will not include the abstract.
+## Error handling
+| Status | Meaning | What to do |
+|---|---|---|
+| 200 | OK | proceed |
+| 400 | bad query syntax | URL-encode the title properly; retry once |
+| 404 | not found | discard the candidate |
+| 429 | rate limited | sleep 5s, retry |
+| 500-503 | S2 down | sleep 30s, retry up to 3 times, then give up |
+## Polite use
+The S2 API is a public service. Do not hammer it. If you have many candidates:
+- Throttle to 1 QPS.
+- Cache responses so re-runs do not re-query S2 (`scripts/s2_cache.py` persists them; the dedup script only collapses duplicate entries in the pool).
+- Do not parallelize. Verification is sequential by design.

.scider/skills/literature-review-agent/references/verification-rules.md ADDED Viewed

	@@ -0,0 +1,100 @@

+# Verification Rules
+Source: arXiv:2604.05018, App. D.3 ("Citation Verification"), verbatim
+specifications below.
+## Rule 1 — Fuzzy title match (Levenshtein > 70)
+> Each candidate must resolve to a valid Semantic Scholar entity via a fuzzy
+> title match (Levenshtein distance ratio > 70 [Levenshtein, 1965]),
+> augmented by a point bonus for exact year alignment.
+Implementation: `scripts/levenshtein_match.py` uses
+`Levenshtein.ratio(a, b) * 100` from the `python-Levenshtein` package and
+returns the integer ratio. Threshold: **strictly greater than 70**.
+Examples:
+| Candidate title | S2 title | Ratio | Verdict |
+|---|---|---|---|
+| "Attention Is All You Need" | "Attention Is All You Need" | 100 | accept |
+| "Attention Is All You Need" | "Attention is All You Need." | 96 | accept |
+| "Sparse Attention for Transformers" | "Sparse Attention in Transformers" | 88 | accept |
+| "Self-Attention" | "Attention Is All You Need" | 47 | reject |
+| "Linformer" | "Linformer: Self-Attention with Linear Complexity" | 28 | reject |
+The Linformer case is the canonical false-negative: a short query against
+a long title. Workaround: when the candidate title looks abbreviated
+(< 4 words) and the S2 hit's title contains the candidate as a substring,
+override the ratio check. The paper does not specify this workaround
+explicitly; we add it as a soft safety net to avoid losing legitimate
+short-title hits. See `levenshtein_match.py --substring-bypass`.
+## Rule 2 — Abstract must exist
+> To enter the final context pool, the entity must possess a retrievable
+> abstract...
+Discard any verified hit where `abstract` is null, empty, or `"N/A"`. The
+Section Writing Agent uses the abstract to ground its citations contextually
+(per the Section Writing Agent prompt: "Read the abstract provided in
+citation_map.json for the papers you are citing. Use this context to write
+accurate, specific sentences about those works.").
+## Rule 3 — Strict temporal cutoff
+> ...and strictly predate the research cutoff (when specified down to the
+> month, the system defaults to the first day of that month).
+Implementation: `scripts/check_cutoff.py`. Comparison rules:
+- Cutoff is given as `YYYY-MM-DD`. The paper aligns it to venue submission
+  deadline (Nov 2024 for CVPR 2025, Oct 2024 for ICLR 2025 — App. D.1).
+- Paper year is required. Paper month is optional.
+- If paper has only year: assume month=12, day=31 (worst case for the paper —
+  must still be < cutoff).
+- If paper has year + month: assume day=1 of that month.
+- "Strictly predate" means `paper_date < cutoff_date`. Equality fails.
+Examples (cutoff = 2024-10-01):
+| Paper year | Paper month | Verdict |
+|---|---|---|
+| 2017 | — | accept |
+| 2024 | 9 | accept (2024-09-01 < 2024-10-01) |
+| 2024 | 10 | reject (2024-10-01 not strictly < 2024-10-01) |
+| 2024 | — (only year) | reject (2024-12-31 ≥ 2024-10-01) |
+The strict comparison is intentional: it prevents leakage of papers from
+the same submission cycle as the target venue.
+## Rule 4 — Dedup by Semantic Scholar paperId
+> Finally, gathered citations are deduplicated using unique paper ID keys.
+Implementation: `scripts/dedupe_by_id.py`. Key precedence:
+1. `paperId` (S2's internal unique ID, always present on a verified hit)
+2. `externalIds.DOI` (lowercased)
+3. `externalIds.ArXiv` (without version suffix)
+4. Normalized title (lowercased, alphanumeric only) — fallback only
+When two candidates collide, keep the one with the higher `match_score`.
+## Rule 5 — ≥90% citation integration
+> The system constrains the model to cite only the provided verified papers,
+> explicitly mandating that at least 90% of the gathered literature pool must
+> be actively integrated and cited when synthesizing the Introduction and
+> Related Work sections.
+Implementation: `scripts/citation_coverage.py`. After the Lit Review writing
+call produces `intro_relwork.tex`, this script:
+1. Extracts every `\cite{KEY}` and `\citep{KEY}` (and variants) from the
+   `.tex` file.
+2. Counts unique cited keys against `len(citation_pool.papers)`.
+3. Requires `cited / total ≥ 0.90`. Exits non-zero if not.
+If the gate fails, the host agent must re-prompt the writing step,
+explicitly listing the un-cited keys and asking the agent to integrate them.

.scider/skills/literature-review-agent/scripts/bibtex_format.py ADDED Viewed

	@@ -0,0 +1,211 @@

+#!/usr/bin/env python3
+"""
+bibtex_format.py — Generate refs.bib from a verified citation pool.
+Reads citation_pool.json (output of dedupe_by_id.py) and emits a BibTeX file
+with deterministic citation keys derived from the first author + year +
+first significant title word.
+Never invents fields. Only writes fields that are actually present in the
+S2 metadata. Writes one of:
+    @article{ ... }      — when venue looks like a journal
+    @inproceedings{ ... }— when venue looks like a conference
+    @misc{ ... }         — fallback (e.g., arXiv-only papers)
+Usage:
+    python bibtex_format.py --pool citation_pool.json --out refs.bib
+"""
+import argparse
+import json
+import re
+import sys
# Lower-cased venue fragments that mark a venue as a conference
# (consulted by is_conference to choose @inproceedings over @article).
CONFERENCE_HINTS = set(
    "neurips nips icml iclr cvpr iccv eccv aaai ijcai acl emnlp naacl "
    "kdd www sigir uai siggraph interspeech icassp miccai wacv bmvc "
    "coling conll".split()
)
# Title words skipped when picking the word that goes into a citation key.
STOPWORDS = set(
    "a an and the of for to with on in by from as is are be via into "
    "their our we this that using use about at or if".split()
)
def normalize(s: str) -> str:
    """Reduce *s* to lowercase ASCII letters for use inside a citation key.

    Accented letters are folded to their base letter first ("Müller" becomes
    "muller") so they are not silently dropped; every remaining character
    outside ``a-z`` is then removed.
    """
    import unicodedata  # local import keeps the block self-contained

    decomposed = unicodedata.normalize("NFKD", s)
    # Drop the combining marks that NFKD split off the base letters.
    base_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z]", "", base_only.lower())
def first_significant_word(title: str) -> str:
    """Return the normalized first title word usable in a citation key.

    Words are runs of ASCII letters (hyphens allowed after the first
    letter).  Stopwords and words of two characters or fewer are skipped;
    when nothing qualifies the placeholder ``"paper"`` is returned.
    """
    lowered_words = (m.group(0).lower() for m in re.finditer(r"[A-Za-z][A-Za-z\-]*", title))
    for word in lowered_words:
        if len(word) <= 2 or word in STOPWORDS:
            continue
        return normalize(word)
    return "paper"
def first_author_lastname(authors: list[dict]) -> str:
    """Return the normalized family name of the first author.

    Falls back to ``"anon"`` when there is no author, the name is missing
    or blank, or the name normalizes to an empty string.  Names written as
    "Last, First" (or "First Last, Jr.") use the part before the comma.
    """
    if not authors:
        return "anon"
    # `or ""` guards against an explicit null name in the metadata.
    name = (authors[0].get("name") or "").strip()
    if "," in name:
        # The family name precedes the first comma in both comma conventions.
        name = name.split(",", 1)[0]
    parts = name.split()
    if not parts:
        return "anon"
    return normalize(parts[-1]) or "anon"
def make_key(paper: dict) -> str:
    """Build the base citation key ``<lastname><year><titleword>``.

    Missing pieces degrade to placeholders supplied here or by the helpers:
    a missing year becomes ``"0000"`` and a missing or null title is
    treated as an empty title.
    """
    last = first_author_lastname(paper.get("authors") or [])
    year = paper.get("year") or "0000"
    # `or ""` also covers an explicit `"title": null`, which would
    # otherwise reach re.findall as None and raise TypeError.
    word = first_significant_word(paper.get("title") or "")
    return f"{last}{year}{word}"
def is_conference(venue: str) -> bool:
    """Heuristically decide whether *venue* names a conference.

    The venue is split into lowercase alphabetic tokens and each token is
    compared as a whole word, so a hint such as "acl" no longer fires inside
    unrelated words ("Oracle", "Obstacles").  A token matches when it

    * starts with "conference", "workshop" or "symposi" (spelled-out venues
      such as "International Conference on Machine Learning"), or
    * is one of CONFERENCE_HINTS, optionally with an ACM "sig" prefix
      ("sigkdd") and/or a trailing workshop "w" ("cvprw").

    "Proceedings" is deliberately not treated as a hint because several
    journals use it in their name.
    """
    if not venue:
        return False
    for tok in re.findall(r"[a-z]+", venue.lower()):
        if tok.startswith(("conference", "workshop", "symposi")):
            return True
        if tok in CONFERENCE_HINTS:
            return True
        stem = tok[3:] if tok.startswith("sig") else tok
        if stem.endswith("w"):
            stem = stem[:-1]
        if stem in CONFERENCE_HINTS:
            return True
    return False
# Characters that are special to BibTeX/LaTeX inside a braced field value.
_BIBTEX_SPECIALS = str.maketrans(
    {
        "{": "\\{",
        "}": "\\}",
        "&": "\\&",
        "%": "\\%",
        "#": "\\#",
        "_": "\\_",
    }
)


def escape_bibtex(s: str) -> str:
    """Escape characters that would break a braced BibTeX field value.

    Besides braces and ``&``, this also escapes ``%`` (would start a LaTeX
    comment), ``#`` and ``_`` (errors outside math mode).  ``$`` is left
    untouched so inline math in a title keeps working.  Falsy input
    (``None`` or ``""``) yields ``""``.
    """
    if not s:
        return ""
    # One translate pass: each source character is replaced exactly once.
    return s.translate(_BIBTEX_SPECIALS)
def author_field(authors: list[dict]) -> str:
    """Join author names into a BibTeX ``author`` value ("A and B and C").

    Entries whose name is missing, null, or blank after stripping are
    skipped so they cannot produce empty slots between "and" separators.
    """
    names = [(a.get("name") or "").strip() for a in authors]
    return " and ".join(escape_bibtex(n) for n in names if n)
def format_entry(paper: dict, key: str) -> str:
    """Render one pool entry as a BibTeX record under citation key *key*.

    The entry type is ``@inproceedings`` when ``is_conference`` accepts the
    venue, ``@article`` for any other non-empty venue and ``@misc`` when
    there is no venue.  Only fields present in *paper* are written.
    """
    venue = paper.get("venue") or ""
    if not venue:
        kind, venue_key = "misc", None
    elif is_conference(venue):
        kind, venue_key = "inproceedings", "booktitle"
    else:
        kind, venue_key = "article", "journal"

    out = [f"@{kind}{{{key},"]

    title = paper.get("title")
    if title:
        out.append(f"  title    = {{{escape_bibtex(title)}}},")
    authors = paper.get("authors")
    if authors:
        out.append(f"  author   = {{{author_field(authors)}}},")
    year = paper.get("year")
    if year:
        out.append(f"  year     = {{{year}}},")
    if venue_key:
        out.append(f"  {venue_key:8s} = {{{escape_bibtex(venue)}}},")

    external = paper.get("externalIds") or {}
    doi = external.get("DOI")
    if doi:
        out.append(f"  doi      = {{{doi}}},")
    arxiv = external.get("ArXiv")
    if arxiv:
        out.extend([f"  eprint   = {{{arxiv}}},", "  archivePrefix = {arXiv},"])

    # The last line written so far must not end with a comma.
    out[-1] = out[-1].rstrip(",")
    out.append("}")
    return "\n".join(out)
def main() -> int:
    """Generate the BibTeX file and write the assigned keys back to the pool.

    Returns 0 on success and 1 when the pool contains no papers.

    Files are read and written as UTF-8 explicitly: the pool is dumped with
    ``ensure_ascii=False``, so relying on the locale encoding can fail on
    non-UTF-8 systems.  Citation keys are guaranteed unique: a colliding
    key gets a letter suffix (second occurrence "b", third "c", ...), and a
    suffixed key that is itself already taken is skipped.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--out", required=True, help="output refs.bib")
    args = p.parse_args()

    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    papers = pool.get("papers", [])
    if not papers:
        print("ERROR: pool contains no papers", file=sys.stderr)
        return 1

    def letter_suffix(n: int) -> str:
        """Map 1 -> 'a', ..., 26 -> 'z', 27 -> 'aa' (never leaves a-z)."""
        out = ""
        while n > 0:
            n, rem = divmod(n - 1, 26)
            out = chr(ord("a") + rem) + out
        return out

    keys_used: dict[str, int] = {}  # base key -> occurrences seen so far
    assigned: set[str] = set()  # every final key handed out
    entries: list[str] = []
    paper_keys: list[str] = []
    for paper in papers:
        base_key = make_key(paper)
        count = keys_used.get(base_key, 0)
        key = base_key
        # Advance the suffix until the key is free; this also covers a
        # suffixed key that equals some other paper's base key.
        while key in assigned:
            count += 1
            key = base_key + letter_suffix(count)
        keys_used[base_key] = max(count, 1)
        assigned.add(key)
        paper["bibtex_key"] = key
        paper_keys.append(key)
        entries.append(format_entry(paper, key))

    with open(args.out, "w", encoding="utf-8") as f:
        f.write("% Generated by paper-orchestra literature-review-agent/bibtex_format.py\n")
        f.write(f"% {len(entries)} entries from citation_pool.json\n\n")
        f.write("\n\n".join(entries))
        f.write("\n")

    # Write the keys back into the pool so later steps can map pool papers
    # to citation keys.
    with open(args.pool, "w", encoding="utf-8") as f:
        json.dump(pool, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(entries)} BibTeX entries → {args.out}")
    print(f"    keys: {', '.join(paper_keys[:5])}{'...' if len(paper_keys) > 5 else ''}")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/check_cutoff.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env python3
+"""
+check_cutoff.py — Strict temporal cutoff check for citation verification.
+Implements the paper's Rule 3 (App. D.3): a paper passes only if its
+publication date strictly predates the research cutoff. When only the year
+is known, assume the worst case (Dec 31). When year + month are known,
+assume day-1 of that month (per the paper's "first day of that month"
+default).
+Exit codes:
+    0  paper strictly predates cutoff (PASS)
+    1  paper does not strictly predate cutoff (FAIL)
+    2  argument error
+Usage:
+    python check_cutoff.py --paper-year 2024 --paper-month 9 --cutoff 2024-10-01
+    python check_cutoff.py --paper-year 2024 --cutoff 2024-10-01
+    python check_cutoff.py --paper-date 2024-09-15 --cutoff 2024-10-01
+"""
+import argparse
+import datetime as dt
+import sys
def main() -> int:
    """Run the strict cutoff check described in the module docstring.

    Returns 0 when the paper date is strictly before the cutoff, 1 when it
    is not, and 2 for unusable arguments.  An out-of-range year or month
    (e.g. month 13 or 0) is reported as an argument error instead of
    escaping as a traceback or being silently treated as "no month".
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--paper-year", type=int, help="Paper publication year")
    p.add_argument("--paper-month", type=int, help="Paper publication month (1-12), optional")
    p.add_argument("--paper-date", help="Full paper date YYYY-MM-DD, overrides year/month")
    p.add_argument("--cutoff", required=True, help="Research cutoff date YYYY-MM-DD")
    args = p.parse_args()

    try:
        cutoff = dt.date.fromisoformat(args.cutoff)
    except ValueError:
        print(f"ERROR: --cutoff must be YYYY-MM-DD, got {args.cutoff}", file=sys.stderr)
        return 2

    if args.paper_date:
        try:
            paper_date = dt.date.fromisoformat(args.paper_date)
        except ValueError:
            print(f"ERROR: --paper-date must be YYYY-MM-DD, got {args.paper_date}", file=sys.stderr)
            return 2
    elif args.paper_year is not None:
        try:
            if args.paper_month is not None:
                # Year + month known: assume the first day of that month.
                paper_date = dt.date(args.paper_year, args.paper_month, 1)
            else:
                # Only the year known: worst case, the last day of the year.
                paper_date = dt.date(args.paper_year, 12, 31)
        except (ValueError, OverflowError) as e:
            print(f"ERROR: invalid --paper-year/--paper-month: {e}", file=sys.stderr)
            return 2
    else:
        print("ERROR: must provide --paper-date OR --paper-year", file=sys.stderr)
        return 2

    if paper_date < cutoff:
        print(f"PASS  paper={paper_date}  <  cutoff={cutoff}")
        return 0
    print(f"FAIL  paper={paper_date}  not strictly before cutoff={cutoff}")
    return 1
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/citation_coverage.py ADDED Viewed

	@@ -0,0 +1,104 @@

+#!/usr/bin/env python3
+"""
+citation_coverage.py — Enforce the paper's ≥90% citation integration rule
+(App. D.3).
+Greps a generated .tex file for all citation commands, counts the unique
+keys actually cited, and compares against the verified citation pool.
+Exits non-zero if coverage < 90%.
+Usage:
+    python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json
+    python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json --threshold 0.85
+"""
+import argparse
+import json
+import re
+import sys
# Matches the common natbib/biblatex citation commands and captures the
# comma-separated key list.  Compared with a plain cite/citep/citet list
# this also accepts \citealp, \citealt, \citeyearpar, capitalised forms
# (\Citet), starred forms (\citet*), up to two optional arguments
# (\citep[see][p.~3]{...}) and whitespace before the arguments.
CITE_RE = re.compile(
    r"\\(?:[Cc]ite(?:p|t|alp|alt|author|year|yearpar)?"
    r"|[Aa]utocite|[Pp]arencite|[Tt]extcite|footcite)"
    r"\*?"
    r"(?:\s*\[[^\]]*\]){0,2}"
    r"\s*\{([^}]+)\}"
)


def extract_cited_keys(tex: str) -> set[str]:
    """Return the set of citation keys referenced anywhere in *tex*.

    Every command matched by CITE_RE contributes its keys; keys are split
    on commas, stripped of surrounding whitespace, and empty pieces are
    ignored.
    """
    return {
        key.strip()
        for match in CITE_RE.finditer(tex)
        for key in match.group(1).split(",")
        if key.strip()
    }
def main() -> int:
    """Check how much of the citation pool the .tex file actually cites.

    Returns 0 when the number of cited pool keys reaches
    ``floor(threshold * pool_size)`` and 1 otherwise (or when the pool has
    no ``bibtex_key`` values).  Keys cited but absent from the pool are
    reported as a warning only.

    Both inputs are read as UTF-8 explicitly; undecodable bytes in the
    .tex file are replaced rather than aborting the check.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tex", required=True, help="LaTeX file to inspect")
    parser.add_argument("--pool", required=True, help="citation_pool.json")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.90,
        help="Minimum integration ratio (default 0.90 per paper)",
    )
    args = parser.parse_args()

    with open(args.tex, encoding="utf-8", errors="replace") as f:
        tex = f.read()
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)

    pool_papers = pool.get("papers", [])
    pool_keys = {paper.get("bibtex_key") for paper in pool_papers if paper.get("bibtex_key")}
    if not pool_keys:
        print("ERROR: pool has no bibtex_keys. Run bibtex_format.py first.", file=sys.stderr)
        return 1

    cited = extract_cited_keys(tex)
    cited_in_pool = cited & pool_keys
    n_pool = len(pool_keys)
    n_cited = len(cited_in_pool)
    ratio = n_cited / n_pool if n_pool else 0.0
    # Round before truncating so float noise (e.g. 0.57 * 100 ==
    # 56.99999999999999) cannot lower the required count by one.
    threshold_n = int(round(args.threshold * n_pool, 9))
    print(
        f"Coverage: {n_cited}/{n_pool} = {ratio*100:.1f}% "
        f"(threshold {args.threshold*100:.0f}% = {threshold_n})"
    )

    # Keys cited but NOT in the pool are reported, not treated as failure.
    foreign = cited - pool_keys
    if foreign:
        print(
            f"\nWARNING: {len(foreign)} cited keys NOT in citation pool "
            f"(violates 'cite ONLY collected_papers' rule):"
        )
        for k in sorted(foreign):
            print(f"  - {k}")

    if n_cited < threshold_n:
        uncited = pool_keys - cited
        print(f"\nFAIL: missing {len(uncited)} pool papers from .tex:")
        # Titles and originating tags make the list usable for re-prompting.
        keyed = [paper for paper in pool_papers if paper.get("bibtex_key")]
        title_by_key = {paper.get("bibtex_key"): paper.get("title", "") for paper in keyed}
        discovered_by_key = {
            paper.get("bibtex_key"): paper.get("discovered_for", []) for paper in keyed
        }
        for k in sorted(uncited):
            tag = ",".join(discovered_by_key.get(k, [])) or "?"
            t = title_by_key.get(k, "")
            print(f"  - {k:40s}  [{tag}]  {t[:60]}")
        return 1

    print("OK: citation coverage meets threshold")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/dedupe_by_id.py ADDED Viewed

	@@ -0,0 +1,98 @@

+#!/usr/bin/env python3
+"""
+dedupe_by_id.py — Deduplicate a verified citation pool using Semantic Scholar
+unique paperId, with DOI / ArXiv / normalized-title fallbacks.
+Implements the paper's Rule 4 (App. D.3): "gathered citations are
+deduplicated using unique paper ID keys".
+Also computes `min_cite_paper_count = floor(0.9 * len(papers))` for the
+≥90% citation integration rule.
+Usage:
+    python dedupe_by_id.py --in raw_pool.json --out citation_pool.json [--cutoff 2024-10-01]
+"""
+import argparse
+import json
+import math
+import re
+import sys
def norm_title(t: str) -> str:
    """Return *t* lowercased with everything except ``a-z0-9`` removed."""
    return re.sub(r"[^a-z0-9]", "", t.lower())


def make_key(paper: dict) -> str:
    """Return the dedup key for *paper*, preferring the strongest identifier.

    Precedence: Semantic Scholar ``paperId``, then DOI (lowercased), then
    arXiv ID without its version suffix, then the normalized title.

    The version suffix is removed only when it is a trailing ``v<digits>``,
    so old-style IDs whose archive name contains a "v" (for example
    "solv-int/9901001v2") keep their full identifier.  A missing or null
    title is treated as empty; a title with no ASCII letters or digits
    falls back to its lowercased, whitespace-collapsed text so unrelated
    non-Latin titles do not all share one key.
    """
    if paper.get("paperId"):
        return f"s2:{paper['paperId']}"
    ext = paper.get("externalIds") or {}
    if ext.get("DOI"):
        return f"doi:{ext['DOI'].lower()}"
    if ext.get("ArXiv"):
        arxiv_id = re.sub(r"v\d+$", "", str(ext["ArXiv"]).strip())
        return f"arxiv:{arxiv_id.lower()}"
    title = paper.get("title") or ""
    normalized = norm_title(title)
    if not normalized:
        normalized = " ".join(title.lower().split())
    return f"title:{normalized}"
def main() -> int:
    """Deduplicate the raw pool and write ``citation_pool.json``.

    Returns 0 on success and 1 when the input holds no candidates.

    When two candidates share a key, the one with the strictly higher
    ``match_score`` survives and the ``discovered_for`` tags of both are
    merged (order preserved).  A missing or null ``match_score`` counts as
    0.  The input may be an object with a "papers" or "candidates" list,
    or a bare list.  Files are read and written as UTF-8 explicitly
    because the output uses ``ensure_ascii=False``.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--in", dest="inp", required=True, help="Raw verified pool JSON")
    p.add_argument("--out", required=True, help="Deduped citation_pool.json")
    p.add_argument("--cutoff", help="Cutoff date YYYY-MM-DD (recorded in output)")
    args = p.parse_args()

    with open(args.inp, encoding="utf-8") as f:
        raw = json.load(f)
    if isinstance(raw, list):
        candidates = raw
    else:
        candidates = raw.get("papers") or raw.get("candidates") or []
    if not candidates:
        print("ERROR: input has neither 'papers' nor 'candidates' key", file=sys.stderr)
        return 1

    by_key: dict[str, dict] = {}
    collisions: list[tuple[str, str]] = []
    for c in candidates:
        key = make_key(c)
        existing = by_key.get(key)
        if existing is None:
            by_key[key] = c
            continue
        tags = (existing.get("discovered_for") or []) + (c.get("discovered_for") or [])
        merged = list(dict.fromkeys(tags))  # preserve order, drop repeats
        if (c.get("match_score") or 0) > (existing.get("match_score") or 0):
            c["discovered_for"] = merged
            by_key[key] = c  # reassigning keeps the key's original position
        else:
            existing["discovered_for"] = merged
        collisions.append((key, c.get("title", "")))

    deduped = list(by_key.values())
    n = len(deduped)
    min_cite = math.floor(0.9 * n)
    out = {
        "papers": deduped,
        "min_cite_paper_count": min_cite,
        "n_total": n,
        "n_collisions_merged": len(collisions),
    }
    if args.cutoff:
        out["cutoff_date"] = args.cutoff

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(candidates)} candidates → {n} unique papers")
    print(f"    {len(collisions)} duplicates merged")
    print(f"    min_cite_paper_count (≥90%): {min_cite}")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/exa_search.py ADDED Viewed

	@@ -0,0 +1,169 @@

+#!/usr/bin/env python3
+"""
+exa_search.py — Optional Exa (https://exa.ai) backend for the literature
+review agent's Phase 1 (parallel candidate discovery) step.
+Exa is a search engine optimized for finding academic papers and other
+high-quality content. It is OPTIONAL — the literature-review-agent works
+fine with any host coding agent's native web search tool. Use Exa only if:
+  - Your host has no built-in web search (e.g., Aider, OpenCode, generic
+    CLI agents).
+  - You want a research-paper-focused search backend with better
+    signal-to-noise than general web search.
+  - You're running the pipeline in batch / non-interactive mode and want
+    a deterministic, scriptable backend.
+This helper reads EXA_API_KEY from the environment. The key is YOUR
+responsibility to provide; this repo never commits one. Get a key at
+https://dashboard.exa.ai/.
+Usage:
+    export EXA_API_KEY="your-key-here"
+    python exa_search.py --query "Sparse attention long context" --num-results 15
+    python exa_search.py --query "..." --raw                       # full JSON
+    python exa_search.py --query "..." --discovered-for "related_work[2.1]"
+Default output: JSON candidates in the literature-review-agent format, ready
+to be merged into raw_candidates.json before Phase 2 verification.
+Exit codes:
+    0  query succeeded
+    1  EXA_API_KEY missing, HTTP error, network error, or empty results
+"""
+import argparse
+import json
+import os
+import sys
+import urllib.error
+import urllib.request
+EXA_ENDPOINT = "https://api.exa.ai/search"
+DEFAULT_NUM = 10
+MAX_NUM = 20  # the user explicitly asked for a 10-20 range
+SNIPPET_CAP = 1500
+def search(query: str, num_results: int, category: str | None, highlight_max_chars: int) -> dict:
+    api_key = os.environ.get("EXA_API_KEY")
+    if not api_key:
+        print(
+            "ERROR: EXA_API_KEY environment variable not set.\n"
+            "Get a key at https://dashboard.exa.ai/ and run:\n"
+            '  export EXA_API_KEY="your-key-here"\n'
+            "Then retry. The literature-review-agent also works without\n"
+            "Exa — see references/discovery-pipeline.md for the default\n"
+            "host-native web search path.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    body: dict = {
+        "query": query,
+        "numResults": num_results,
+        "type": "auto",
+        "contents": {"highlights": {"maxCharacters": highlight_max_chars}},
+    }
+    if category:
+        body["category"] = category
+    req = urllib.request.Request(
+        EXA_ENDPOINT,
+        data=json.dumps(body).encode("utf-8"),
+        headers={
+            "content-type": "application/json",
+            "x-api-key": api_key,
+        },
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+    except urllib.error.HTTPError as e:
+        body_text = e.read().decode("utf-8", errors="replace")[:500]
+        print(f"ERROR: Exa HTTP {e.code}: {body_text}", file=sys.stderr)
+        sys.exit(1)
+    except urllib.error.URLError as e:
+        print(f"ERROR: Exa network error: {e.reason}", file=sys.stderr)
+        sys.exit(1)
def normalize(exa_response: dict, discovered_for: list[str]) -> list[dict]:
    """Convert Exa results into the literature-review-agent candidate format.

    Each result becomes a dict with the stripped title, the highlights
    joined by spaces and truncated to SNIPPET_CAP characters, the result
    URL (falling back to the Exa id), a copy of *discovered_for*, and the
    raw Exa id and published date under underscore-prefixed keys.
    """

    def to_candidate(result: dict) -> dict:
        joined = " ".join(h.strip() for h in (result.get("highlights") or []))
        return {
            "title": (result.get("title") or "").strip(),
            "snippet": joined[:SNIPPET_CAP],
            "source_url": result.get("url") or result.get("id") or "",
            "discovered_for": list(discovered_for),
            "_exa_id": result.get("id"),
            "_exa_published_date": result.get("publishedDate"),
        }

    return [to_candidate(r) for r in exa_response.get("results", [])]
def main() -> int:
    """Command-line entry point: run one Exa query and print JSON to stdout.

    Returns 0 when the query produced results and 1 when Exa returned none.
    With ``--raw`` the Exa response is printed unchanged; otherwise the
    results are converted with ``normalize`` and wrapped under "candidates".
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--query", required=True, help="Search query")
    parser.add_argument(
        "--num-results",
        type=int,
        default=DEFAULT_NUM,
        help=f"Number of results to fetch (default {DEFAULT_NUM}, clamped to [1, {MAX_NUM}])",
    )
    parser.add_argument(
        "--category",
        default="research paper",
        help='Exa category filter (default "research paper"; pass an empty string to disable)',
    )
    parser.add_argument(
        "--highlight-chars",
        type=int,
        default=4000,
        help="Max characters per highlight (default 4000)",
    )
    parser.add_argument(
        "--discovered-for",
        default="intro",
        help=(
            'Tag to attach to each candidate (default "intro"). '
            'Use "related_work[2.1]" or similar for cluster-specific queries '
            "so the downstream citation_coverage gate can attribute the "
            "citation to the right section."
        ),
    )
    parser.add_argument(
        "--raw",
        action="store_true",
        help="Print the full Exa response JSON unmodified instead of normalized candidates",
    )
    args = parser.parse_args()

    # Clamp the requested count into [1, MAX_NUM]; "" disables the category.
    wanted = max(1, min(MAX_NUM, args.num_results))
    response = search(args.query, wanted, args.category or None, args.highlight_chars)
    if not response.get("results"):
        print(f"WARN: Exa returned 0 results for query: {args.query!r}", file=sys.stderr)
        return 1

    if args.raw:
        payload = response
    else:
        payload = {"candidates": normalize(response, [args.discovered_for])}
    json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/levenshtein_match.py ADDED Viewed

	@@ -0,0 +1,73 @@

+#!/usr/bin/env python3
+"""
+levenshtein_match.py — Fuzzy title match for citation verification.
+Implements the paper's Rule 1 (App. D.3): a candidate paper passes only if
+its title's Levenshtein ratio against the Semantic Scholar hit's title is
+strictly greater than 70.
+Includes an opt-in substring-bypass safety net for short candidate titles
+(the Linformer false-negative case): with --substring-bypass, if the
+candidate is < 4 words and is contained in the S2 hit's title, the ratio
+is raised to at least 95.
+Exit code is always 0; the integer ratio is printed to stdout. The caller
+parses it and decides whether to discard.
+Usage:
+    python levenshtein_match.py --candidate "..." --found "..."
+    python levenshtein_match.py --candidate "..." --found "..." --substring-bypass
+"""
+import argparse
+import re
+import sys
+try:
+    import Levenshtein
+except ImportError:
+    print(
+        "ERROR: python-Levenshtein required. Install with: pip install python-Levenshtein",
+        file=sys.stderr,
+    )
+    sys.exit(2)
def normalize(s: str) -> str:
    """Lowercase *s*, turn punctuation into spaces and collapse whitespace.

    Whitespace is trimmed after punctuation has been replaced, so a title
    that ends in punctuation ("Linformer!") does not keep a trailing
    space that would distort the ratio or a containment check.
    """
    s = re.sub(r"[^a-z0-9\s]", " ", s.lower())
    return " ".join(s.split())
def ratio(a: str, b: str, substring_bypass: bool = False) -> int:
    """Return the Levenshtein ratio of the normalized titles as an int 0-100.

    With *substring_bypass*, a non-empty candidate *a* of fewer than four
    words that occurs in *b* as a run of whole words is raised to at least
    95.  An empty candidate never triggers the bypass, and partial-word
    hits ("bert" inside "roberta") do not count as containment.
    """
    # strip() keeps the whole-word check valid even if normalize leaves
    # edge whitespace behind.
    na, nb = normalize(a).strip(), normalize(b).strip()
    r = int(round(Levenshtein.ratio(na, nb) * 100))
    if substring_bypass and na and len(na.split()) < 4:
        # Padding both sides with spaces turns `in` into a word-boundary test.
        if f" {na} " in f" {nb} ":
            return max(r, 95)
    return r
def main() -> int:
    """Print "<ratio> PASS" or "<ratio> FAIL" for one title pair; return 0.

    The verdict is PASS only when the ratio is strictly greater than
    ``--threshold``.  The return value is 0 in both cases.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--candidate", required=True, help="The original candidate title (from web search)"
    )
    parser.add_argument("--found", required=True, help="The title returned by Semantic Scholar")
    parser.add_argument(
        "--substring-bypass",
        action="store_true",
        help="Bump short-candidate substring matches to 95",
    )
    parser.add_argument(
        "--threshold", type=int, default=70, help="Print PASS/FAIL alongside the ratio (default 70)"
    )
    opts = parser.parse_args()

    score = ratio(opts.candidate, opts.found, opts.substring_bypass)
    if score > opts.threshold:
        verdict = "PASS"
    else:
        verdict = "FAIL"
    print(f"{score} {verdict}")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py ADDED Viewed

	@@ -0,0 +1,156 @@

+#!/usr/bin/env python3
+"""
+pre_dedup_candidates.py — Deduplicate Phase 1 raw candidates by normalized
+title before Phase 2 Semantic Scholar verification.
+Multiple search queries in Phase 1 often return the same papers. Verifying
+duplicates wastes S2 quota (1 QPS hard cap) and adds 30-40% unnecessary
+wall-time. This script removes obvious duplicates — same paper found via
+multiple queries — before the sequential verification loop begins.
+Dedup strategy (in order of preference):
+1. Exact arXiv ID match extracted from source URL or snippet.
+2. Levenshtein ratio >= 92 on normalized titles (high threshold to avoid
+   false collisions between similarly-named papers).
+When two candidates are considered the same, we keep the one that appeared
+earlier in the list and merge their `discovered_for` attribution tags so
+the surviving entry is credited to all originating queries.
+Usage:
+    python pre_dedup_candidates.py \\
+        --in workspace/raw_candidates.json \\
+        --out workspace/deduped_candidates.json
+Input JSON shape:
+    {"candidates": [{"title": "...", "url": "...", "snippet": "...",
+                     "discovered_for": ["intro.1"]}, ...]}
+    OR a bare list.
+"""
+import argparse
+import json
+import re
+import sys
# Captures a new-style arXiv identifier (YYMM.NNNN or YYMM.NNNNN) from an
# arxiv.org abs/pdf URL; a version suffix is not part of the capture.
ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})", re.IGNORECASE)


def norm_title(t: str) -> str:
    """Lowercase *t*, replace non-alphanumerics with spaces, collapse spaces."""
    lowered = t.lower()
    spaced = re.sub(r"[^a-z0-9 ]", " ", lowered)
    words = spaced.split()
    return " ".join(words)
def levenshtein_ratio(a: str, b: str) -> float:
    """Return a 0-100 similarity: ``(1 - edit_distance / longer_length) * 100``.

    Two empty strings score 100.0; exactly one empty string scores 0.0.
    The edit distance is computed row by row with unit costs for insertion,
    deletion and substitution.
    """
    if not (a or b):
        return 100.0
    if not (a and b):
        return 0.0

    # Walk the longer string in the outer loop so each row has the length
    # of the shorter string plus one.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    width = len(shorter)

    row = list(range(width + 1))
    for i, ch_long in enumerate(longer, start=1):
        nxt = [i]
        for j in range(width):
            substitute = row[j] + (0 if ch_long == shorter[j] else 1)
            nxt.append(min(row[j + 1] + 1, nxt[j] + 1, substitute))
        row = nxt

    return (1.0 - row[width] / len(longer)) * 100.0
def extract_arxiv_id(candidate: dict) -> str | None:
    """Return the first arXiv ID found in the candidate's URL or snippet.

    Checks the "url" field, then "source_url" (the key exa_search.py
    writes), then "snippet".  Fields that are missing or not strings are
    skipped.  Returns ``None`` when no field contains an arxiv.org
    abs/pdf link.
    """
    for field in ("url", "source_url", "snippet"):
        text = candidate.get(field)
        if not isinstance(text, str):
            continue
        m = ARXIV_RE.search(text)
        if m:
            return m.group(1)
    return None
def make_exact_key(candidate: dict) -> str:
    """Canonical key: arXiv ID if extractable, else normalized title.

    A missing or null title is treated as an empty title, giving the key
    ``"title:"``, instead of raising inside ``norm_title``.
    """
    aid = extract_arxiv_id(candidate)
    if aid:
        return f"arxiv:{aid}"
    return f"title:{norm_title(candidate.get('title') or '')}"
def merge_discovered_for(a: dict, b: dict) -> list:
    """Return the ``discovered_for`` tags of *a* then *b* without repeats.

    First-seen order is kept; a missing or empty tag list counts as ``[]``.
    """
    tags_a = a.get("discovered_for") or []
    tags_b = b.get("discovered_for") or []
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    ordered_unique = dict.fromkeys(tags_a + tags_b)
    return list(ordered_unique)
def dedup(candidates: list[dict], title_ratio_threshold: float = 92.0) -> list[dict]:
    """Remove duplicate candidates, keeping the earliest of each group.

    Pass 1 merges candidates with the same exact key (arXiv ID or identical
    normalized title).  Pass 2 merges candidates whose normalized titles
    have a ``levenshtein_ratio`` of at least *title_ratio_threshold*.  In
    both passes the survivor's ``discovered_for`` tags absorb the tags of
    the dropped entry.

    Candidates with no usable title and no arXiv ID are never merged with
    one another: there is nothing to show they are the same paper, and an
    empty-vs-empty title comparison would otherwise score 100.
    """
    # Pass 1: exact key dedup.
    by_key: dict[str, dict] = {}
    for idx, c in enumerate(candidates):
        key = make_exact_key(c)
        if key == "title:":
            # Empty normalized title and no arXiv ID: keep the entry
            # distinct by keying it on its position.
            key = f"untitled:{idx}"
        if key in by_key:
            by_key[key]["discovered_for"] = merge_discovered_for(by_key[key], c)
        else:
            by_key[key] = dict(c)  # shallow copy so the input dict is not modified
    deduped = list(by_key.values())

    # Pass 2: fuzzy title dedup — pairwise comparison of the survivors.
    normed = [norm_title(c.get("title") or "") for c in deduped]
    drop: set[int] = set()
    for i, title_i in enumerate(normed):
        if i in drop or not title_i:
            continue
        for j in range(i + 1, len(deduped)):
            if j in drop or not normed[j]:
                continue
            if levenshtein_ratio(title_i, normed[j]) >= title_ratio_threshold:
                deduped[i]["discovered_for"] = merge_discovered_for(deduped[i], deduped[j])
                drop.add(j)
    return [c for idx, c in enumerate(deduped) if idx not in drop]
def main() -> int:
    """Deduplicate the Phase 1 candidates file and write the result.

    Returns 0 on success and 1 when the input is neither a JSON array nor
    an object holding a list under "candidates" (or "papers").  Files are
    read and written as UTF-8 explicitly because the output is dumped with
    ``ensure_ascii=False``.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--in", dest="inp", required=True, help="Raw Phase 1 candidates JSON")
    p.add_argument("--out", required=True, help="Deduped candidates JSON")
    p.add_argument(
        "--title-ratio",
        type=float,
        default=92.0,
        help="Levenshtein ratio threshold for fuzzy title match (default: 92)",
    )
    args = p.parse_args()

    with open(args.inp, encoding="utf-8") as f:
        raw = json.load(f)

    if isinstance(raw, list):
        candidates = raw
    elif isinstance(raw, dict):
        candidates = raw.get("candidates") or raw.get("papers") or []
    else:
        # A bare string/number/null has no .get(); report it as bad input.
        candidates = None
    if not isinstance(candidates, list):
        print("ERROR: input must be a JSON array or object with 'candidates' key", file=sys.stderr)
        return 1

    before = len(candidates)
    result = dedup(candidates, title_ratio_threshold=args.title_ratio)
    after = len(result)
    removed = before - after

    out_obj = {
        "candidates": result,
        "n_before_dedup": before,
        "n_after_dedup": after,
        "n_removed": removed,
    }
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out_obj, f, indent=2, ensure_ascii=False)

    print(f"OK: {before} candidates → {after} unique ({removed} duplicates removed)")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/s2_cache.py ADDED Viewed

	@@ -0,0 +1,113 @@

+#!/usr/bin/env python3
+"""
+s2_cache.py — Persistent Semantic Scholar verification cache.
+Problem: Phase 2 verification is throttled to 1 QPS. If a pipeline run
+fails partway through (gate error, network timeout, interrupted session),
+re-running wastes the full S2 wait time again on already-verified papers.
+Solution: a flat JSON cache at workspace/cache/s2_cache.json. On a cache
+HIT the script emits the stored response and exits 0 so the caller can skip
+the live S2 request. On a cache MISS it exits 1. After a live request the
+caller stores the result with --store.
+The cache key is derived from the normalized query title (lowercase,
+alphanumeric only) so minor whitespace differences still hit.
+Usage:
+  CHECK mode — exits 0 + prints JSON if cached, else exits 1:
+    python s2_cache.py --cache workspace/cache/s2_cache.json \\
+        --check "Attention Is All You Need"
+  STORE mode — write a response into the cache:
+    python s2_cache.py --cache workspace/cache/s2_cache.json \\
+        --store "Attention Is All You Need" \\
+        --response '{"paperId": "...", "title": "..."}'
+  STATS mode — print the cache size and a sample of cached keys:
+    python s2_cache.py --cache workspace/cache/s2_cache.json --stats
+"""
+import argparse
+import json
+import os
+import re
+import sys
def norm_key(title: str) -> str:
    """Return the cache key for *title*: lowercase letters and digits only.

    Letters and digits of any script are kept, so titles written in
    non-Latin scripts get distinct keys instead of all collapsing to the
    empty string.  Keys for plain-ASCII titles are the same as with an
    ``[^a-z0-9]`` filter.
    """
    return "".join(ch for ch in title.lower() if ch.isalnum())
def load_cache(path: str) -> dict:
    """Load the cache mapping from *path*, or return ``{}`` if unusable.

    Best-effort by design: a missing file, undecodable or malformed JSON,
    and JSON whose top level is not an object all yield an empty cache
    rather than an error.  The file is read as UTF-8.
    """
    if not os.path.isfile(path):
        return {}
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}
    # A list or scalar at the top level cannot serve as the key -> response map.
    return data if isinstance(data, dict) else {}
def save_cache(path: str, cache: dict) -> None:
    """Write *cache* to *path* as UTF-8 JSON, creating parent directories.

    The data is written to a temporary file in the same directory and then
    moved into place with ``os.replace``, so an interrupted write leaves
    the previous cache file intact instead of a truncated one.
    """
    target = os.path.abspath(path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    tmp_path = f"{target}.tmp.{os.getpid()}"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, target)
    finally:
        # Only present here if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def main() -> int:
    """Dispatch the --check / --store / --stats cache operations.

    Return codes: ``--stats`` always 0; ``--check`` 0 on a hit (the stored
    JSON is printed) and 1 on a miss; ``--store`` 0 on success and 2 when
    ``--response`` is missing or is not valid JSON.

    The mode is chosen by testing ``args.check is not None`` so that an
    empty title passed to ``--check`` is handled as a lookup (a miss)
    rather than falling through into store mode.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--cache", required=True, help="Path to cache JSON file")
    mode = p.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--check",
        metavar="TITLE",
        help="Check for title; exit 0 + print JSON if found, else exit 1",
    )
    mode.add_argument(
        "--store", metavar="TITLE", help="Store a response for TITLE (requires --response)"
    )
    mode.add_argument("--stats", action="store_true", help="Print cache statistics")
    p.add_argument(
        "--response", metavar="JSON", help="S2 response JSON to store (used with --store)"
    )
    args = p.parse_args()

    cache = load_cache(args.cache)

    if args.stats:
        print(f"Cache file : {args.cache}")
        print(f"Entries    : {len(cache)}")
        if cache:
            print("Sample keys:", list(cache.keys())[:5])
        return 0

    if args.check is not None:
        key = norm_key(args.check)
        if key in cache:
            print(json.dumps(cache[key]))
            return 0  # HIT
        return 1  # MISS

    # --store mode
    if not args.response:
        print("ERROR: --store requires --response", file=sys.stderr)
        return 2
    try:
        response = json.loads(args.response)
    except json.JSONDecodeError as e:
        print(f"ERROR: invalid JSON in --response: {e}", file=sys.stderr)
        return 2

    key = norm_key(args.store)
    cache[key] = response
    save_cache(args.cache, cache)
    print(f"OK: cached '{args.store}' → key '{key}' ({len(cache)} total entries)")
    return 0
+if __name__ == "__main__":
+    sys.exit(main())

.scider/skills/literature-review-agent/scripts/s2_search.py ADDED Viewed

	@@ -0,0 +1,208 @@

+#!/usr/bin/env python3
+"""
+s2_search.py — Semantic Scholar title-search helper for Phase 2 verification.
+Queries the Semantic Scholar Graph API for a paper by title and returns the
+top candidate hits as JSON.  Used by the literature-review-agent to verify
+each candidate from Phase 1 before adding it to citation_pool.json.
+API key (optional):
+    If SEMANTIC_SCHOLAR_API_KEY is set in the environment the key is forwarded
+    via the ``x-api-key`` header, which raises the rate limit from ~100 req/5 min
+    (unauthenticated) to 1 req/s sustained with higher burst headroom.
+    If the variable is absent the script falls back to the public unauthenticated
+    endpoint — the pipeline works fine without a key; just keep to ≤1 QPS.
+    Get a free key at: https://api.semanticscholar.org/
+    Then export it once before running the pipeline:
+        export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
+Usage:
+    # check for key and search
+    python s2_search.py --query "Attention is All You Need"
+    # request more hits and extra fields
+    python s2_search.py --query "BERT pre-training" --limit 10 \\
+        --fields title,abstract,year,authors,venue,externalIds,citationCount
+    # pretty-print raw S2 JSON
+    python s2_search.py --query "GPT-4 technical report" --raw
+Exit codes:
+    0  at least one result returned
+    1  HTTP error, network error, or zero results
+    2  usage error (bad arguments)
+"""
+import argparse
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+S2_BASE = "https://api.semanticscholar.org/graph/v1"  # API root; search() appends /paper/search
+DEFAULT_FIELDS = "title,abstract,year,authors,venue,externalIds"  # default value of --fields
+DEFAULT_LIMIT = 5  # default value of --limit
+MAX_LIMIT = 100  # main() clamps --limit into the range 1..MAX_LIMIT
+_RETRY_SLEEP = 5  # seconds to wait after a 429 before retrying
def _build_headers() -> dict:
    """Return the HTTP headers to send with a Semantic Scholar request.

    ``Accept: application/json`` is always included. ``x-api-key`` is added
    only when SEMANTIC_SCHOLAR_API_KEY is set to a non-blank value; surrounding
    whitespace is stripped from the key.
    """
    token = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
    if not token:
        return {"Accept": "application/json"}
    return {"Accept": "application/json", "x-api-key": token}
def search(query: str, limit: int, fields: str, retries: int = 3) -> dict:
    """Call ``/paper/search`` and return the parsed JSON response.

    Args:
        query: Paper title or free-text query.
        limit: Maximum number of hits to request.
        fields: Comma-separated list of fields to request.
        retries: Total number of attempts for rate-limit (429) and transient
            server errors (500/502/503/504).

    Returns:
        The decoded JSON body. A 404 is reported as an empty result set
        (``{"total": 0, "data": []}``).

    Raises:
        SystemExit: with status 1 on any unrecoverable error (retries
            exhausted, other HTTP errors, network failures, non-JSON body), so
            the CLI gets a clean non-zero exit code instead of a traceback.
    """
    params = urllib.parse.urlencode(
        {
            "query": query,
            "limit": limit,
            "fields": fields,
        }
    )
    url = f"{S2_BASE}/paper/search?{params}"
    headers = _build_headers()
    for attempt in range(1, retries + 1):
        req = urllib.request.Request(url, headers=headers, method="GET")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                payload = resp.read()
        except urllib.error.HTTPError as exc:
            if exc.code == 429:
                if attempt < retries:
                    print(
                        f"WARN: S2 rate-limited (429). Sleeping {_RETRY_SLEEP}s "
                        f"before retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(_RETRY_SLEEP)
                    continue
                print(
                    "ERROR: S2 rate-limited (429) and retries exhausted.\n"
                    "Tip: set SEMANTIC_SCHOLAR_API_KEY to get a higher rate limit.\n"
                    "     See https://api.semanticscholar.org/ for a free key.",
                    file=sys.stderr,
                )
                sys.exit(1)
            if exc.code == 404:
                # not found — return an empty result set (caller handles this)
                return {"total": 0, "data": []}
            # 504 (gateway timeout) is transient in the same way as 500/502/503.
            if exc.code in (500, 502, 503, 504):
                if attempt < retries:
                    print(
                        f"WARN: S2 server error ({exc.code}). Sleeping 30s before "
                        f"retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(30)
                    continue
                print(
                    f"ERROR: S2 server error ({exc.code}) after {retries} attempts.",
                    file=sys.stderr,
                )
                sys.exit(1)
            body = exc.read().decode("utf-8", errors="replace")[:400]
            print(f"ERROR: S2 HTTP {exc.code}: {body}", file=sys.stderr)
            sys.exit(1)
        except urllib.error.URLError as exc:
            print(f"ERROR: Network error reaching Semantic Scholar: {exc.reason}", file=sys.stderr)
            sys.exit(1)
        except OSError as exc:
            # Timeouts or resets while reading the body are raised as plain
            # OSError/TimeoutError, not wrapped in URLError.
            print(f"ERROR: Network error reaching Semantic Scholar: {exc}", file=sys.stderr)
            sys.exit(1)
        try:
            return json.loads(payload.decode("utf-8"))
        except ValueError as exc:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            print(f"ERROR: S2 returned a response that is not valid JSON: {exc}", file=sys.stderr)
            sys.exit(1)
    # Only reachable when retries < 1, i.e. no attempt was made.
    print(f"ERROR: no request attempted (retries={retries}).", file=sys.stderr)
    sys.exit(1)
def main() -> int:
    """Command-line entry point for the Semantic Scholar title search.

    Returns:
        0 when at least one result was returned (or for ``--check-key``);
        1 when the search produced zero results.
        HTTP and network failures exit with status 1 inside ``search``; bad
        arguments exit with status 2 via argparse.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Not marked required at the argparse level so that --check-key can be used
    # on its own; presence is enforced below for the search path.
    p.add_argument(
        "--query",
        help="Paper title (or search query) to look up on Semantic Scholar",
    )
    p.add_argument(
        "--limit",
        type=int,
        default=DEFAULT_LIMIT,
        help=f"Max hits to return (default {DEFAULT_LIMIT}, max {MAX_LIMIT})",
    )
    p.add_argument(
        "--fields",
        default=DEFAULT_FIELDS,
        help=f"Comma-separated S2 fields to request (default: {DEFAULT_FIELDS})",
    )
    p.add_argument(
        "--raw",
        action="store_true",
        help="Print the full S2 JSON response unmodified instead of normalized output",
    )
    p.add_argument(
        "--check-key",
        action="store_true",
        help="Print whether SEMANTIC_SCHOLAR_API_KEY is set and exit (no network call)",
    )
    args = p.parse_args()
    if args.check_key:
        key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
        if key:
            # Never echo the full key; show only its first and last 4 characters.
            masked = key[:4] + "..." + key[-4:] if len(key) > 8 else "****"
            print(
                f"SEMANTIC_SCHOLAR_API_KEY is set ({masked}). "
                "Authenticated mode: higher rate limits."
            )
        else:
            print(
                "SEMANTIC_SCHOLAR_API_KEY is NOT set. "
                "Unauthenticated mode: ~100 req/5 min, keep to ≤1 QPS.\n"
                "To enable higher rate limits:\n"
                "  1. Get a free key at https://api.semanticscholar.org/\n"
                '  2. export SEMANTIC_SCHOLAR_API_KEY="your-key-here"'
            )
        return 0
    if not args.query:
        # p.error prints usage and exits with status 2 (usage error).
        p.error("--query is required unless --check-key is given")
    limit = max(1, min(MAX_LIMIT, args.limit))
    response = search(args.query, limit, args.fields)
    data = response.get("data") or []
    if args.raw:
        json.dump(response, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write("\n")
        # Same exit-code contract as normalized output: 1 means zero results.
        return 0 if data else 1
    if not data:
        print(
            f"WARN: Semantic Scholar returned 0 results for query: {args.query!r}",
            file=sys.stderr,
        )
        json.dump({"total": 0, "data": []}, sys.stdout, indent=2)
        sys.stdout.write("\n")
        return 1
    # Emit normalized output (subset of fields used by pipeline)
    out = {
        "total": response.get("total", len(data)),
        "authenticated": bool(os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()),
        "data": data,
    }
    json.dump(out, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())

.scider/skills/literature-review-agent/scripts/sync_keys.py ADDED Viewed

	@@ -0,0 +1,119 @@

+#!/usr/bin/env python3
+r"""
+sync_keys.py — Synchronize citation keys in a .tex file with the canonical
+bibtex_key values stored in citation_pool.json.
+Problem: The Literature Review Agent writes cite keys in its own format
+(e.g. 'lewis2020rag'), while bibtex_format.py generates canonical keys from
+author + year + first-significant-title-word (e.g. 'lewis2020retrievalaugmented').
+After running bibtex_format.py these two sources are out of sync, causing the
+citation_coverage gate to fail (it looks for \cite{canonical_key} in the .tex).
+This script reads the 'key' -> 'bibtex_key' mapping from citation_pool.json
+and performs a targeted substitution inside \cite{}, \citep{}, \citet{}
+commands in the target .tex file. It handles multi-key citations like
+\cite{a,b,c} correctly.
+Run this immediately after bibtex_format.py, before Step 4 (Section Writing).
+Usage:
+    python sync_keys.py \
+        --pool workspace/citation_pool.json \
+        --tex  workspace/drafts/intro_relwork.tex \
+        --inplace
+    # Without --inplace: prints updated content to stdout (safe preview mode).
+"""
+import argparse
+import json
+import re
+import sys
+# Matches \cite, \citep, \citet, \citealt, \citealp, \citeauthor, \citeyear,
+# starred variants like \cite*, and the optional [prenote][postnote] args.
+CITE_RE = re.compile(
+    r"(\\cite[a-zA-Z*]*)"  # group 1: command name (\cite followed by any letters or *)
+    r"(?:\[[^\]]*\])*"  # optional bracket args (prenote/postnote), matched but not captured
+    r"\{([^}]+)\}"  # group 2: the comma-separated keys inside the required braces
+)
def build_key_map(pool: dict) -> dict[str, str]:
    """Collect the key renames described by the citation pool.

    Every entry of ``pool["papers"]`` that has both a non-empty ``key`` and a
    non-empty ``bibtex_key`` which differ contributes ``key -> bibtex_key``.
    When the same ``key`` appears more than once, the last entry wins.
    """
    candidates = (
        (entry.get("key"), entry.get("bibtex_key")) for entry in pool.get("papers", [])
    )
    return {src: dst for src, dst in candidates if src and dst and src != dst}
def replace_keys(content: str, key_map: dict[str, str]) -> tuple[str, int]:
    r"""Rewrite citation keys inside ``\cite...{}`` commands.

    Args:
        content: Full text of the .tex file.
        key_map: Mapping of old citation key to new citation key.

    Returns:
        ``(updated_content, n_replaced)`` where ``n_replaced`` counts every key
        occurrence that was substituted. Citations containing no mapped key
        are left byte-for-byte untouched, and the whitespace around replaced
        keys is preserved.
    """
    if not key_map:
        return content, 0
    n_replaced = 0

    def replacer(m: re.Match) -> str:
        nonlocal n_replaced
        pieces = m.group(2).split(",")
        hits = 0
        for idx, raw in enumerate(pieces):
            key = raw.strip()
            if key and key in key_map:
                # Keep the original padding around the key (e.g. "a, b").
                lead = raw[: len(raw) - len(raw.lstrip())]
                trail = raw[len(raw.rstrip()) :]
                pieces[idx] = f"{lead}{key_map[key]}{trail}"
                hits += 1
        if not hits:
            return m.group(0)
        n_replaced += hits
        # Splice by the span of the key group rather than searching for the
        # first "{": optional args such as [p.~{12}] may contain braces.
        whole = m.group(0)
        offset = m.start()
        return whole[: m.start(2) - offset] + ",".join(pieces) + whole[m.end(2) - offset :]

    updated = CITE_RE.sub(replacer, content)
    return updated, n_replaced
def main() -> int:
    """Command-line entry point: sync cite keys in a .tex file with the pool.

    With ``--inplace`` the .tex file is overwritten and progress is printed to
    stdout. Without it (preview mode) stdout carries only the resulting .tex
    content, so it can be redirected safely; all diagnostics go to stderr.

    Returns:
        0 on success.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage example in the module docstring readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--tex", required=True, help="Target .tex file to update")
    p.add_argument(
        "--inplace", action="store_true", help="Overwrite --tex in place (default: print to stdout)"
    )
    args = p.parse_args()
    # In preview mode stdout is the payload channel, so messages use stderr.
    log = sys.stdout if args.inplace else sys.stderr
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    key_map = build_key_map(pool)
    if not key_map:
        print("OK: no key differences in citation_pool.json — nothing to sync", file=log)
        if not args.inplace:
            # Preview still emits the (unchanged) content so that redirecting
            # stdout always yields a usable .tex file.
            with open(args.tex, encoding="utf-8", newline="") as f:
                sys.stdout.write(f.read())
        return 0
    print(f"Key map ({len(key_map)} substitutions):", file=log)
    for old, new in key_map.items():
        print(f"  {old} → {new}", file=log)
    # newline="" keeps the file's existing line endings on read and write.
    with open(args.tex, encoding="utf-8", newline="") as f:
        content = f.read()
    updated, n = replace_keys(content, key_map)
    if args.inplace:
        with open(args.tex, "w", encoding="utf-8", newline="") as f:
            f.write(updated)
        print(f"OK: {n} citation key(s) updated in {args.tex}")
    else:
        sys.stdout.write(updated)
        print(f"\n# sync_keys: {n} substitution(s) would be made", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())

.scider/skills/literature-review-agent/scripts/validate_pool.py ADDED Viewed

	@@ -0,0 +1,145 @@

+#!/usr/bin/env python3
+"""
+validate_pool.py — Validate and auto-fix citation_pool.json before it is
+passed to bibtex_format.py or the Section Writing Agent.
+Catches the two most common schema errors produced by the Literature Review
+Agent and fixes them in place with --fix.
+Error 1 — Authors as plain strings (WRONG format for bibtex_format.py):
+    WRONG:   "authors": ["Alice Smith", "Bob Jones"]
+    CORRECT: "authors": [{"name": "Alice Smith"}, {"name": "Bob Jones"}]
+Error 2 — Missing required fields (title, year). These cause bibtex_format.py
+    to emit incomplete entries. Reported as errors, not auto-fixed.
+Also checks that the pool has the top-level keys that downstream scripts
+expect: "papers", "min_cite_paper_count".
+Exit codes:
+    0  Pool is valid (or was fully fixed with --fix)
+    1  Unrecoverable errors remain (missing required fields, no papers)
+Usage:
+    python validate_pool.py --pool workspace/citation_pool.json
+    python validate_pool.py --pool workspace/citation_pool.json --fix
+"""
+import argparse
+import json
+import sys
+REQUIRED_PAPER_FIELDS = ["title", "year"]  # a paper missing any of these is reported as an error
+RECOMMENDED_PAPER_FIELDS = ["paperId", "abstract", "venue", "authors"]  # missing ones only produce warnings
+REQUIRED_TOP_FIELDS = ["papers", "min_cite_paper_count"]  # top-level pool keys; a missing one is a warning
def validate_and_fix(pool: dict, fix: bool) -> tuple[list[str], list[str], int]:
    """Validate the citation pool and optionally repair author lists.

    Args:
        pool: Parsed citation_pool.json content.
        fix: When true, plain-string authors are converted in place to
            ``{"name": ...}`` objects instead of being reported as errors.

    Returns:
        ``(errors, warnings, n_fixed)`` where ``n_fixed`` is the number of
        papers whose author list was rewritten. With ``fix=True`` every
        remaining error is one that cannot be auto-fixed.
    """
    errors: list[str] = []
    warnings: list[str] = []
    n_fixed = 0
    # Top-level structure
    for field in REQUIRED_TOP_FIELDS:
        if field not in pool:
            warnings.append(f"top-level field '{field}' missing — was dedupe_by_id.py run?")
    papers = pool.get("papers", [])
    if not papers:
        errors.append("pool['papers'] is empty or missing")
        return errors, warnings, n_fixed
    if not isinstance(papers, list):
        errors.append(f"pool['papers'] must be a list, got {type(papers).__name__}")
        return errors, warnings, n_fixed
    for i, paper in enumerate(papers):
        if not isinstance(paper, dict):
            # Report malformed entries instead of crashing on paper.get().
            errors.append(f"[paper #{i}] entry must be an object, got {type(paper).__name__}")
            continue
        label = paper.get("title") or f"paper #{i}"
        # --- Authors format check ---
        authors = paper.get("authors")
        if authors is not None:
            if not isinstance(authors, list):
                errors.append(f"[{label}] 'authors' must be a list, got {type(authors).__name__}")
            else:
                # Inspect every entry, not just the first: lists can mix plain
                # strings with already-correct {"name": ...} objects.
                for j, author in enumerate(authors):
                    if not isinstance(author, (str, dict)):
                        errors.append(
                            f"[{label}] authors[{j}] is {type(author).__name__}, "
                            f"expected dict with 'name' key"
                        )
                plain = [a for a in authors if isinstance(a, str)]
                if plain:
                    if fix:
                        # Convert only the strings; leave other entries as-is.
                        paper["authors"] = [
                            {"name": a} if isinstance(a, str) else a for a in authors
                        ]
                        n_fixed += 1
                    else:
                        errors.append(
                            f"[{label}] authors are plain strings "
                            f'(e.g. "{plain[0]}") — run with --fix to auto-convert'
                        )
        # --- Required fields ---
        for field in REQUIRED_PAPER_FIELDS:
            if not paper.get(field):
                errors.append(f"[{label}] missing required field '{field}'")
        # --- Recommended fields ---
        for field in RECOMMENDED_PAPER_FIELDS:
            if not paper.get(field):
                warnings.append(f"[{label}] missing recommended field '{field}'")
    return errors, warnings, n_fixed
def main() -> int:
    """Command-line entry point: validate (and optionally fix) the pool file.

    Returns:
        0 when the pool is valid, or became valid after ``--fix``;
        1 when any error remains. With ``--fix`` the auto-fixable problems are
        repaired instead of reported, so every remaining error is
        unrecoverable.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the examples in the module docstring readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json path")
    p.add_argument(
        "--fix",
        action="store_true",
        help="Auto-fix recoverable errors (authors format) and write back",
    )
    p.add_argument("--quiet", action="store_true", help="Suppress warnings, only show errors")
    args = p.parse_args()
    # Explicit UTF-8: the pool is written back with ensure_ascii=False.
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    errors, warnings, n_fixed = validate_and_fix(pool, fix=args.fix)
    if not args.quiet:
        for w in warnings:
            print(f"WARN: {w}")
    for e in errors:
        print(f"ERROR: {e}", file=sys.stderr)
    if errors and not args.fix:
        # Only suggest --fix when at least one reported error says it is
        # auto-fixable; missing fields cannot be repaired by it.
        if any("--fix" in e for e in errors):
            print(
                "\nTip: re-run with --fix to auto-correct recoverable issues (authors format).",
                file=sys.stderr,
            )
        return 1
    if n_fixed > 0:
        with open(args.pool, "w", encoding="utf-8") as f:
            json.dump(pool, f, indent=2, ensure_ascii=False)
        print(f"OK: {n_fixed} paper(s) auto-fixed and written back to {args.pool}")
    if errors:
        # Fixes were saved, but unrecoverable errors remain.
        return 1
    n = len(pool.get("papers", []))
    if n_fixed == 0:
        print(f"OK: {n} papers validated — no errors")
    else:
        print(f"OK: {n} papers validated after auto-fix")
    return 0


if __name__ == "__main__":
    sys.exit(main())

.scider/skills/matplotlib/SKILL.md ADDED Viewed

	@@ -0,0 +1,356 @@

+---
+name: matplotlib
+description: Low-level plotting library for full customization. Use when you need fine-grained control over every plot element, novel plot types, or publication-quality PNG/PDF/SVG export. For quick statistical plots use seaborn.
+allowed_agents: [experiment, native_coding]
+---
+# Matplotlib
+## Overview
+Matplotlib is Python's foundational visualization library for creating static, animated, and interactive plots. This skill provides guidance on using matplotlib effectively, covering both the pyplot interface (MATLAB-style) and the object-oriented API (Figure/Axes), along with best practices for creating publication-quality visualizations.
+## When to Use This Skill
+This skill should be used when:
+- Creating any type of plot or chart (line, scatter, bar, histogram, heatmap, contour, etc.)
+- Generating scientific or statistical visualizations
+- Customizing plot appearance (colors, styles, labels, legends)
+- Creating multi-panel figures with subplots
+- Exporting visualizations to various formats (PNG, PDF, SVG, etc.)
+- Building interactive plots or animations
+- Working with 3D visualizations
+- Integrating plots into Jupyter notebooks or GUI applications
+## Core Concepts
+### The Matplotlib Hierarchy
+Matplotlib uses a hierarchical structure of objects:
+1. **Figure** - The top-level container for all plot elements
+2. **Axes** - The actual plotting area where data is displayed (one Figure can contain multiple Axes)
+3. **Artist** - Everything visible on the figure (lines, text, ticks, etc.)
+4. **Axis** - The number line objects (x-axis, y-axis) that handle ticks and labels
+### Two Interfaces
+**1. pyplot Interface (Implicit, MATLAB-style)**
+```python
+import matplotlib.pyplot as plt
+plt.plot([1, 2, 3, 4])
+plt.ylabel('some numbers')
+plt.show()
+```
+- Convenient for quick, simple plots
+- Maintains state automatically
+- Good for interactive work and simple scripts
+**2. Object-Oriented Interface (Explicit)**
+```python
+import matplotlib.pyplot as plt
+fig, ax = plt.subplots()
+ax.plot([1, 2, 3, 4])
+ax.set_ylabel('some numbers')
+plt.show()
+```
+- **Recommended for most use cases**
+- More explicit control over figure and axes
+- Better for complex figures with multiple subplots
+- Easier to maintain and debug
+## Common Workflows
+### 1. Basic Plot Creation
+**Single plot workflow:**
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+# Create figure and axes (OO interface - RECOMMENDED)
+fig, ax = plt.subplots(figsize=(10, 6))
+# Generate and plot data
+x = np.linspace(0, 2*np.pi, 100)
+ax.plot(x, np.sin(x), label='sin(x)')
+ax.plot(x, np.cos(x), label='cos(x)')
+# Customize
+ax.set_xlabel('x')
+ax.set_ylabel('y')
+ax.set_title('Trigonometric Functions')
+ax.legend()
+ax.grid(True, alpha=0.3)
+# Save and/or display
+plt.savefig('plot.png', dpi=300, bbox_inches='tight')
+plt.show()
+```
+### 2. Multiple Subplots
+**Creating subplot layouts:**
+```python
+# Method 1: Regular grid
+fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+axes[0, 0].plot(x, y1)
+axes[0, 1].scatter(x, y2)
+axes[1, 0].bar(categories, values)
+axes[1, 1].hist(data, bins=30)
+# Method 2: Mosaic layout (more flexible)
+fig, axes = plt.subplot_mosaic([['left', 'right_top'],
+                                 ['left', 'right_bottom']],
+                                figsize=(10, 8))
+axes['left'].plot(x, y)
+axes['right_top'].scatter(x, y)
+axes['right_bottom'].hist(data)
+# Method 3: GridSpec (maximum control)
+from matplotlib.gridspec import GridSpec
+fig = plt.figure(figsize=(12, 8))
+gs = GridSpec(3, 3, figure=fig)
+ax1 = fig.add_subplot(gs[0, :])  # Top row, all columns
+ax2 = fig.add_subplot(gs[1:, 0])  # Bottom two rows, first column
+ax3 = fig.add_subplot(gs[1:, 1:])  # Bottom two rows, last two columns
+```
+### 3. Plot Types and Use Cases
+**Line plots** - Time series, continuous data, trends
+```python
+ax.plot(x, y, linewidth=2, linestyle='--', marker='o', color='blue')
+```
+**Scatter plots** - Relationships between variables, correlations
+```python
+ax.scatter(x, y, s=sizes, c=colors, alpha=0.6, cmap='viridis')
+```
+**Bar charts** - Categorical comparisons
+```python
+ax.bar(categories, values, color='steelblue', edgecolor='black')
+# For horizontal bars:
+ax.barh(categories, values)
+```
+**Histograms** - Distributions
+```python
+ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
+```
+**Heatmaps** - Matrix data, correlations
+```python
+im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
+plt.colorbar(im, ax=ax)
+```
+**Contour plots** - 3D data on 2D plane
+```python
+contour = ax.contour(X, Y, Z, levels=10)
+ax.clabel(contour, inline=True, fontsize=8)
+```
+**Box plots** - Statistical distributions
+```python
+ax.boxplot([data1, data2, data3], labels=['A', 'B', 'C'])
+```
+**Violin plots** - Distribution densities
+```python
+ax.violinplot([data1, data2, data3], positions=[1, 2, 3])
+```
+For comprehensive plot type examples and variations, refer to `references/plot_types.md`.
+### 4. Styling and Customization
+**Color specification methods:**
+- Named colors: `'red'`, `'blue'`, `'steelblue'`
+- Hex codes: `'#FF5733'`
+- RGB tuples: `(0.1, 0.2, 0.3)`
+- Colormaps: `cmap='viridis'`, `cmap='plasma'`, `cmap='coolwarm'`
+**Using style sheets:**
+```python
+plt.style.use('seaborn-v0_8-darkgrid')  # Apply predefined style
+# Available styles: 'ggplot', 'bmh', 'fivethirtyeight', etc.
+print(plt.style.available)  # List all available styles
+```
+**Customizing with rcParams:**
+```python
+plt.rcParams['font.size'] = 12
+plt.rcParams['axes.labelsize'] = 14
+plt.rcParams['axes.titlesize'] = 16
+plt.rcParams['xtick.labelsize'] = 10
+plt.rcParams['ytick.labelsize'] = 10
+plt.rcParams['legend.fontsize'] = 12
+plt.rcParams['figure.titlesize'] = 18
+```
+**Text and annotations:**
+```python
+ax.text(x, y, 'annotation', fontsize=12, ha='center')
+ax.annotate('important point', xy=(x, y), xytext=(x+1, y+1),
+            arrowprops=dict(arrowstyle='->', color='red'))
+```
+For detailed styling options and colormap guidelines, see `references/styling_guide.md`.
+### 5. Saving Figures
+**Export to various formats:**
+```python
+# High-resolution PNG for presentations/papers
+plt.savefig('figure.png', dpi=300, bbox_inches='tight', facecolor='white')
+# Vector format for publications (scalable)
+plt.savefig('figure.pdf', bbox_inches='tight')
+plt.savefig('figure.svg', bbox_inches='tight')
+# Transparent background
+plt.savefig('figure.png', dpi=300, bbox_inches='tight', transparent=True)
+```
+**Important parameters:**
+- `dpi`: Resolution (300 for publications, 150 for web, 72 for screen)
+- `bbox_inches='tight'`: Removes excess whitespace
+- `facecolor='white'`: Forces a white figure background in the saved file (useful when the active style uses a dark or transparent background)
+- `transparent=True`: Transparent background
+### 6. Working with 3D Plots
+```python
+from mpl_toolkits.mplot3d import Axes3D
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+# Surface plot
+ax.plot_surface(X, Y, Z, cmap='viridis')
+# 3D scatter
+ax.scatter(x, y, z, c=colors, marker='o')
+# 3D line plot
+ax.plot(x, y, z, linewidth=2)
+# Labels
+ax.set_xlabel('X Label')
+ax.set_ylabel('Y Label')
+ax.set_zlabel('Z Label')
+```
+## Best Practices
+### 1. Interface Selection
+- **Use the object-oriented interface** (fig, ax = plt.subplots()) for production code
+- Reserve pyplot interface for quick interactive exploration only
+- Always create figures explicitly rather than relying on implicit state
+### 2. Figure Size and DPI
+- Set figsize at creation: `fig, ax = plt.subplots(figsize=(10, 6))`
+- Use appropriate DPI for output medium:
+  - Screen/notebook: 72-100 dpi
+  - Web: 150 dpi
+  - Print/publications: 300 dpi
+### 3. Layout Management
+- Use `constrained_layout=True` or `tight_layout()` to prevent overlapping elements
+- `fig, ax = plt.subplots(constrained_layout=True)` is recommended for automatic spacing
+### 4. Colormap Selection
+- **Sequential** (viridis, plasma, inferno): Ordered data with consistent progression
+- **Diverging** (coolwarm, RdBu): Data with meaningful center point (e.g., zero)
+- **Qualitative** (tab10, Set3): Categorical/nominal data
+- Avoid rainbow colormaps (jet) - they are not perceptually uniform
+### 5. Accessibility
+- Use colorblind-friendly colormaps (viridis, cividis)
+- Add patterns/hatching for bar charts in addition to colors
+- Ensure sufficient contrast between elements
+- Include descriptive labels and legends
+### 6. Performance
+- For large datasets, pass `rasterized=True` in plot calls to reduce the file size of vector output (PDF/SVG); the flag has no effect on raster formats such as PNG
+- Use appropriate data reduction before plotting (e.g., downsample dense time series)
+- For animations, use blitting for better performance
+### 7. Code Organization
+```python
+# Good practice: Clear structure
+def create_analysis_plot(data, title):
+    """Create standardized analysis plot."""
+    fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    # Plot data
+    ax.plot(data['x'], data['y'], linewidth=2)
+    # Customize
+    ax.set_xlabel('X Axis Label', fontsize=12)
+    ax.set_ylabel('Y Axis Label', fontsize=12)
+    ax.set_title(title, fontsize=14, fontweight='bold')
+    ax.grid(True, alpha=0.3)
+    return fig, ax
+# Use the function
+fig, ax = create_analysis_plot(my_data, 'My Analysis')
+plt.savefig('analysis.png', dpi=300, bbox_inches='tight')
+```
+## Quick Reference Scripts
+This skill includes helper scripts in the `scripts/` directory:
+### `plot_template.py`
+Template script demonstrating various plot types with best practices. Use this as a starting point for creating new visualizations.
+**Usage:**
+```bash
+python scripts/plot_template.py
+```
+### `style_configurator.py`
+Interactive utility to configure matplotlib style preferences and generate custom style sheets.
+**Usage:**
+```bash
+python scripts/style_configurator.py
+```
+## Detailed References
+For comprehensive information, consult the reference documents:
+- **`references/plot_types.md`** - Complete catalog of plot types with code examples and use cases
+- **`references/styling_guide.md`** - Detailed styling options, colormaps, and customization
+- **`references/api_reference.md`** - Core classes and methods reference
+- **`references/common_issues.md`** - Troubleshooting guide for common problems
+## Integration with Other Tools
+Matplotlib integrates well with:
+- **NumPy/Pandas** - Direct plotting from arrays and DataFrames
+- **Seaborn** - High-level statistical visualizations built on matplotlib
+- **Jupyter** - Interactive plotting with `%matplotlib inline` or `%matplotlib widget`
+- **GUI frameworks** - Embedding in Tkinter, Qt, wxPython applications
+## Common Gotchas
+1. **Overlapping elements**: Use `constrained_layout=True` or `tight_layout()`
+2. **State confusion**: Use OO interface to avoid pyplot state machine issues
+3. **Memory issues with many figures**: Close figures explicitly with `plt.close(fig)`
+4. **Font warnings**: Install the missing font, or point `plt.rcParams['font.sans-serif']` at a font that is installed so matplotlib no longer falls back (and warns)
+5. **DPI confusion**: Remember that figsize is in inches, not pixels: `pixels = dpi * inches`
+## Additional Resources
+- Official documentation: https://matplotlib.org/
+- Gallery: https://matplotlib.org/stable/gallery/index.html
+- Cheatsheets: https://matplotlib.org/cheatsheets/
+- Tutorials: https://matplotlib.org/stable/tutorials/index.html

.scider/skills/matplotlib/references/api_reference.md ADDED Viewed

	@@ -0,0 +1,412 @@

+# Matplotlib API Reference
+This document provides a quick reference for the most commonly used matplotlib classes and methods.
+## Core Classes
+### Figure
+The top-level container for all plot elements.
+**Creation:**
+```python
+fig = plt.figure(figsize=(10, 6), dpi=100, facecolor='white')
+fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
+fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+```
+**Key Methods:**
+- `fig.add_subplot(nrows, ncols, index)` - Add a subplot
+- `fig.add_axes([left, bottom, width, height])` - Add axes at specific position
+- `fig.savefig(filename, dpi=300, bbox_inches='tight')` - Save figure
+- `fig.tight_layout()` - Adjust spacing to prevent overlaps
+- `fig.suptitle(title)` - Set figure title
+- `fig.legend()` - Create figure-level legend
+- `fig.colorbar(mappable)` - Add colorbar to figure
+- `plt.close(fig)` - Close figure to free memory
+**Key Attributes:**
+- `fig.axes` - List of all axes in the figure
+- `fig.dpi` - Resolution in dots per inch
+- `fig.get_size_inches()` - Figure dimensions in inches (width, height); `figsize` is only a constructor argument, not an attribute
+### Axes
+The actual plotting area where data is visualized.
+**Creation:**
+```python
+fig, ax = plt.subplots()  # Single axes
+ax = fig.add_subplot(111)  # Alternative method
+```
+**Plotting Methods:**
+**Line plots:**
+- `ax.plot(x, y, **kwargs)` - Line plot
+- `ax.step(x, y, where='pre'/'mid'/'post')` - Step plot
+- `ax.errorbar(x, y, yerr, xerr)` - Error bars
+**Scatter plots:**
+- `ax.scatter(x, y, s=size, c=color, marker='o', alpha=0.5)` - Scatter plot
+**Bar charts:**
+- `ax.bar(x, height, width=0.8, align='center')` - Vertical bar chart
+- `ax.barh(y, width)` - Horizontal bar chart
+**Statistical plots:**
+- `ax.hist(data, bins=10, density=False)` - Histogram
+- `ax.boxplot(data, labels=None)` - Box plot
+- `ax.violinplot(data)` - Violin plot
+**2D plots:**
+- `ax.imshow(array, cmap='viridis', aspect='auto')` - Display image/matrix
+- `ax.contour(X, Y, Z, levels=10)` - Contour lines
+- `ax.contourf(X, Y, Z, levels=10)` - Filled contours
+- `ax.pcolormesh(X, Y, Z)` - Pseudocolor plot
+**Filling:**
+- `ax.fill_between(x, y1, y2, alpha=0.3)` - Fill between curves
+- `ax.fill_betweenx(y, x1, x2)` - Fill horizontally between two curves given as x values over a shared y
+**Text and annotations:**
+- `ax.text(x, y, text, fontsize=12)` - Add text
+- `ax.annotate(text, xy=(x, y), xytext=(x2, y2), arrowprops={})` - Annotate with arrow
+**Customization Methods:**
+**Labels and titles:**
+- `ax.set_xlabel(label, fontsize=12)` - Set x-axis label
+- `ax.set_ylabel(label, fontsize=12)` - Set y-axis label
+- `ax.set_title(title, fontsize=14)` - Set axes title
+**Limits and scales:**
+- `ax.set_xlim(left, right)` - Set x-axis limits
+- `ax.set_ylim(bottom, top)` - Set y-axis limits
+- `ax.set_xscale('linear'/'log'/'symlog')` - Set x-axis scale
+- `ax.set_yscale('linear'/'log'/'symlog')` - Set y-axis scale
+**Ticks:**
+- `ax.set_xticks(positions)` - Set x-tick positions
+- `ax.set_xticklabels(labels)` - Set x-tick labels
+- `ax.tick_params(axis='both', labelsize=10)` - Customize tick appearance
+**Grid and spines:**
+- `ax.grid(True, alpha=0.3, linestyle='--')` - Add grid
+- `ax.spines['top'].set_visible(False)` - Hide top spine
+- `ax.spines['right'].set_visible(False)` - Hide right spine
+**Legend:**
+- `ax.legend(loc='best', fontsize=10, frameon=True)` - Add legend
+- `ax.legend(handles, labels)` - Custom legend
+**Aspect and layout:**
+- `ax.set_aspect('equal'/'auto'/ratio)` - Set aspect ratio
+- `ax.invert_xaxis()` - Invert x-axis
+- `ax.invert_yaxis()` - Invert y-axis
+### pyplot Module
+High-level interface for quick plotting.
+**Figure creation:**
+- `plt.figure()` - Create new figure
+- `plt.subplots()` - Create figure and axes
+- `plt.subplot()` - Add subplot to current figure
+**Plotting (uses current axes):**
+- `plt.plot()` - Line plot
+- `plt.scatter()` - Scatter plot
+- `plt.bar()` - Bar chart
+- `plt.hist()` - Histogram
+- (Most Axes plotting methods have a `plt.*` equivalent that acts on the current axes)
+**Display and save:**
+- `plt.show()` - Display figure
+- `plt.savefig()` - Save figure
+- `plt.close()` - Close figure
+**Style:**
+- `plt.style.use(style_name)` - Apply style sheet
+- `plt.style.available` - List available styles
+**State management:**
+- `plt.gca()` - Get current axes
+- `plt.gcf()` - Get current figure
+- `plt.sca(ax)` - Set current axes
+- `plt.clf()` - Clear current figure
+- `plt.cla()` - Clear current axes
+## Line and Marker Styles
+### Line Styles
+- `'-'` or `'solid'` - Solid line
+- `'--'` or `'dashed'` - Dashed line
+- `'-.'` or `'dashdot'` - Dash-dot line
+- `':'` or `'dotted'` - Dotted line
+- `''` or `' '` or `'None'` - No line
+### Marker Styles
+- `'.'` - Point marker
+- `'o'` - Circle marker
+- `'v'`, `'^'`, `'<'`, `'>'` - Triangle markers
+- `'s'` - Square marker
+- `'p'` - Pentagon marker
+- `'*'` - Star marker
+- `'h'`, `'H'` - Hexagon markers
+- `'+'` - Plus marker
+- `'x'` - X marker
+- `'D'`, `'d'` - Diamond markers
+### Color Specifications
+**Single character shortcuts:**
+- `'b'` - Blue
+- `'g'` - Green
+- `'r'` - Red
+- `'c'` - Cyan
+- `'m'` - Magenta
+- `'y'` - Yellow
+- `'k'` - Black
+- `'w'` - White
+**Named colors:**
+- `'steelblue'`, `'coral'`, `'teal'`, etc.
+- See full list: https://matplotlib.org/stable/gallery/color/named_colors.html
+**Other formats:**
+- Hex: `'#FF5733'`
+- RGB tuple: `(0.1, 0.2, 0.3)`
+- RGBA tuple: `(0.1, 0.2, 0.3, 0.5)`
+## Common Parameters
+### Plot Function Parameters
+```python
+ax.plot(x, y,
+    color='blue',           # Line color
+    linewidth=2,            # Line width
+    linestyle='--',         # Line style
+    marker='o',             # Marker style
+    markersize=8,           # Marker size
+    markerfacecolor='red',  # Marker fill color
+    markeredgecolor='black',# Marker edge color
+    markeredgewidth=1,      # Marker edge width
+    alpha=0.7,              # Transparency (0-1)
+    label='data',           # Legend label
+    zorder=2,               # Drawing order
+    rasterized=True         # Rasterize for smaller file size
+)
+```
+### Scatter Function Parameters
+```python
+ax.scatter(x, y,
+    s=50,                   # Size (scalar or array)
+    c='blue',               # Color (scalar, array, or sequence)
+    marker='o',             # Marker style
+    cmap='viridis',         # Colormap (if c is numeric)
+    alpha=0.5,              # Transparency
+    edgecolors='black',     # Edge color
+    linewidths=1,           # Edge width
+    vmin=0, vmax=1,         # Color scale limits
+    label='data'            # Legend label
+)
+```
+### Text Parameters
+```python
+ax.text(x, y, text,
+    fontsize=12,            # Font size
+    fontweight='normal',    # 'normal', 'bold', 'heavy', 'light'
+    fontstyle='normal',     # 'normal', 'italic', 'oblique'
+    fontfamily='sans-serif',# Font family
+    color='black',          # Text color
+    alpha=1.0,              # Transparency
+    ha='center',            # Horizontal alignment: 'left', 'center', 'right'
+    va='center',            # Vertical alignment: 'top', 'center', 'bottom', 'baseline'
+    rotation=0,             # Rotation angle in degrees
+    bbox=dict(              # Background box
+        facecolor='white',
+        edgecolor='black',
+        boxstyle='round'
+    )
+)
+```
+## rcParams Configuration
+Common rcParams settings for global customization:
+```python
+# Font settings
+plt.rcParams['font.family'] = 'sans-serif'
+plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
+plt.rcParams['font.size'] = 12
+# Figure settings
+plt.rcParams['figure.figsize'] = (10, 6)
+plt.rcParams['figure.dpi'] = 100
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['savefig.dpi'] = 300
+plt.rcParams['savefig.bbox'] = 'tight'
+# Axes settings
+plt.rcParams['axes.labelsize'] = 14
+plt.rcParams['axes.titlesize'] = 16
+plt.rcParams['axes.grid'] = True
+# Note: grid transparency is set with 'grid.alpha' (see Grid settings); 'axes.grid.alpha' is not a valid rcParams key
+# Line settings
+plt.rcParams['lines.linewidth'] = 2
+plt.rcParams['lines.markersize'] = 8
+# Tick settings
+plt.rcParams['xtick.labelsize'] = 10
+plt.rcParams['ytick.labelsize'] = 10
+plt.rcParams['xtick.direction'] = 'in'  # 'in', 'out', 'inout'
+plt.rcParams['ytick.direction'] = 'in'
+# Legend settings
+plt.rcParams['legend.fontsize'] = 12
+plt.rcParams['legend.frameon'] = True
+plt.rcParams['legend.framealpha'] = 0.8
+# Grid settings
+plt.rcParams['grid.alpha'] = 0.3
+plt.rcParams['grid.linestyle'] = '--'
+```
+## GridSpec for Complex Layouts
+```python
+from matplotlib.gridspec import GridSpec
+fig = plt.figure(figsize=(12, 8))
+gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)
+# Span multiple cells
+ax1 = fig.add_subplot(gs[0, :])      # Top row, all columns
+ax2 = fig.add_subplot(gs[1:, 0])     # Bottom two rows, first column
+ax3 = fig.add_subplot(gs[1, 1:])     # Middle row, last two columns
+ax4 = fig.add_subplot(gs[2, 1])      # Bottom row, middle column
+ax5 = fig.add_subplot(gs[2, 2])      # Bottom row, right column
+```
+## 3D Plotting
+```python
+from mpl_toolkits.mplot3d import Axes3D
+fig = plt.figure()
+ax = fig.add_subplot(111, projection='3d')
+# Plot types
+ax.plot(x, y, z)                    # 3D line
+ax.scatter(x, y, z)                 # 3D scatter
+ax.plot_surface(X, Y, Z)            # 3D surface
+ax.plot_wireframe(X, Y, Z)          # 3D wireframe
+ax.contour(X, Y, Z)                 # 3D contour
+ax.bar3d(x, y, z, dx, dy, dz)       # 3D bar
+# Customization
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_zlabel('Z')
+ax.view_init(elev=30, azim=45)      # Set viewing angle
+```
+## Animation
+```python
+from matplotlib.animation import FuncAnimation
+fig, ax = plt.subplots()
+line, = ax.plot([], [])
+def init():
+    ax.set_xlim(0, 2*np.pi)
+    ax.set_ylim(-1, 1)
+    return line,
+def update(frame):
+    x = np.linspace(0, 2*np.pi, 100)
+    y = np.sin(x + frame/10)
+    line.set_data(x, y)
+    return line,
+anim = FuncAnimation(fig, update, init_func=init,
+                     frames=100, interval=50, blit=True)
+# Save animation
+anim.save('animation.gif', writer='pillow', fps=20)
+anim.save('animation.mp4', writer='ffmpeg', fps=20)
+```
+## Image Operations
+```python
+# Read and display image
+img = plt.imread('image.png')
+ax.imshow(img)
+# Display matrix as image
+im = ax.imshow(matrix, cmap='viridis', aspect='auto',
+          interpolation='nearest', origin='lower')
+# Colorbar
+cbar = plt.colorbar(im, ax=ax)
+cbar.set_label('Values')
+# Image extent (set coordinates)
+ax.imshow(img, extent=[x_min, x_max, y_min, y_max])
+```
+## Event Handling
+```python
+# Mouse click event
+def on_click(event):
+    if event.inaxes:
+        print(f'Clicked at x={event.xdata:.2f}, y={event.ydata:.2f}')
+fig.canvas.mpl_connect('button_press_event', on_click)
+# Key press event
+def on_key(event):
+    print(f'Key pressed: {event.key}')
+fig.canvas.mpl_connect('key_press_event', on_key)
+```
+## Useful Utilities
+```python
+# Get current axis limits
+xlims = ax.get_xlim()
+ylims = ax.get_ylim()
+# Set equal aspect ratio
+ax.set_aspect('equal', adjustable='box')
+# Share axes between subplots
+fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
+# Twin axes (two y-axes)
+ax2 = ax1.twinx()
+# Remove tick labels
+ax.set_xticklabels([])
+ax.set_yticklabels([])
+# Scientific notation
+ax.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
+# Date formatting
+import matplotlib.dates as mdates
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
+ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
+```

.scider/skills/matplotlib/references/common_issues.md ADDED Viewed

	@@ -0,0 +1,563 @@

+# Matplotlib Common Issues and Solutions
+Troubleshooting guide for frequently encountered matplotlib problems.
+## Display and Backend Issues
+### Issue: Plots Not Showing
+**Problem:** `plt.show()` doesn't display anything
+**Solutions:**
+```python
+# 1. Check if backend is properly set (for interactive use)
+import matplotlib
+print(matplotlib.get_backend())
+# 2. Try different backends
+matplotlib.use('TkAgg')  # or 'Qt5Agg', 'MacOSX'
+import matplotlib.pyplot as plt
+# 3. In Jupyter notebooks, use magic command
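+# Static images (keep comments on their own line: the rest of a magic line is passed to the magic as arguments)
+%matplotlib inline
+# or interactive plots (requires the ipympl package)
+%matplotlib widget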
+# 4. Ensure plt.show() is called
+plt.plot([1, 2, 3])
+plt.show()
+```
+### Issue: "RuntimeError: main thread is not in main loop"
+**Problem:** A GUI backend (typically Tk) is being driven from a thread other than the main thread
+**Solution:**
+```python
+# Switch to non-interactive backend
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+# Or turn off interactive mode
+plt.ioff()
+```
+### Issue: Figures Not Updating Interactively
+**Problem:** Changes not reflected in interactive windows
+**Solution:**
+```python
+# Enable interactive mode
+plt.ion()
+# Draw after each change
+plt.plot(x, y)
+plt.draw()
+plt.pause(0.001)  # Brief pause to update display
+```
+## Layout and Spacing Issues
+### Issue: Overlapping Labels and Titles
+**Problem:** Labels, titles, or tick labels overlap or get cut off
+**Solutions:**
+```python
+# Solution 1: Constrained layout (RECOMMENDED)
+fig, ax = plt.subplots(constrained_layout=True)
+# Solution 2: Tight layout
+fig, ax = plt.subplots()
+plt.tight_layout()
+# Solution 3: Adjust margins manually
+plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.15)
+# Solution 4: Save with bbox_inches='tight'
+plt.savefig('figure.png', bbox_inches='tight')
+# Solution 5: Rotate long tick labels
+ax.set_xticklabels(labels, rotation=45, ha='right')
+```
+### Issue: Colorbar Affects Subplot Size
+**Problem:** Adding colorbar shrinks the plot
+**Solution:**
+```python
+# Solution 1: Use constrained layout
+fig, ax = plt.subplots(constrained_layout=True)
+im = ax.imshow(data)
+plt.colorbar(im, ax=ax)
+# Solution 2: Manually specify colorbar dimensions
+from mpl_toolkits.axes_grid1 import make_axes_locatable
+divider = make_axes_locatable(ax)
+cax = divider.append_axes("right", size="5%", pad=0.05)
+plt.colorbar(im, cax=cax)
+# Solution 3: For multiple subplots, share colorbar
+fig, axes = plt.subplots(1, 3, figsize=(15, 4))
+for ax in axes:
+    im = ax.imshow(data)
+fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.95)
+```
+### Issue: Subplots Too Close Together
+**Problem:** Multiple subplots overlapping
+**Solution:**
+```python
+# Solution 1: Use constrained_layout
+fig, axes = plt.subplots(2, 2, constrained_layout=True)
+# Solution 2: Adjust spacing with subplots_adjust
+fig, axes = plt.subplots(2, 2)
+plt.subplots_adjust(hspace=0.4, wspace=0.4)
+# Solution 3: Specify spacing in tight_layout
+plt.tight_layout(h_pad=2.0, w_pad=2.0)
+```
+## Memory and Performance Issues
+### Issue: Memory Leak with Multiple Figures
+**Problem:** Memory usage grows when creating many figures
+**Solution:**
+```python
+# Close figures explicitly
+fig, ax = plt.subplots()
+ax.plot(x, y)
+plt.savefig('plot.png')
+plt.close(fig)  # or plt.close('all')
+# Clear current figure without closing
+plt.clf()
+# Clear current axes
+plt.cla()
+```
+### Issue: Large File Sizes
+**Problem:** Saved figures are too large
+**Solutions:**
+```python
+# Solution 1: Reduce DPI
+plt.savefig('figure.png', dpi=150)  # Instead of 300
+# Solution 2: Use rasterization for complex plots
+ax.plot(x, y, rasterized=True)
+# Solution 3: Use vector format for simple plots
+plt.savefig('figure.pdf')  # or .svg
+# Solution 4: Compress PNG
+plt.savefig('figure.png', dpi=300, pil_kwargs={'optimize': True})
+```
+### Issue: Slow Plotting with Large Datasets
+**Problem:** Plotting takes too long with many points
+**Solutions:**
+```python
+# Solution 1: Downsample data
+from scipy.signal import decimate
+y_downsampled = decimate(y, 10)  # Low-pass filters, then keeps every 10th sample (thin x to match, e.g. x[::10])
+# Solution 2: Use rasterization
+ax.plot(x, y, rasterized=True)
+# Solution 3: Rasterize lines that have already been plotted
+ax.plot(x, y)
+for line in ax.get_lines():
+    line.set_rasterized(True)
+# Solution 4: For scatter plots, consider hexbin or 2d histogram
+ax.hexbin(x, y, gridsize=50, cmap='viridis')
+```
+## Font and Text Issues
+### Issue: Font Warnings
+**Problem:** "findfont: Font family [...] not found"
+**Solutions:**
+```python
+# Solution 1: Use available fonts
+from matplotlib.font_manager import findfont, FontProperties
+print(findfont(FontProperties(family='sans-serif')))
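+# Solution 2: Rebuild font cache (private API, absent in recent Matplotlib releases; there, delete the cache files under matplotlib.get_cachedir() instead)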
+import matplotlib.font_manager
+matplotlib.font_manager._rebuild()
+# Solution 3: Suppress warnings
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+# Solution 4: Specify fallback fonts
+plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans', 'sans-serif']
+```
+### Issue: LaTeX Rendering Errors
+**Problem:** Math text not rendering correctly
+**Solutions:**
+```python
+# Solution 1: Use raw strings with r prefix
+ax.set_xlabel(r'$\alpha$')  # Not '\alpha'
+# Solution 2: Escape backslashes in regular strings
+ax.set_xlabel('$\\alpha$')
+# Solution 3: Disable LaTeX if not installed
+plt.rcParams['text.usetex'] = False
+# Solution 4: Use mathtext instead of full LaTeX
+# Mathtext is always available, no LaTeX installation needed
+ax.text(x, y, r'$\int_0^\infty e^{-x} dx$')
+```
+### Issue: Text Cut Off or Outside Figure
+**Problem:** Labels or annotations appear outside figure bounds
+**Solutions:**
+```python
+# Solution 1: Use bbox_inches='tight'
+plt.savefig('figure.png', bbox_inches='tight')
+# Solution 2: Adjust figure bounds
+plt.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)
+# Solution 3: Clip text to axes
+ax.text(x, y, 'text', clip_on=True)
+# Solution 4: Use constrained_layout
+fig, ax = plt.subplots(constrained_layout=True)
+```
+## Color and Colormap Issues
+### Issue: Colorbar Not Matching Plot
+**Problem:** Colorbar shows different range than data
+**Solution:**
+```python
+# Explicitly set vmin and vmax
+im = ax.imshow(data, vmin=0, vmax=1, cmap='viridis')
+plt.colorbar(im, ax=ax)
+# Or use the same norm for multiple plots
+import matplotlib.colors as mcolors
+norm = mcolors.Normalize(vmin=data.min(), vmax=data.max())
+im1 = ax1.imshow(data1, norm=norm, cmap='viridis')
+im2 = ax2.imshow(data2, norm=norm, cmap='viridis')
+```
+### Issue: Colors Look Wrong
+**Problem:** Unexpected colors in plots
+**Solutions:**
+```python
+# Solution 1: Check color specification format
+ax.plot(x, y, color='blue')  # Correct
+ax.plot(x, y, color=(0, 0, 1))  # Correct RGB
+ax.plot(x, y, color='#0000FF')  # Correct hex
+# Solution 2: Verify colormap exists
+print(plt.colormaps())  # List available colormaps
+# Solution 3: For scatter plots, ensure c shape matches
+ax.scatter(x, y, c=colors)  # colors should have same length as x, y
+# Solution 4: Check if alpha is set correctly
+ax.plot(x, y, alpha=1.0)  # 0=transparent, 1=opaque
+```
+### Issue: Reversed Colormap
+**Problem:** Colormap direction is backwards
+**Solution:**
+```python
+# Add _r suffix to reverse any colormap
+ax.imshow(data, cmap='viridis_r')
+```
+## Axis and Scale Issues
+### Issue: Axis Limits Not Working
+**Problem:** `set_xlim` or `set_ylim` not taking effect
+**Solutions:**
+```python
+# Solution 1: Set after plotting
+ax.plot(x, y)
+ax.set_xlim(0, 10)
+ax.set_ylim(-1, 1)
+# Solution 2: Disable autoscaling
+ax.autoscale(False)
+ax.set_xlim(0, 10)
+# Solution 3: Use axis method
+ax.axis([xmin, xmax, ymin, ymax])
+```
+### Issue: Log Scale with Zero or Negative Values
+**Problem:** ValueError when using log scale with data ≤ 0
+**Solutions:**
+```python
+# Solution 1: Filter out non-positive values
+mask = (data > 0)
+ax.plot(x[mask], data[mask])
+ax.set_yscale('log')
+# Solution 2: Use symlog for data with positive and negative values
+ax.set_yscale('symlog')
+# Solution 3: Add small offset
+ax.plot(x, data + 1e-10)
+ax.set_yscale('log')
+```
+### Issue: Dates Not Displaying Correctly
+**Problem:** Date axis shows numbers instead of dates
+**Solution:**
+```python
+import matplotlib.dates as mdates
+import pandas as pd
+# Convert to datetime if needed
+dates = pd.to_datetime(date_strings)
+ax.plot(dates, values)
+# Format date axis
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
+ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
+plt.xticks(rotation=45)
+```
+## Legend Issues
+### Issue: Legend Covers Data
+**Problem:** Legend obscures important parts of plot
+**Solutions:**
+```python
+# Solution 1: Use 'best' location
+ax.legend(loc='best')
+# Solution 2: Place outside plot area
+ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+# Solution 3: Make legend semi-transparent
+ax.legend(framealpha=0.7)
+# Solution 4: Put legend below plot
+ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)
+```
+### Issue: Too Many Items in Legend
+**Problem:** Legend is cluttered with many entries
+**Solutions:**
+```python
+# Solution 1: Only label selected items
+for i, (x, y) in enumerate(data):
+    label = f'Data {i}' if i % 5 == 0 else None
+    ax.plot(x, y, label=label)
+# Solution 2: Use multiple columns
+ax.legend(ncol=3)
+# Solution 3: Create custom legend with fewer entries
+from matplotlib.lines import Line2D
+custom_lines = [Line2D([0], [0], color='r'),
+                Line2D([0], [0], color='b')]
+ax.legend(custom_lines, ['Category A', 'Category B'])
+# Solution 4: Use separate legend figure
+fig_leg = plt.figure(figsize=(3, 2))
+ax_leg = fig_leg.add_subplot(111)
+ax_leg.legend(*ax.get_legend_handles_labels(), loc='center')
+ax_leg.axis('off')
+```
+## 3D Plot Issues
+### Issue: 3D Plots Look Flat
+**Problem:** Difficult to perceive depth in 3D plots
+**Solutions:**
+```python
+# Solution 1: Adjust viewing angle
+ax.view_init(elev=30, azim=45)
+# Solution 2: Add gridlines
+ax.grid(True)
+# Solution 3: Use color for depth
+scatter = ax.scatter(x, y, z, c=z, cmap='viridis')
+# Solution 4: Rotate interactively (if using interactive backend)
+# User can click and drag to rotate
+```
+### Issue: 3D Axis Labels Cut Off
+**Problem:** 3D axis labels appear outside figure
+**Solution:**
+```python
+from mpl_toolkits.mplot3d import Axes3D
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+ax.plot_surface(X, Y, Z)
+# Add padding
+fig.tight_layout(pad=3.0)
+# Or save with tight bounding box
+plt.savefig('3d_plot.png', bbox_inches='tight', pad_inches=0.5)
+```
+## Image and Colorbar Issues
+### Issue: Images Appear Flipped
+**Problem:** Image orientation is wrong
+**Solution:**
+```python
+# Set origin parameter
+ax.imshow(img, origin='lower')  # or 'upper' (default)
+# Or flip array
+ax.imshow(np.flipud(img))
+```
+### Issue: Images Look Pixelated
+**Problem:** Image appears blocky when zoomed
+**Solutions:**
+```python
+# Solution 1: Use interpolation
+ax.imshow(img, interpolation='bilinear')
+# Options: 'nearest', 'bilinear', 'bicubic', 'spline16', 'spline36', etc.
+# Solution 2: Increase DPI when saving
+plt.savefig('figure.png', dpi=300)
+# Solution 3: Use vector format if appropriate
+plt.savefig('figure.pdf')
+```
+## Common Errors and Fixes
+### "TypeError: 'AxesSubplot' object is not subscriptable"
+**Problem:** Trying to index single axes
+```python
+# Wrong
+fig, ax = plt.subplots()
+ax[0].plot(x, y)  # Error!
+# Correct
+fig, ax = plt.subplots()
+ax.plot(x, y)
+```
+### "ValueError: x and y must have same first dimension"
+**Problem:** Data arrays have mismatched lengths
+```python
+# Check shapes
+print(f"x shape: {x.shape}, y shape: {y.shape}")
+# Ensure they match
+assert len(x) == len(y), "x and y must have same length"
+```
+### "AttributeError: 'numpy.ndarray' object has no attribute 'plot'"
+**Problem:** Calling plot on array instead of axes
+```python
+# Wrong
+data.plot(x, y)
+# Correct
+ax.plot(x, y)
+# or for pandas
+data.plot(ax=ax)
+```
+## Best Practices to Avoid Issues
+1. **Always use the OO interface** - Avoid pyplot state machine
+   ```python
+   fig, ax = plt.subplots()  # Good
+   ax.plot(x, y)
+   ```
+2. **Use constrained_layout** - Prevents overlap issues
+   ```python
+   fig, ax = plt.subplots(constrained_layout=True)
+   ```
+3. **Close figures explicitly** - Prevents memory leaks
+   ```python
+   plt.close(fig)
+   ```
+4. **Set figure size at creation** - Better than resizing later
+   ```python
+   fig, ax = plt.subplots(figsize=(10, 6))
+   ```
+5. **Use raw strings for math text** - Avoids escape issues
+   ```python
+   ax.set_xlabel(r'$\alpha$')
+   ```
+6. **Check data shapes before plotting** - Catch size mismatches early
+   ```python
+   assert len(x) == len(y)
+   ```
+7. **Use appropriate DPI** - 300 for print, 150 for web
+   ```python
+   plt.savefig('figure.png', dpi=300)
+   ```
+8. **Test with different backends** - If display issues occur
+   ```python
+   import matplotlib
+   matplotlib.use('TkAgg')
+   ```

.scider/skills/matplotlib/references/plot_types.md ADDED Viewed

	@@ -0,0 +1,476 @@

+# Matplotlib Plot Types Guide
+Comprehensive guide to different plot types in matplotlib with examples and use cases.
+## 1. Line Plots
+**Use cases:** Time series, continuous data, trends, function visualization
+### Basic Line Plot
+```python
+fig, ax = plt.subplots(figsize=(10, 6))
+ax.plot(x, y, linewidth=2, label='Data')
+ax.set_xlabel('X axis')
+ax.set_ylabel('Y axis')
+ax.legend()
+```
+### Multiple Lines
+```python
+ax.plot(x, y1, label='Dataset 1', linewidth=2)
+ax.plot(x, y2, label='Dataset 2', linewidth=2, linestyle='--')
+ax.plot(x, y3, label='Dataset 3', linewidth=2, linestyle=':')
+ax.legend()
+```
+### Line with Markers
+```python
+ax.plot(x, y, marker='o', markersize=8, linestyle='-',
+        linewidth=2, markerfacecolor='red', markeredgecolor='black')
+```
+### Step Plot
+```python
+ax.step(x, y, where='mid', linewidth=2, label='Step function')
+# where options: 'pre', 'post', 'mid'
+```
+### Error Bars
+```python
+ax.errorbar(x, y, yerr=error, fmt='o-', linewidth=2,
+            capsize=5, capthick=2, label='With uncertainty')
+```
+## 2. Scatter Plots
+**Use cases:** Correlations, relationships between variables, clusters, outliers
+### Basic Scatter
+```python
+ax.scatter(x, y, s=50, alpha=0.6)
+```
+### Sized and Colored Scatter
+```python
+scatter = ax.scatter(x, y, s=sizes*100, c=colors,
+                     cmap='viridis', alpha=0.6, edgecolors='black')
+plt.colorbar(scatter, ax=ax, label='Color variable')
+```
+### Categorical Scatter
+```python
+for category in categories:
+    mask = data['category'] == category
+    ax.scatter(data[mask]['x'], data[mask]['y'],
+               label=category, s=50, alpha=0.7)
+ax.legend()
+```
+## 3. Bar Charts
+**Use cases:** Categorical comparisons, discrete data, counts
+### Vertical Bar Chart
+```python
+ax.bar(categories, values, color='steelblue',
+       edgecolor='black', linewidth=1.5)
+ax.set_ylabel('Values')
+```
+### Horizontal Bar Chart
+```python
+ax.barh(categories, values, color='coral',
+        edgecolor='black', linewidth=1.5)
+ax.set_xlabel('Values')
+```
+### Grouped Bar Chart
+```python
+x = np.arange(len(categories))
+width = 0.35
+ax.bar(x - width/2, values1, width, label='Group 1')
+ax.bar(x + width/2, values2, width, label='Group 2')
+ax.set_xticks(x)
+ax.set_xticklabels(categories)
+ax.legend()
+```
+### Stacked Bar Chart
+```python
+ax.bar(categories, values1, label='Part 1')
+ax.bar(categories, values2, bottom=values1, label='Part 2')
+ax.bar(categories, values3, bottom=values1+values2, label='Part 3')
+ax.legend()
+```
+### Bar Chart with Error Bars
+```python
+ax.bar(categories, values, yerr=errors, capsize=5,
+       color='steelblue', edgecolor='black')
+```
+### Bar Chart with Patterns
+```python
+bars1 = ax.bar(x - width/2, values1, width, label='Group 1',
+               color='white', edgecolor='black', hatch='//')
+bars2 = ax.bar(x + width/2, values2, width, label='Group 2',
+               color='white', edgecolor='black', hatch='\\\\')
+```
+## 4. Histograms
+**Use cases:** Distributions, frequency analysis
+### Basic Histogram
+```python
+ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
+ax.set_xlabel('Value')
+ax.set_ylabel('Frequency')
+```
+### Multiple Overlapping Histograms
+```python
+ax.hist(data1, bins=30, alpha=0.5, label='Dataset 1')
+ax.hist(data2, bins=30, alpha=0.5, label='Dataset 2')
+ax.legend()
+```
+### Normalized Histogram (Density)
+```python
+ax.hist(data, bins=30, density=True, alpha=0.7,
+        edgecolor='black', label='Empirical')
+# Overlay theoretical distribution
+from scipy.stats import norm
+x = np.linspace(data.min(), data.max(), 100)
+ax.plot(x, norm.pdf(x, data.mean(), data.std()),
+        'r-', linewidth=2, label='Normal fit')
+ax.legend()
+```
+### 2D Histogram (Hexbin)
+```python
+hexbin = ax.hexbin(x, y, gridsize=30, cmap='Blues')
+plt.colorbar(hexbin, ax=ax, label='Counts')
+```
+### 2D Histogram (hist2d)
+```python
+h = ax.hist2d(x, y, bins=30, cmap='Blues')
+plt.colorbar(h[3], ax=ax, label='Counts')
+```
+## 5. Box and Violin Plots
+**Use cases:** Statistical distributions, outlier detection, comparing distributions
+### Box Plot
+```python
+ax.boxplot([data1, data2, data3],
+           labels=['Group A', 'Group B', 'Group C'],
+           showmeans=True, meanline=True)
+ax.set_ylabel('Values')
+```
+### Horizontal Box Plot
+```python
+ax.boxplot([data1, data2, data3], vert=False,
+           labels=['Group A', 'Group B', 'Group C'])
+ax.set_xlabel('Values')
+```
+### Violin Plot
+```python
+parts = ax.violinplot([data1, data2, data3],
+                      positions=[1, 2, 3],
+                      showmeans=True, showmedians=True)
+ax.set_xticks([1, 2, 3])
+ax.set_xticklabels(['Group A', 'Group B', 'Group C'])
+```
+## 6. Heatmaps
+**Use cases:** Matrix data, correlations, intensity maps
+### Basic Heatmap
+```python
+im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
+plt.colorbar(im, ax=ax, label='Values')
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+```
+### Heatmap with Annotations
+```python
+im = ax.imshow(matrix, cmap='coolwarm')
+plt.colorbar(im, ax=ax)
+# Add text annotations
+for i in range(matrix.shape[0]):
+    for j in range(matrix.shape[1]):
+        text = ax.text(j, i, f'{matrix[i, j]:.2f}',
+                       ha='center', va='center', color='black')
+```
+### Correlation Matrix
+```python
+corr = data.corr()
+im = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
+plt.colorbar(im, ax=ax, label='Correlation')
+# Set tick labels
+ax.set_xticks(range(len(corr)))
+ax.set_yticks(range(len(corr)))
+ax.set_xticklabels(corr.columns, rotation=45, ha='right')
+ax.set_yticklabels(corr.columns)
+```
+## 7. Contour Plots
+**Use cases:** 3D data on 2D plane, topography, function visualization
+### Contour Lines
+```python
+contour = ax.contour(X, Y, Z, levels=10, cmap='viridis')
+ax.clabel(contour, inline=True, fontsize=8)
+plt.colorbar(contour, ax=ax)
+```
+### Filled Contours
+```python
+contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
+plt.colorbar(contourf, ax=ax)
+```
+### Combined Contours
+```python
+contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis', alpha=0.8)
+contour = ax.contour(X, Y, Z, levels=10, colors='black',
+                     linewidths=0.5, alpha=0.4)
+ax.clabel(contour, inline=True, fontsize=8)
+plt.colorbar(contourf, ax=ax)
+```
+## 8. Pie Charts
+**Use cases:** Proportions, percentages (use sparingly)
+### Basic Pie Chart
+```python
+ax.pie(sizes, labels=labels, autopct='%1.1f%%',
+       startangle=90, colors=colors)
+ax.axis('equal')  # Equal aspect ratio ensures circular pie
+```
+### Exploded Pie Chart
+```python
+explode = (0.1, 0, 0, 0)  # Explode first slice
+ax.pie(sizes, explode=explode, labels=labels,
+       autopct='%1.1f%%', shadow=True, startangle=90)
+ax.axis('equal')
+```
+### Donut Chart
+```python
+ax.pie(sizes, labels=labels, autopct='%1.1f%%',
+       wedgeprops=dict(width=0.5), startangle=90)
+ax.axis('equal')
+```
+## 9. Polar Plots
+**Use cases:** Cyclic data, directional data, radar charts
+### Basic Polar Plot
+```python
+theta = np.linspace(0, 2*np.pi, 100)
+r = np.abs(np.sin(2*theta))
+ax = plt.subplot(111, projection='polar')
+ax.plot(theta, r, linewidth=2)
+```
+### Radar Chart
+```python
+categories = ['A', 'B', 'C', 'D', 'E']
+values = [4, 3, 5, 2, 4]
+# Add first value to the end to close the polygon
+angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False)
+values_closed = np.concatenate((values, [values[0]]))
+angles_closed = np.concatenate((angles, [angles[0]]))
+ax = plt.subplot(111, projection='polar')
+ax.plot(angles_closed, values_closed, 'o-', linewidth=2)
+ax.fill(angles_closed, values_closed, alpha=0.25)
+ax.set_xticks(angles)
+ax.set_xticklabels(categories)
+```
+## 10. Stream and Quiver Plots
+**Use cases:** Vector fields, flow visualization
+### Quiver Plot (Vector Field)
+```python
+ax.quiver(X, Y, U, V, alpha=0.8)
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_aspect('equal')
+```
+### Stream Plot
+```python
+ax.streamplot(X, Y, U, V, density=1.5, color='k', linewidth=1)
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_aspect('equal')
+```
+## 11. Fill Between
+**Use cases:** Uncertainty bounds, confidence intervals, areas under curves
+### Fill Between Two Curves
+```python
+ax.plot(x, y, 'k-', linewidth=2, label='Mean')
+ax.fill_between(x, y - std, y + std, alpha=0.3,
+                label='±1 std dev')
+ax.legend()
+```
+### Fill Between with Condition
+```python
+ax.plot(x, y1, label='Line 1')
+ax.plot(x, y2, label='Line 2')
+ax.fill_between(x, y1, y2, where=(y2 >= y1),
+                alpha=0.3, label='y2 > y1', interpolate=True)
+ax.legend()
+```
+## 12. 3D Plots
+**Use cases:** Three-dimensional data visualization
+### 3D Scatter
+```python
+from mpl_toolkits.mplot3d import Axes3D
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+scatter = ax.scatter(x, y, z, c=colors, cmap='viridis',
+                     marker='o', s=50)
+plt.colorbar(scatter, ax=ax)
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_zlabel('Z')
+```
+### 3D Surface Plot
+```python
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+surf = ax.plot_surface(X, Y, Z, cmap='viridis',
+                       edgecolor='none', alpha=0.9)
+plt.colorbar(surf, ax=ax)
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_zlabel('Z')
+```
+### 3D Wireframe
+```python
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+ax.plot_wireframe(X, Y, Z, color='black', linewidth=0.5)
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_zlabel('Z')
+```
+### 3D Contour
+```python
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, projection='3d')
+ax.contour(X, Y, Z, levels=15, cmap='viridis')
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+ax.set_zlabel('Z')
+```
+## 13. Specialized Plots
+### Stem Plot
+```python
+ax.stem(x, y, linefmt='C0-', markerfmt='C0o', basefmt='k-')
+ax.set_xlabel('X')
+ax.set_ylabel('Y')
+```
+### Filled Polygon
+```python
+vertices = [(0, 0), (1, 0), (1, 1), (0, 1)]
+from matplotlib.patches import Polygon
+polygon = Polygon(vertices, closed=True, edgecolor='black',
+                  facecolor='lightblue', alpha=0.5)
+ax.add_patch(polygon)
+ax.set_xlim(-0.5, 1.5)
+ax.set_ylim(-0.5, 1.5)
+```
+### Staircase Plot
+```python
+ax.stairs(values, edges, fill=True, alpha=0.5)
+```
+### Broken Barh (Gantt-style)
+```python
+ax.broken_barh([(10, 50), (100, 20), (130, 10)], (10, 9),
+               facecolors='tab:blue')
+ax.broken_barh([(10, 20), (50, 50), (120, 30)], (20, 9),
+               facecolors='tab:orange')
+ax.set_ylim(5, 35)
+ax.set_xlim(0, 200)
+ax.set_xlabel('Time')
+ax.set_yticks([15, 25])
+ax.set_yticklabels(['Task 1', 'Task 2'])
+```
+## 14. Time Series Plots
+### Basic Time Series
+```python
+import pandas as pd
+import matplotlib.dates as mdates
+ax.plot(dates, values, linewidth=2)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
+ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
+plt.xticks(rotation=45)
+ax.set_xlabel('Date')
+ax.set_ylabel('Value')
+```
+### Time Series with Shaded Regions
+```python
+ax.plot(dates, values, linewidth=2)
+# Shade weekends or specific periods
+ax.axvspan(start_date, end_date, alpha=0.2, color='gray')
+```
+## Plot Selection Guide
+| Data Type | Recommended Plot | Alternative Options |
+|-----------|-----------------|---------------------|
+| Single continuous variable | Histogram, KDE | Box plot, Violin plot |
+| Two continuous variables | Scatter plot | Hexbin, 2D histogram |
+| Time series | Line plot | Area plot, Step plot |
+| Categorical vs continuous | Bar chart, Box plot | Violin plot, Strip plot |
+| Two categorical variables | Heatmap | Grouped bar chart |
+| Three continuous variables | 3D scatter, Contour | Color-coded scatter |
+| Proportions | Bar chart | Pie chart (use sparingly) |
+| Distributions comparison | Box plot, Violin plot | Overlaid histograms |
+| Correlation matrix | Heatmap | Clustered heatmap |
+| Vector field | Quiver plot, Stream plot | - |
+| Function visualization | Line plot, Contour | 3D surface |

.scider/skills/matplotlib/references/styling_guide.md ADDED Viewed

	@@ -0,0 +1,589 @@

+# Matplotlib Styling Guide
+Comprehensive guide for styling and customizing matplotlib visualizations.
+## Colormaps
+### Colormap Categories
+**1. Perceptually Uniform Sequential**
+Best for ordered data that progresses from low to high values.
+- `viridis` (default, colorblind-friendly)
+- `plasma`
+- `inferno`
+- `magma`
+- `cividis` (optimized for colorblind viewers)
+**Usage:**
+```python
+im = ax.imshow(data, cmap='viridis')
+scatter = ax.scatter(x, y, c=values, cmap='plasma')
+```
+**2. Sequential**
+Traditional colormaps for ordered data.
+- `Blues`, `Greens`, `Reds`, `Oranges`, `Purples`
+- `YlOrBr`, `YlOrRd`, `OrRd`, `PuRd`
+- `BuPu`, `GnBu`, `PuBu`, `YlGnBu`
+**3. Diverging**
+Best for data with a meaningful center point (e.g., zero, mean).
+- `coolwarm` (blue to red)
+- `RdBu` (red-blue)
+- `RdYlBu` (red-yellow-blue)
+- `RdYlGn` (red-yellow-green)
+- `PiYG`, `PRGn`, `BrBG`, `PuOr`, `RdGy`
+**Usage:**
+```python
+# Center colormap at zero
+im = ax.imshow(data, cmap='coolwarm', vmin=-1, vmax=1)
+```
+**4. Qualitative**
+Best for categorical/nominal data without inherent ordering.
+- `tab10` (10 distinct colors)
+- `tab20` (20 distinct colors)
+- `Set1`, `Set2`, `Set3`
+- `Pastel1`, `Pastel2`
+- `Dark2`, `Accent`, `Paired`
+**Usage:**
+```python
+colors = plt.cm.tab10(np.linspace(0, 1, n_categories))
+for i, category in enumerate(categories):
+    ax.plot(x, y[i], color=colors[i], label=category)
+```
+**5. Cyclic**
+Best for cyclic data (e.g., phase, angle).
+- `twilight`
+- `twilight_shifted`
+- `hsv`
+### Colormap Best Practices
+1. **Avoid the `jet` colormap** - It is not perceptually uniform, so it can exaggerate or hide features in the data
+2. **Use perceptually uniform colormaps** - `viridis`, `plasma`, `cividis`
+3. **Consider colorblind users** - Use `viridis`, `cividis`, or test with colorblind simulators
+4. **Match colormap to data type**:
+   - Sequential: increasing/decreasing data
+   - Diverging: data with meaningful center
+   - Qualitative: categories
+5. **Reverse colormaps** - Add `_r` suffix: `viridis_r`, `coolwarm_r`
+### Creating Custom Colormaps
+```python
+from matplotlib.colors import LinearSegmentedColormap
+# From color list
+colors = ['blue', 'white', 'red']
+n_bins = 100
+cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_bins)
+# From RGB values
+colors = [(0, 0, 1), (1, 1, 1), (1, 0, 0)]  # RGB tuples
+cmap = LinearSegmentedColormap.from_list('custom', colors)
+# Use the custom colormap
+ax.imshow(data, cmap=cmap)
+```
+### Discrete Colormaps
+```python
+import matplotlib.colors as mcolors
+# Create discrete colormap from continuous
+cmap = plt.cm.viridis
+bounds = np.linspace(0, 10, 11)
+norm = mcolors.BoundaryNorm(bounds, cmap.N)
+im = ax.imshow(data, cmap=cmap, norm=norm)
+```
+## Style Sheets
+### Using Built-in Styles
+```python
+# List available styles
+print(plt.style.available)
+# Apply a style
+plt.style.use('seaborn-v0_8-darkgrid')
+# Apply multiple styles (later styles override earlier ones)
+plt.style.use(['seaborn-v0_8-whitegrid', 'seaborn-v0_8-poster'])
+# Temporarily use a style
+with plt.style.context('ggplot'):
+    fig, ax = plt.subplots()
+    ax.plot(x, y)
+```
+### Popular Built-in Styles
+- `default` - Matplotlib's default style
+- `classic` - Classic matplotlib look (pre-2.0)
+- `seaborn-v0_8-*` - Seaborn-inspired styles (named `seaborn-*` before Matplotlib 3.6)
+  - `seaborn-v0_8-darkgrid`, `seaborn-v0_8-whitegrid`
+  - `seaborn-v0_8-dark`, `seaborn-v0_8-white`
+  - `seaborn-v0_8-ticks`, `seaborn-v0_8-poster`, `seaborn-v0_8-talk`
+- `ggplot` - ggplot2-inspired style
+- `bmh` - Bayesian Methods for Hackers style
+- `fivethirtyeight` - FiveThirtyEight style
+- `grayscale` - Grayscale style
+### Creating Custom Style Sheets
+Create a file named `custom_style.mplstyle`:
+```
+# custom_style.mplstyle
+# Figure
+figure.figsize: 10, 6
+figure.dpi: 100
+figure.facecolor: white
+# Font
+font.family: sans-serif
+font.sans-serif: Arial, Helvetica
+font.size: 12
+# Axes
+axes.labelsize: 14
+axes.titlesize: 16
+axes.facecolor: white
+axes.edgecolor: black
+axes.linewidth: 1.5
+axes.grid: True
+axes.axisbelow: True
+# Grid
+grid.color: gray
+grid.linestyle: --
+grid.linewidth: 0.5
+grid.alpha: 0.3
+# Lines
+lines.linewidth: 2
+lines.markersize: 8
+# Ticks
+xtick.labelsize: 10
+ytick.labelsize: 10
+xtick.direction: in
+ytick.direction: in
+xtick.major.size: 6
+ytick.major.size: 6
+xtick.minor.size: 3
+ytick.minor.size: 3
+# Legend
+legend.fontsize: 12
+legend.frameon: True
+legend.framealpha: 0.8
+legend.fancybox: True
+# Savefig
+savefig.dpi: 300
+savefig.bbox: tight
+savefig.facecolor: white
+```
+Load and use:
+```python
+plt.style.use('path/to/custom_style.mplstyle')
+```
+## rcParams Configuration
+### Global Configuration
+```python
+import matplotlib.pyplot as plt
+# Configure globally
+plt.rcParams['figure.figsize'] = (10, 6)
+plt.rcParams['font.size'] = 12
+plt.rcParams['axes.labelsize'] = 14
+# Or update multiple at once
+plt.rcParams.update({
+    'figure.figsize': (10, 6),
+    'font.size': 12,
+    'axes.labelsize': 14,
+    'axes.titlesize': 16,
+    'lines.linewidth': 2
+})
+```
+### Temporary Configuration
+```python
+# Context manager for temporary changes
+with plt.rc_context({'font.size': 14, 'lines.linewidth': 2.5}):
+    fig, ax = plt.subplots()
+    ax.plot(x, y)
+```
+### Common rcParams
+**Figure settings:**
+```python
+plt.rcParams['figure.figsize'] = (10, 6)
+plt.rcParams['figure.dpi'] = 100
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['figure.edgecolor'] = 'white'
+plt.rcParams['figure.autolayout'] = False
+plt.rcParams['figure.constrained_layout.use'] = True
+```
+**Font settings:**
+```python
+plt.rcParams['font.family'] = 'sans-serif'
+plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
+plt.rcParams['font.size'] = 12
+plt.rcParams['font.weight'] = 'normal'
+```
+**Axes settings:**
+```python
+plt.rcParams['axes.facecolor'] = 'white'
+plt.rcParams['axes.edgecolor'] = 'black'
+plt.rcParams['axes.linewidth'] = 1.5
+plt.rcParams['axes.grid'] = True
+plt.rcParams['axes.labelsize'] = 14
+plt.rcParams['axes.titlesize'] = 16
+plt.rcParams['axes.labelweight'] = 'normal'
+plt.rcParams['axes.spines.top'] = True
+plt.rcParams['axes.spines.right'] = True
+```
+**Line settings:**
+```python
+plt.rcParams['lines.linewidth'] = 2
+plt.rcParams['lines.linestyle'] = '-'
+plt.rcParams['lines.marker'] = 'None'
+plt.rcParams['lines.markersize'] = 6
+```
+**Save settings:**
+```python
+plt.rcParams['savefig.dpi'] = 300
+plt.rcParams['savefig.format'] = 'png'
+plt.rcParams['savefig.bbox'] = 'tight'
+plt.rcParams['savefig.pad_inches'] = 0.1
+plt.rcParams['savefig.transparent'] = False
+```
+## Color Palettes
+### Named Color Sets
+```python
+# Tableau colors
+tableau_colors = plt.cm.tab10.colors
+# CSS4 colors (subset)
+css_colors = ['steelblue', 'coral', 'teal', 'goldenrod', 'crimson']
+# Manual definition
+custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
+```
+### Color Cycles
+```python
+# Set default color cycle
+from cycler import cycler
+colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
+plt.rcParams['axes.prop_cycle'] = cycler(color=colors)
+# Or combine color and line style
+plt.rcParams['axes.prop_cycle'] = cycler(color=colors) + cycler(linestyle=['-', '--', ':', '-.'])
+```
+### Palette Generation
+```python
+# Evenly spaced colors from colormap
+n_colors = 5
+colors = plt.cm.viridis(np.linspace(0, 1, n_colors))
+# Use in plot
+for i, (x, y) in enumerate(data):
+    ax.plot(x, y, color=colors[i])
+```
+## Typography
+### Font Configuration
+```python
+# Set font family
+plt.rcParams['font.family'] = 'serif'
+plt.rcParams['font.serif'] = ['Times New Roman', 'DejaVu Serif']
+# Or sans-serif
+plt.rcParams['font.family'] = 'sans-serif'
+plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
+# Or monospace
+plt.rcParams['font.family'] = 'monospace'
+plt.rcParams['font.monospace'] = ['Courier New', 'DejaVu Sans Mono']
+```
+### Font Properties in Text
+```python
+from matplotlib import font_manager
+# Specify font properties
+ax.text(x, y, 'Text',
+        fontsize=14,
+        fontweight='bold',  # 'normal', 'bold', 'heavy', 'light'
+        fontstyle='italic',  # 'normal', 'italic', 'oblique'
+        fontfamily='serif')
+# Use specific font file
+prop = font_manager.FontProperties(fname='path/to/font.ttf')
+ax.text(x, y, 'Text', fontproperties=prop)
+```
+### Mathematical Text
+```python
+# LaTeX-style math
+ax.set_title(r'$\alpha > \beta$')
+ax.set_xlabel(r'$\mu \pm \sigma$')
+ax.text(x, y, r'$\int_0^\infty e^{-x} dx = 1$')
+# Subscripts and superscripts
+ax.set_ylabel(r'$y = x^2 + 2x + 1$')
+ax.text(x, y, r'$x_1, x_2, \ldots, x_n$')
+# Greek letters
+ax.text(x, y, r'$\alpha, \beta, \gamma, \delta, \epsilon$')
+```
+### Using Full LaTeX
+```python
+# Enable full LaTeX rendering (requires LaTeX installation)
+plt.rcParams['text.usetex'] = True
+plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
+ax.set_title(r'\textbf{Bold Title}')
+ax.set_xlabel(r'Time $t$ (s)')
+```
+## Spines and Grids
+### Spine Customization
+```python
+# Hide specific spines
+ax.spines['top'].set_visible(False)
+ax.spines['right'].set_visible(False)
+# Move spine position
+ax.spines['left'].set_position(('outward', 10))
+ax.spines['bottom'].set_position(('data', 0))
+# Change spine color and width
+ax.spines['left'].set_color('red')
+ax.spines['bottom'].set_linewidth(2)
+```
+### Grid Customization
+```python
+# Basic grid
+ax.grid(True)
+# Customized grid
+ax.grid(True, which='major', linestyle='--', linewidth=0.8, alpha=0.3)
+ax.grid(True, which='minor', linestyle=':', linewidth=0.5, alpha=0.2)
+# Grid for specific axis
+ax.grid(True, axis='x')  # Only vertical lines
+ax.grid(True, axis='y')  # Only horizontal lines
+# Grid behind or in front of data
+ax.set_axisbelow(True)  # Grid behind data
+```
+## Legend Customization
+### Legend Positioning
+```python
+# Location strings
+ax.legend(loc='best')  # Automatic best position
+ax.legend(loc='upper right')
+ax.legend(loc='upper left')
+ax.legend(loc='lower right')
+ax.legend(loc='lower left')
+ax.legend(loc='center')
+ax.legend(loc='upper center')
+ax.legend(loc='lower center')
+ax.legend(loc='center left')
+ax.legend(loc='center right')
+# Precise positioning (bbox_to_anchor)
+ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Outside plot area
+ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)  # Below plot
+```
+### Legend Styling
+```python
+ax.legend(
+    fontsize=12,
+    frameon=True,           # Show frame
+    framealpha=0.9,         # Frame opacity (0 = transparent, 1 = opaque)
+    fancybox=True,          # Rounded corners
+    shadow=True,            # Shadow effect
+    ncol=2,                 # Number of columns
+    title='Legend Title',   # Legend title
+    title_fontsize=14,      # Title font size
+    edgecolor='black',      # Frame edge color
+    facecolor='white'       # Frame background color
+)
+```
+### Custom Legend Entries
+```python
+from matplotlib.lines import Line2D
+# Create custom legend handles
+custom_lines = [Line2D([0], [0], color='red', lw=2),
+                Line2D([0], [0], color='blue', lw=2, linestyle='--'),
+                Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10)]
+ax.legend(custom_lines, ['Label 1', 'Label 2', 'Label 3'])
+```
+## Layout and Spacing
+### Constrained Layout
+```python
+# Preferred method (automatic adjustment)
+fig, axes = plt.subplots(2, 2, constrained_layout=True)
+```
+### Tight Layout
+```python
+# Alternative method
+fig, axes = plt.subplots(2, 2)
+plt.tight_layout(pad=1.5, h_pad=2.0, w_pad=2.0)
+```
+### Manual Adjustment
+```python
+# Fine-grained control
+plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1,
+                    hspace=0.3, wspace=0.4)
+```
+## Professional Publication Style
+Example configuration for publication-quality figures:
+```python
+# Publication style configuration
+plt.rcParams.update({
+    # Figure
+    'figure.figsize': (8, 6),
+    'figure.dpi': 100,
+    'savefig.dpi': 300,
+    'savefig.bbox': 'tight',
+    'savefig.pad_inches': 0.1,
+    # Font
+    'font.family': 'sans-serif',
+    'font.sans-serif': ['Arial', 'Helvetica'],
+    'font.size': 11,
+    # Axes
+    'axes.labelsize': 12,
+    'axes.titlesize': 14,
+    'axes.linewidth': 1.5,
+    'axes.grid': False,
+    'axes.spines.top': False,
+    'axes.spines.right': False,
+    # Lines
+    'lines.linewidth': 2,
+    'lines.markersize': 8,
+    # Ticks
+    'xtick.labelsize': 10,
+    'ytick.labelsize': 10,
+    'xtick.major.size': 6,
+    'ytick.major.size': 6,
+    'xtick.major.width': 1.5,
+    'ytick.major.width': 1.5,
+    'xtick.direction': 'in',
+    'ytick.direction': 'in',
+    # Legend
+    'legend.fontsize': 10,
+    'legend.frameon': True,
+    'legend.framealpha': 1.0,
+    'legend.edgecolor': 'black'
+})
+```
+## Dark Theme
+```python
+# Dark background style
+plt.style.use('dark_background')
+# Or manual configuration
+plt.rcParams.update({
+    'figure.facecolor': '#1e1e1e',
+    'axes.facecolor': '#1e1e1e',
+    'axes.edgecolor': 'white',
+    'axes.labelcolor': 'white',
+    'text.color': 'white',
+    'xtick.color': 'white',
+    'ytick.color': 'white',
+    'grid.color': 'gray',
+    'legend.facecolor': '#1e1e1e',
+    'legend.edgecolor': 'white'
+})
+```
+## Color Accessibility
+### Colorblind-Friendly Palettes
+```python
+# Use colorblind-friendly colormaps
+colorblind_friendly = ['viridis', 'plasma', 'cividis']
+# Colorblind-friendly discrete colors
+cb_colors = ['#0173B2', '#DE8F05', '#029E73', '#CC78BC',
+             '#CA9161', '#949494', '#ECE133', '#56B4E9']
+# Test with simulation tools or use these validated palettes
+```
+### High Contrast
+```python
+# Ensure sufficient contrast
+plt.rcParams['axes.edgecolor'] = 'black'
+plt.rcParams['axes.linewidth'] = 2
+plt.rcParams['xtick.major.width'] = 2
+plt.rcParams['ytick.major.width'] = 2
+```

.scider/skills/matplotlib/scripts/plot_template.py ADDED Viewed

	@@ -0,0 +1,446 @@

+#!/usr/bin/env python3
+"""
+Matplotlib Plot Template
+Comprehensive template demonstrating various plot types and best practices.
+Use this as a starting point for creating publication-quality visualizations.
+Usage:
+    python plot_template.py [--plot-type TYPE] [--style STYLE] [--output FILE]
+Plot types:
+    line, scatter, bar, histogram, heatmap, contour, box, violin, 3d, all
+"""
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.gridspec import GridSpec
+def set_publication_style():
+    """Configure matplotlib for publication-quality figures."""
+    plt.rcParams.update(
+        {
+            "figure.figsize": (10, 6),
+            "figure.dpi": 100,
+            "savefig.dpi": 300,
+            "savefig.bbox": "tight",
+            "font.size": 11,
+            "axes.labelsize": 12,
+            "axes.titlesize": 14,
+            "xtick.labelsize": 10,
+            "ytick.labelsize": 10,
+            "legend.fontsize": 10,
+            "lines.linewidth": 2,
+            "axes.linewidth": 1.5,
+        }
+    )
+def generate_sample_data():
+    """Generate sample data for demonstrations."""
+    np.random.seed(42)
+    x = np.linspace(0, 10, 100)
+    y1 = np.sin(x)
+    y2 = np.cos(x)
+    scatter_x = np.random.randn(200)
+    scatter_y = np.random.randn(200)
+    categories = ["A", "B", "C", "D", "E"]
+    bar_values = np.random.randint(10, 100, len(categories))
+    hist_data = np.random.normal(0, 1, 1000)
+    matrix = np.random.rand(10, 10)
+    X, Y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
+    Z = np.sin(np.sqrt(X**2 + Y**2))
+    return {
+        "x": x,
+        "y1": y1,
+        "y2": y2,
+        "scatter_x": scatter_x,
+        "scatter_y": scatter_y,
+        "categories": categories,
+        "bar_values": bar_values,
+        "hist_data": hist_data,
+        "matrix": matrix,
+        "X": X,
+        "Y": Y,
+        "Z": Z,
+    }
+def create_line_plot(data, ax=None):
+    """Create line plot with best practices."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    ax.plot(
+        data["x"], data["y1"], label="sin(x)", linewidth=2, marker="o", markevery=10, markersize=6
+    )
+    ax.plot(data["x"], data["y2"], label="cos(x)", linewidth=2, linestyle="--")
+    ax.set_xlabel("x")
+    ax.set_ylabel("y")
+    ax.set_title("Line Plot Example")
+    ax.legend(loc="best", framealpha=0.9)
+    ax.grid(True, alpha=0.3, linestyle="--")
+    # Remove top and right spines for cleaner look
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    if ax is None:
+        return fig
+    return ax
+def create_scatter_plot(data, ax=None):
+    """Create scatter plot with color and size variations."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    # Color based on distance from origin
+    colors = np.sqrt(data["scatter_x"] ** 2 + data["scatter_y"] ** 2)
+    sizes = 50 * (1 + np.abs(data["scatter_x"]))
+    scatter = ax.scatter(
+        data["scatter_x"],
+        data["scatter_y"],
+        c=colors,
+        s=sizes,
+        alpha=0.6,
+        cmap="viridis",
+        edgecolors="black",
+        linewidth=0.5,
+    )
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_title("Scatter Plot Example")
+    ax.grid(True, alpha=0.3, linestyle="--")
+    # Add colorbar
+    cbar = plt.colorbar(scatter, ax=ax)
+    cbar.set_label("Distance from origin")
+    if ax is None:
+        return fig
+    return ax
+def create_bar_chart(data, ax=None):
+    """Create bar chart with error bars and styling."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    x_pos = np.arange(len(data["categories"]))
+    errors = np.random.randint(5, 15, len(data["categories"]))
+    bars = ax.bar(
+        x_pos,
+        data["bar_values"],
+        yerr=errors,
+        color="steelblue",
+        edgecolor="black",
+        linewidth=1.5,
+        capsize=5,
+        alpha=0.8,
+    )
+    # Color bars by value
+    colors = plt.cm.viridis(data["bar_values"] / data["bar_values"].max())
+    for bar, color in zip(bars, colors):
+        bar.set_facecolor(color)
+    ax.set_xlabel("Category")
+    ax.set_ylabel("Values")
+    ax.set_title("Bar Chart Example")
+    ax.set_xticks(x_pos)
+    ax.set_xticklabels(data["categories"])
+    ax.grid(True, axis="y", alpha=0.3, linestyle="--")
+    # Remove top and right spines
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    if ax is None:
+        return fig
+    return ax
+def create_histogram(data, ax=None):
+    """Create histogram with density overlay."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    n, bins, patches = ax.hist(
+        data["hist_data"], bins=30, density=True, alpha=0.7, edgecolor="black", color="steelblue"
+    )
+    # Overlay theoretical normal distribution
+    from scipy.stats import norm
+    mu, std = norm.fit(data["hist_data"])
+    x_theory = np.linspace(data["hist_data"].min(), data["hist_data"].max(), 100)
+    ax.plot(
+        x_theory,
+        norm.pdf(x_theory, mu, std),
+        "r-",
+        linewidth=2,
+        label=f"Normal fit (μ={mu:.2f}, σ={std:.2f})",
+    )
+    ax.set_xlabel("Value")
+    ax.set_ylabel("Density")
+    ax.set_title("Histogram with Normal Fit")
+    ax.legend()
+    ax.grid(True, axis="y", alpha=0.3, linestyle="--")
+    if ax is None:
+        return fig
+    return ax
+def create_heatmap(data, ax=None):
+    """Create heatmap with colorbar and annotations."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)
+    im = ax.imshow(data["matrix"], cmap="coolwarm", aspect="auto", vmin=0, vmax=1)
+    # Add colorbar
+    cbar = plt.colorbar(im, ax=ax)
+    cbar.set_label("Value")
+    # Optional: Add text annotations
+    # for i in range(data['matrix'].shape[0]):
+    #     for j in range(data['matrix'].shape[1]):
+    #         text = ax.text(j, i, f'{data["matrix"][i, j]:.2f}',
+    #                       ha='center', va='center', color='black', fontsize=8)
+    ax.set_xlabel("X Index")
+    ax.set_ylabel("Y Index")
+    ax.set_title("Heatmap Example")
+    if ax is None:
+        return fig
+    return ax
+def create_contour_plot(data, ax=None):
+    """Create contour plot with filled contours and labels."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)
+    # Filled contours
+    contourf = ax.contourf(data["X"], data["Y"], data["Z"], levels=20, cmap="viridis", alpha=0.8)
+    # Contour lines
+    contour = ax.contour(
+        data["X"], data["Y"], data["Z"], levels=10, colors="black", linewidths=0.5, alpha=0.4
+    )
+    # Add labels to contour lines
+    ax.clabel(contour, inline=True, fontsize=8)
+    # Add colorbar
+    cbar = plt.colorbar(contourf, ax=ax)
+    cbar.set_label("Z value")
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_title("Contour Plot Example")
+    ax.set_aspect("equal")
+    if ax is None:
+        return fig
+    return ax
+def create_box_plot(data, ax=None):
+    """Create box plot comparing distributions."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    # Generate multiple distributions
+    box_data = [np.random.normal(0, std, 100) for std in range(1, 5)]
+    bp = ax.boxplot(
+        box_data,
+        labels=["Group 1", "Group 2", "Group 3", "Group 4"],
+        patch_artist=True,
+        showmeans=True,
+        boxprops=dict(facecolor="lightblue", edgecolor="black"),
+        medianprops=dict(color="red", linewidth=2),
+        meanprops=dict(marker="D", markerfacecolor="green", markersize=8),
+    )
+    ax.set_xlabel("Groups")
+    ax.set_ylabel("Values")
+    ax.set_title("Box Plot Example")
+    ax.grid(True, axis="y", alpha=0.3, linestyle="--")
+    if ax is None:
+        return fig
+    return ax
+def create_violin_plot(data, ax=None):
+    """Create violin plot showing distribution shapes."""
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
+    # Generate multiple distributions
+    violin_data = [np.random.normal(0, std, 100) for std in range(1, 5)]
+    parts = ax.violinplot(violin_data, positions=range(1, 5), showmeans=True, showmedians=True)
+    # Customize colors
+    for pc in parts["bodies"]:
+        pc.set_facecolor("lightblue")
+        pc.set_alpha(0.7)
+        pc.set_edgecolor("black")
+    ax.set_xlabel("Groups")
+    ax.set_ylabel("Values")
+    ax.set_title("Violin Plot Example")
+    ax.set_xticks(range(1, 5))
+    ax.set_xticklabels(["Group 1", "Group 2", "Group 3", "Group 4"])
+    ax.grid(True, axis="y", alpha=0.3, linestyle="--")
+    if ax is None:
+        return fig
+    return ax
+def create_3d_plot():
+    """Create 3D surface plot."""
+    from mpl_toolkits.mplot3d import Axes3D
+    fig = plt.figure(figsize=(12, 9))
+    ax = fig.add_subplot(111, projection="3d")
+    # Generate data
+    X = np.linspace(-5, 5, 50)
+    Y = np.linspace(-5, 5, 50)
+    X, Y = np.meshgrid(X, Y)
+    Z = np.sin(np.sqrt(X**2 + Y**2))
+    # Create surface plot
+    surf = ax.plot_surface(X, Y, Z, cmap="viridis", edgecolor="none", alpha=0.9)
+    # Add colorbar
+    fig.colorbar(surf, ax=ax, shrink=0.5)
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_zlabel("Z")
+    ax.set_title("3D Surface Plot Example")
+    # Set viewing angle
+    ax.view_init(elev=30, azim=45)
+    plt.tight_layout()
+    return fig
+def create_comprehensive_figure():
+    """Create a comprehensive figure with multiple subplots."""
+    data = generate_sample_data()
+    fig = plt.figure(figsize=(16, 12), constrained_layout=True)
+    gs = GridSpec(3, 3, figure=fig)
+    # Create subplots
+    ax1 = fig.add_subplot(gs[0, :2])  # Line plot - top left, spans 2 columns
+    create_line_plot(data, ax1)
+    ax2 = fig.add_subplot(gs[0, 2])  # Bar chart - top right
+    create_bar_chart(data, ax2)
+    ax3 = fig.add_subplot(gs[1, 0])  # Scatter plot - middle left
+    create_scatter_plot(data, ax3)
+    ax4 = fig.add_subplot(gs[1, 1])  # Histogram - middle center
+    create_histogram(data, ax4)
+    ax5 = fig.add_subplot(gs[1, 2])  # Box plot - middle right
+    create_box_plot(data, ax5)
+    ax6 = fig.add_subplot(gs[2, :2])  # Contour plot - bottom left, spans 2 columns
+    create_contour_plot(data, ax6)
+    ax7 = fig.add_subplot(gs[2, 2])  # Heatmap - bottom right
+    create_heatmap(data, ax7)
+    fig.suptitle("Comprehensive Matplotlib Template", fontsize=18, fontweight="bold")
+    return fig
+def main():
+    """Main function to run the template."""
+    parser = argparse.ArgumentParser(description="Matplotlib plot template")
+    parser.add_argument(
+        "--plot-type",
+        type=str,
+        default="all",
+        choices=[
+            "line",
+            "scatter",
+            "bar",
+            "histogram",
+            "heatmap",
+            "contour",
+            "box",
+            "violin",
+            "3d",
+            "all",
+        ],
+        help="Type of plot to create",
+    )
+    parser.add_argument("--style", type=str, default="default", help="Matplotlib style to use")
+    parser.add_argument("--output", type=str, default="plot.png", help="Output filename")
+    args = parser.parse_args()
+    # Set style
+    if args.style != "default":
+        plt.style.use(args.style)
+    else:
+        set_publication_style()
+    # Generate data
+    data = generate_sample_data()
+    # Create plot based on type
+    plot_functions = {
+        "line": create_line_plot,
+        "scatter": create_scatter_plot,
+        "bar": create_bar_chart,
+        "histogram": create_histogram,
+        "heatmap": create_heatmap,
+        "contour": create_contour_plot,
+        "box": create_box_plot,
+        "violin": create_violin_plot,
+    }
+    if args.plot_type == "3d":
+        fig = create_3d_plot()
+    elif args.plot_type == "all":
+        fig = create_comprehensive_figure()
+    else:
+        fig = plot_functions[args.plot_type](data)
+    # Save figure
+    plt.savefig(args.output, dpi=300, bbox_inches="tight")
+    print(f"Plot saved to {args.output}")
+    # Display
+    plt.show()
+if __name__ == "__main__":
+    main()

.scider/skills/matplotlib/scripts/style_configurator.py ADDED Viewed

	@@ -0,0 +1,413 @@

+#!/usr/bin/env python3
+"""
+Matplotlib Style Configurator
+Interactive utility to configure matplotlib style preferences and generate
+custom style sheets. Creates a preview of the style and optionally saves
+it as a .mplstyle file.
+Usage:
+    python style_configurator.py [--preset PRESET] [--output FILE] [--preview]
+Presets:
+    publication, presentation, web, dark, minimal
+"""
+import argparse
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.gridspec import GridSpec
+# Predefined style presets
+# Maps a preset name (see "Presets" in the module docstring) to a dict of
+# matplotlib rcParams keys and values.
+STYLE_PRESETS = {
+    # 300 dpi output, sans-serif fonts, inward ticks, no top/right spines.
+    "publication": {
+        "figure.figsize": (8, 6),
+        "figure.dpi": 100,
+        "savefig.dpi": 300,
+        "savefig.bbox": "tight",
+        "font.family": "sans-serif",
+        "font.sans-serif": ["Arial", "Helvetica"],
+        "font.size": 11,
+        "axes.labelsize": 12,
+        "axes.titlesize": 14,
+        "axes.linewidth": 1.5,
+        "axes.grid": False,
+        "axes.spines.top": False,
+        "axes.spines.right": False,
+        "lines.linewidth": 2,
+        "lines.markersize": 8,
+        "xtick.labelsize": 10,
+        "ytick.labelsize": 10,
+        "xtick.direction": "in",
+        "ytick.direction": "in",
+        "xtick.major.size": 6,
+        "ytick.major.size": 6,
+        "xtick.major.width": 1.5,
+        "ytick.major.width": 1.5,
+        "legend.fontsize": 10,
+        "legend.frameon": True,
+        "legend.framealpha": 1.0,
+        "legend.edgecolor": "black",
+    },
+    # Larger figure, fonts, lines and markers than "publication"; grid on.
+    "presentation": {
+        "figure.figsize": (12, 8),
+        "figure.dpi": 100,
+        "savefig.dpi": 150,
+        "font.size": 16,
+        "axes.labelsize": 20,
+        "axes.titlesize": 24,
+        "axes.linewidth": 2,
+        "lines.linewidth": 3,
+        "lines.markersize": 12,
+        "xtick.labelsize": 16,
+        "ytick.labelsize": 16,
+        "legend.fontsize": 16,
+        "axes.grid": True,
+        "grid.alpha": 0.3,
+    },
+    # 96 dpi figures saved at 150 dpi, with a faint dashed grid.
+    "web": {
+        "figure.figsize": (10, 6),
+        "figure.dpi": 96,
+        "savefig.dpi": 150,
+        "font.size": 11,
+        "axes.labelsize": 12,
+        "axes.titlesize": 14,
+        "lines.linewidth": 2,
+        "axes.grid": True,
+        "grid.alpha": 0.2,
+        "grid.linestyle": "--",
+    },
+    # #1e1e1e backgrounds (including saved files) with white text, ticks and edges.
+    "dark": {
+        "figure.facecolor": "#1e1e1e",
+        "figure.edgecolor": "#1e1e1e",
+        "axes.facecolor": "#1e1e1e",
+        "axes.edgecolor": "white",
+        "axes.labelcolor": "white",
+        "text.color": "white",
+        "xtick.color": "white",
+        "ytick.color": "white",
+        "grid.color": "gray",
+        "grid.alpha": 0.3,
+        "axes.grid": True,
+        "legend.facecolor": "#1e1e1e",
+        "legend.edgecolor": "white",
+        "savefig.facecolor": "#1e1e1e",
+    },
+    # All four spines and the grid are off; bottom/left ticks stay on.
+    "minimal": {
+        "figure.figsize": (10, 6),
+        "axes.spines.top": False,
+        "axes.spines.right": False,
+        "axes.spines.left": False,
+        "axes.spines.bottom": False,
+        "axes.grid": False,
+        "xtick.bottom": True,
+        "ytick.left": True,
+        "axes.axisbelow": True,
+        "lines.linewidth": 2.5,
+        "font.size": 12,
+    },
+}
+def generate_preview_data():
+    """Generate sample data for style preview."""
+    np.random.seed(42)
+    x = np.linspace(0, 10, 100)
+    y1 = np.sin(x) + 0.1 * np.random.randn(100)
+    y2 = np.cos(x) + 0.1 * np.random.randn(100)
+    scatter_x = np.random.randn(100)
+    scatter_y = 2 * scatter_x + np.random.randn(100)
+    categories = ["A", "B", "C", "D", "E"]
+    bar_values = [25, 40, 30, 55, 45]
+    return {
+        "x": x,
+        "y1": y1,
+        "y2": y2,
+        "scatter_x": scatter_x,
+        "scatter_y": scatter_y,
+        "categories": categories,
+        "bar_values": bar_values,
+    }
+def create_style_preview(style_dict=None):
+    """Create a preview figure demonstrating the style."""
+    if style_dict:
+        plt.rcParams.update(style_dict)
+    data = generate_preview_data()
+    fig = plt.figure(figsize=(14, 10))
+    gs = GridSpec(2, 2, figure=fig, hspace=0.3, wspace=0.3)
+    # Line plot
+    ax1 = fig.add_subplot(gs[0, 0])
+    ax1.plot(data["x"], data["y1"], label="sin(x)", marker="o", markevery=10)
+    ax1.plot(data["x"], data["y2"], label="cos(x)", linestyle="--")
+    ax1.set_xlabel("X axis")
+    ax1.set_ylabel("Y axis")
+    ax1.set_title("Line Plot")
+    ax1.legend()
+    ax1.grid(True, alpha=0.3)
+    # Scatter plot
+    ax2 = fig.add_subplot(gs[0, 1])
+    colors = np.sqrt(data["scatter_x"] ** 2 + data["scatter_y"] ** 2)
+    scatter = ax2.scatter(
+        data["scatter_x"], data["scatter_y"], c=colors, cmap="viridis", alpha=0.6, s=50
+    )
+    ax2.set_xlabel("X axis")
+    ax2.set_ylabel("Y axis")
+    ax2.set_title("Scatter Plot")
+    cbar = plt.colorbar(scatter, ax=ax2)
+    cbar.set_label("Distance")
+    ax2.grid(True, alpha=0.3)
+    # Bar chart
+    ax3 = fig.add_subplot(gs[1, 0])
+    bars = ax3.bar(data["categories"], data["bar_values"], edgecolor="black", linewidth=1)
+    # Color bars with gradient
+    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(bars)))
+    for bar, color in zip(bars, colors):
+        bar.set_facecolor(color)
+    ax3.set_xlabel("Categories")
+    ax3.set_ylabel("Values")
+    ax3.set_title("Bar Chart")
+    ax3.grid(True, axis="y", alpha=0.3)
+    # Multiple line plot with fills
+    ax4 = fig.add_subplot(gs[1, 1])
+    ax4.plot(data["x"], data["y1"], label="Signal 1", linewidth=2)
+    ax4.fill_between(data["x"], data["y1"] - 0.2, data["y1"] + 0.2, alpha=0.3, label="±1 std")
+    ax4.plot(data["x"], data["y2"], label="Signal 2", linewidth=2)
+    ax4.fill_between(data["x"], data["y2"] - 0.2, data["y2"] + 0.2, alpha=0.3)
+    ax4.set_xlabel("X axis")
+    ax4.set_ylabel("Y axis")
+    ax4.set_title("Time Series with Uncertainty")
+    ax4.legend()
+    ax4.grid(True, alpha=0.3)
+    fig.suptitle("Style Preview", fontsize=16, fontweight="bold")
+    return fig
+def save_style_file(style_dict, filename):
+    """Save style dictionary as .mplstyle file."""
+    with open(filename, "w") as f:
+        f.write("# Custom matplotlib style\n")
+        f.write("# Generated by style_configurator.py\n\n")
+        # Group settings by category
+        categories = {
+            "Figure": ["figure."],
+            "Font": ["font."],
+            "Axes": ["axes."],
+            "Lines": ["lines."],
+            "Markers": ["markers."],
+            "Ticks": ["tick.", "xtick.", "ytick."],
+            "Grid": ["grid."],
+            "Legend": ["legend."],
+            "Savefig": ["savefig."],
+            "Text": ["text."],
+        }
+        for category, prefixes in categories.items():
+            category_items = {
+                k: v for k, v in style_dict.items() if any(k.startswith(p) for p in prefixes)
+            }
+            if category_items:
+                f.write(f"# {category}\n")
+                for key, value in sorted(category_items.items()):
+                    # Format value appropriately
+                    if isinstance(value, (list, tuple)):
+                        value_str = ", ".join(str(v) for v in value)
+                    elif isinstance(value, bool):
+                        value_str = str(value)
+                    else:
+                        value_str = str(value)
+                    f.write(f"{key}: {value_str}\n")
+                f.write("\n")
+    print(f"Style saved to {filename}")
+def print_style_info(style_dict):
+    """Print information about the style."""
+    print("\n" + "=" * 60)
+    print("STYLE CONFIGURATION")
+    print("=" * 60)
+    categories = {
+        "Figure Settings": ["figure."],
+        "Font Settings": ["font."],
+        "Axes Settings": ["axes."],
+        "Line Settings": ["lines."],
+        "Grid Settings": ["grid."],
+        "Legend Settings": ["legend."],
+    }
+    for category, prefixes in categories.items():
+        category_items = {
+            k: v for k, v in style_dict.items() if any(k.startswith(p) for p in prefixes)
+        }
+        if category_items:
+            print(f"\n{category}:")
+            for key, value in sorted(category_items.items()):
+                print(f"  {key}: {value}")
+    print("\n" + "=" * 60 + "\n")
+def list_available_presets():
+    """Print available style presets."""
+    print("\nAvailable style presets:")
+    print("-" * 40)
+    descriptions = {
+        "publication": "Optimized for academic publications",
+        "presentation": "Large fonts for presentations",
+        "web": "Optimized for web display",
+        "dark": "Dark background theme",
+        "minimal": "Minimal, clean style",
+    }
+    for preset, desc in descriptions.items():
+        print(f"  {preset:15s} - {desc}")
+    print("-" * 40 + "\n")
+def interactive_mode():
+    """Run interactive mode to customize style settings."""
+    print("\n" + "=" * 60)
+    print("MATPLOTLIB STYLE CONFIGURATOR - Interactive Mode")
+    print("=" * 60)
+    list_available_presets()
+    preset = input("Choose a preset to start from (or 'custom' for default): ").strip().lower()
+    if preset in STYLE_PRESETS:
+        style_dict = STYLE_PRESETS[preset].copy()
+        print(f"\nStarting from '{preset}' preset")
+    else:
+        style_dict = {}
+        print("\nStarting from default matplotlib style")
+    print("\nCommon settings you might want to customize:")
+    print("  1. Figure size")
+    print("  2. Font sizes")
+    print("  3. Line widths")
+    print("  4. Grid settings")
+    print("  5. Color scheme")
+    print("  6. Done, show preview")
+    while True:
+        choice = input("\nSelect option (1-6): ").strip()
+        if choice == "1":
+            width = input("  Figure width (inches, default 10): ").strip() or "10"
+            height = input("  Figure height (inches, default 6): ").strip() or "6"
+            style_dict["figure.figsize"] = (float(width), float(height))
+        elif choice == "2":
+            base = input("  Base font size (default 12): ").strip() or "12"
+            style_dict["font.size"] = float(base)
+            style_dict["axes.labelsize"] = float(base) + 2
+            style_dict["axes.titlesize"] = float(base) + 4
+        elif choice == "3":
+            lw = input("  Line width (default 2): ").strip() or "2"
+            style_dict["lines.linewidth"] = float(lw)
+        elif choice == "4":
+            grid = input("  Enable grid? (y/n): ").strip().lower()
+            style_dict["axes.grid"] = grid == "y"
+            if style_dict["axes.grid"]:
+                alpha = input("  Grid transparency (0-1, default 0.3): ").strip() or "0.3"
+                style_dict["grid.alpha"] = float(alpha)
+        elif choice == "5":
+            print("  Theme options: 1=Light, 2=Dark")
+            theme = input("  Select theme (1-2): ").strip()
+            if theme == "2":
+                style_dict.update(STYLE_PRESETS["dark"])
+        elif choice == "6":
+            break
+    return style_dict
+def main():
+    """Main function."""
+    parser = argparse.ArgumentParser(
+        description="Matplotlib style configurator",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Show available presets
+  python style_configurator.py --list
+  # Preview a preset
+  python style_configurator.py --preset publication --preview
+  # Save a preset as .mplstyle file
+  python style_configurator.py --preset publication --output my_style.mplstyle
+  # Interactive mode
+  python style_configurator.py --interactive
+        """,
+    )
+    parser.add_argument(
+        "--preset",
+        type=str,
+        choices=list(STYLE_PRESETS.keys()),
+        help="Use a predefined style preset",
+    )
+    parser.add_argument("--output", type=str, help="Save style to .mplstyle file")
+    parser.add_argument("--preview", action="store_true", help="Show style preview")
+    parser.add_argument("--list", action="store_true", help="List available presets")
+    parser.add_argument("--interactive", action="store_true", help="Run in interactive mode")
+    args = parser.parse_args()
+    if args.list:
+        list_available_presets()
+        # Also show currently available matplotlib styles
+        print("\nBuilt-in matplotlib styles:")
+        print("-" * 40)
+        for style in sorted(plt.style.available):
+            print(f"  {style}")
+        return
+    if args.interactive:
+        style_dict = interactive_mode()
+    elif args.preset:
+        style_dict = STYLE_PRESETS[args.preset].copy()
+        print(f"Using '{args.preset}' preset")
+    else:
+        print("No preset or interactive mode specified. Showing default preview.")
+        style_dict = {}
+    if style_dict:
+        print_style_info(style_dict)
+    if args.output:
+        save_style_file(style_dict, args.output)
+    if args.preview or args.interactive:
+        print("Creating style preview...")
+        fig = create_style_preview(style_dict if style_dict else None)
+        if args.output:
+            preview_filename = args.output.replace(".mplstyle", "_preview.png")
+            plt.savefig(preview_filename, dpi=150, bbox_inches="tight")
+            print(f"Preview saved to {preview_filename}")
+        plt.show()
+if __name__ == "__main__":
+    main()